Cluster Analysis on a Wine Dataset

Wine-Dataset-Analysis
In [8]:
import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
In [2]:
col_names = [
"Class",
"Alcohol",
"Malic acid",
"Ash",
"Alcalinity of ash",
"Magnesium",
"Total phenols",
"Flavanoids",
"Nonflavanoid phenols",
"Proanthocyanins",
"Color intensity",
"Hue",
"OD280/OD315 of diluted wines",
"Proline"]


df = pd.read_csv("wine.data", names=col_names)
In [3]:
df.head()
Out[3]:
Class Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735
In [4]:
df.shape
Out[4]:
(178, 14)
In [5]:
sns.heatmap(df.iloc[:, 1:14].corr())
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x24ea9e3a320>
In [6]:
df.iloc[:, 1:14] = stats.zscore(df.iloc[:, 1:14])
In [7]:
df.head()
Out[7]:
Class Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 1 1.518613 -0.562250 0.232053 -1.169593 1.913905 0.808997 1.034819 -0.659563 1.224884 0.251717 0.362177 1.847920 1.013009
1 1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 0.568648 0.733629 -0.820719 -0.544721 -0.293321 0.406051 1.113449 0.965242
2 1 0.196879 0.021231 1.109334 -0.268738 0.088358 0.808997 1.215533 -0.498407 2.135968 0.269020 0.318304 0.788587 1.395148
3 1 1.691550 -0.346811 0.487926 -0.809251 0.930918 2.491446 1.466525 -0.981875 1.032155 1.186068 -0.427544 1.184071 2.334574
4 1 0.295700 0.227694 1.840403 0.451946 1.281985 0.808997 0.663351 0.226796 0.401404 -0.319276 0.362177 0.449601 -0.037874
In [9]:
model = KMeans()
visualizer = KElbowVisualizer(model, k=(2, 12 + 1))

X = df.iloc[:, 1:14].values

visualizer.fit(X)
visualizer.show()
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x24eac197c50>
In [10]:
model = KMeans(n_clusters=4)
model.fit(X)
Out[10]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [11]:
pca = PCA(n_components=2)
pca.fit(X)
Out[11]:
PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
In [12]:
print(pca.explained_variance_ratio_)
[0.36198848 0.1920749 ]
In [13]:
pca_X = pca.fit_transform(X)
In [16]:
pca_X.shape
Out[16]:
(178, 2)
In [26]:
df_pca = pd.DataFrame(np.concatenate([pca_X, model.labels_.reshape(-1, 1)], axis=1),
                    columns=["PCA_1", "PCA_2", "cluster"])
df_pca["cluster"] = df_pca["cluster"].astype(np.int32)
df_pca.head()
Out[26]:
PCA_1 PCA_2 cluster
0 3.316751 -1.443463 2
1 2.209465 0.333393 2
2 2.516740 -1.031151 2
3 3.757066 -2.756372 2
4 1.008908 -0.869831 0
In [32]:
sns.scatterplot(x="PCA_1", y="PCA_2", hue="cluster", style="cluster", data=df_pca)
plt.xlabel("PCA_1")
plt.ylabel("PCA_2")
plt.title("K-means Clustering of Wine Data")
plt.show()