import numpy as np
from scipy import stats
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer

col_names = [
"Class",
"Alcohol",
"Malic acid",
"Ash",
"Alcalinity of ash",
"Magnesium",
"Total phenols",
"Flavanoids",
"Nonflavanoid phenols",
"Proanthocyanins",
"Color intensity",
"Hue",
"OD280/OD315 of diluted wines",
"Proline"]


df = pd.read_csv("wine.data", names=col_names)

df.head()

df.shape

(178, 14)

sns.heatmap(df.iloc[:, 1:14].corr())

<matplotlib.axes._subplots.AxesSubplot at 0x24ea9e3a320>

df.iloc[:, 1:14] = stats.zscore(df.iloc[:, 1:14])

df.head()

model = KMeans()
visualizer = KElbowVisualizer(model, k=(2, 12 + 1))

X = df.iloc[:, 1:14].values

visualizer.fit(X)
visualizer.show()

<matplotlib.axes._subplots.AxesSubplot at 0x24eac197c50>

model = KMeans(n_clusters=4)
model.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

pca = PCA(n_components=2)
pca.fit(X)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

print(pca.explained_variance_ratio_)

[0.36198848 0.1920749 ]

pca_X = pca.fit_transform(X)

pca_X.shape

(178, 2)

df_pca = pd.DataFrame(np.concatenate([pca_X, model.labels_.reshape(-1, 1)], axis=1),
                    columns=["PCA_1", "PCA_2", "cluster"])
df_pca["cluster"] = df_pca["cluster"].astype(np.int32)
df_pca.head()

sns.scatterplot(x="PCA_1", y="PCA_2", hue="cluster", style="cluster", data=df_pca)
plt.xlabel("PCA_1")
plt.ylabel("PCA_2")
plt.title("K-means Clustering of Wine Data")
plt.show()

	Class	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
0	1	14.23	1.71	2.43	15.6	127	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065
1	1	13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050
2	1	13.16	2.36	2.67	18.6	101	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185
3	1	14.37	1.95	2.50	16.8	113	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480
4	1	13.24	2.59	2.87	21.0	118	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735

	Class	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
0	1	1.518613	-0.562250	0.232053	-1.169593	1.913905	0.808997	1.034819	-0.659563	1.224884	0.251717	0.362177	1.847920	1.013009
1	1	0.246290	-0.499413	-0.827996	-2.490847	0.018145	0.568648	0.733629	-0.820719	-0.544721	-0.293321	0.406051	1.113449	0.965242
2	1	0.196879	0.021231	1.109334	-0.268738	0.088358	0.808997	1.215533	-0.498407	2.135968	0.269020	0.318304	0.788587	1.395148
3	1	1.691550	-0.346811	0.487926	-0.809251	0.930918	2.491446	1.466525	-0.981875	1.032155	1.186068	-0.427544	1.184071	2.334574
4	1	0.295700	0.227694	1.840403	0.451946	1.281985	0.808997	0.663351	0.226796	0.401404	-0.319276	0.362177	0.449601	-0.037874

	PCA_1	PCA_2	cluster
0	3.316751	-1.443463	2
1	2.209465	0.333393	2
2	2.516740	-1.031151	2
3	3.757066	-2.756372	2
4	1.008908	-0.869831	0