Session 7 - solutions

In [1]:
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")

from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine = pandas.read_csv(url, header = None, sep = ",")
wine.columns = ["class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium", 
                "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins", 
                "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline"]
wine
Out[2]:
class Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
173 3 13.71 5.65 2.45 20.5 95 1.68 0.61 0.52 1.06 7.70 0.64 1.74 740
174 3 13.40 3.91 2.48 23.0 102 1.80 0.75 0.43 1.41 7.30 0.70 1.56 750
175 3 13.27 4.28 2.26 20.0 120 1.59 0.69 0.43 1.35 10.20 0.59 1.56 835
176 3 13.17 2.59 2.37 20.0 120 1.65 0.68 0.53 1.46 9.30 0.60 1.62 840
177 3 14.13 4.10 2.74 24.5 96 2.05 0.76 0.56 1.35 9.20 0.61 1.60 560

178 rows × 14 columns

In [3]:
wine2 = wine.drop(columns = "class")
wine2
Out[3]:
Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
2 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
3 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
4 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735
... ... ... ... ... ... ... ... ... ... ... ... ... ...
173 13.71 5.65 2.45 20.5 95 1.68 0.61 0.52 1.06 7.70 0.64 1.74 740
174 13.40 3.91 2.48 23.0 102 1.80 0.75 0.43 1.41 7.30 0.70 1.56 750
175 13.27 4.28 2.26 20.0 120 1.59 0.69 0.43 1.35 10.20 0.59 1.56 835
176 13.17 2.59 2.37 20.0 120 1.65 0.68 0.53 1.46 9.30 0.60 1.62 840
177 14.13 4.10 2.74 24.5 96 2.05 0.76 0.56 1.35 9.20 0.61 1.60 560

178 rows × 13 columns

Hierarchical agglomerative clustering (CAH)
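In everything that follows, the variables are standardized with scale before clustering: the raw columns live on very different scales (Proline is in the hundreds while Hue is around 1), so unscaled Euclidean distances would be dominated by a handful of variables. A quick way to see this (a sketch, not part of the original correction):

# Means and standard deviations of the raw variables; Proline and Magnesium
# are orders of magnitude larger than the others, hence the call to scale()
wine2.describe().loc[["mean", "std"]].round(2)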

In [4]:
# distance_threshold=0 with n_clusters=None builds the full tree and makes
# hac.distances_ available, which the dendrogram plot below relies on
hac = AgglomerativeClustering(distance_threshold = 0, n_clusters = None)
hac.fit(scale(wine2))
Out[4]:
AgglomerativeClustering(distance_threshold=0, n_clusters=None)
In [5]:
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = numpy.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = numpy.column_stack([model.children_, model.distances_, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
In [6]:
plt.figure(figsize = (16, 8))
plt.title("CAH (Ward)")
plot_dendrogram(hac)
plt.axhline(y = 20, linewidth = .5, color = "dimgray", linestyle = "--")
plt.show()
[Figure: Ward dendrogram of the scaled wine data ("CAH (Ward)"), with a dashed horizontal cut at height 20]

The dendrogram suggests 3 clusters.
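The dashed line drawn at height 20 cuts the tree into three branches. As a sketch (not in the original correction), the same three-way cut should be obtainable by fitting with a distance threshold instead of a fixed number of clusters:

# Undo every merge above the dashed height (20); this should leave 3 clusters here
hac_cut = AgglomerativeClustering(distance_threshold = 20, n_clusters = None)
hac_cut.fit(scale(wine2))
hac_cut.n_clusters_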

In [7]:
hac3 = AgglomerativeClustering(n_clusters = 3)
hac3.fit(scale(wine2))
Out[7]:
AgglomerativeClustering(n_clusters=3)

From the group means and box plots below, we can characterize the clusters (keeping only the salient points; a standardized view of the group means is sketched after this list):

  • 0: low alcohol, low magnesium, low color intensity and low Proline
  • 1: (very) high malic acid; low phenols, flavanoids and proanthocyanins; pale hue; high color intensity; (very) low OD280/OD315 values
  • 2: low alcalinity of ash; high phenols, flavanoids and proanthocyanins; high Proline values
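One way to make these contrasts easier to read (a sketch, not part of the original correction) is to compute the cluster means on the standardized variables, so each value reads as a deviation from the overall mean in standard deviations:

# Cluster means on the standardized variables: values far from 0 flag the
# variables that most distinguish each cluster
wine2_scaled = pandas.DataFrame(scale(wine2), columns = wine2.columns)
wine2_scaled.assign(classe = hac3.labels_).groupby("classe").mean().round(2)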
In [8]:
wine2.assign(classe = hac3.labels_).groupby("classe").mean().round(2)
Out[8]:
Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
classe
0 12.20 1.94 2.22 20.21 92.55 2.26 2.09 0.36 1.69 2.90 1.06 2.86 501.43
1 13.06 3.17 2.41 21.00 99.86 1.69 0.85 0.45 1.13 6.85 0.72 1.73 624.95
2 13.67 1.97 2.46 17.53 106.16 2.85 3.01 0.29 1.91 5.45 1.07 3.16 1076.05
In [9]:
g = seaborn.catplot(
    data = wine2.assign(classe = [str(v) for v in hac3.labels_]) \
                .melt(id_vars = "classe"),
    x = "value", y = "classe",
    kind = "box",
    col = "variable", col_wrap = 2, sharex = False,
    height = 2, aspect = 5
)
g.set_titles(col_template = "{col_name}", fontweight = "bold", size = 24)
g.set_xticklabels(size = 18)
g.tight_layout()
g.set_axis_labels(x_var = "")
plt.show()
[Figure: box plots of each variable by HAC cluster (classe 0, 1, 2)]
In [10]:
pandas.crosstab(hac3.labels_, wine["class"])
Out[10]:
class 1 2 3
row_0
0 0 58 0
1 0 8 48
2 59 5 0

Running $k$-means

In [11]:
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters = k, init = "random", n_init = 20).fit(scale(wine2))
    inertia = inertia + [kmeans.inertia_]
rsquare = [(inertia[0] - i) / inertia[0] for i in inertia]
criteres = pandas.DataFrame({
    "k": range(1, 11), 
    "inertia": inertia,
    "rsquare": rsquare,
    "pseudof": [(rsquare[k-1] / (k - 1)) / ((1 - rsquare[k-1]) / (wine2.shape[0] - k)) if k > 1 else None for k in range(1, 11)]
})
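The two derived criteria follow directly from the within-cluster inertia $I(k)$ returned by kmeans.inertia_, with $n = 178$ observations:

$$R^2(k) = \frac{I(1) - I(k)}{I(1)}, \qquad \text{pseudo-}F(k) = \frac{R^2(k)/(k-1)}{\bigl(1 - R^2(k)\bigr)/(n - k)}$$

An elbow in the inertia, a levelling-off of $R^2$ and a high pseudo-$F$ all point towards a reasonable number of clusters.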
In [12]:
g = seaborn.FacetGrid(data = criteres.melt(id_vars = "k"), col = "variable", sharey = False, 
                      height = 5, aspect = 1)
g.map_dataframe(seaborn.lineplot, x = "k", y = "value")
g.map(plt.axvline, x = 3, linewidth = .5, color = "dimgray", linestyle = "--")
g.add_legend()
plt.show()
[Figure: inertia, R² and pseudo-F as a function of k, with a dashed vertical line at k = 3]

Here too, we choose 3 clusters.

In [13]:
kmeans3 = KMeans(n_clusters = 3, n_init = 20)
kmeans3.fit(scale(wine2))
Out[13]:
KMeans(n_clusters=3, n_init=20)

We recover essentially the same clusters (not necessarily in the same order); the sketch below shows one way to align the labels.
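As a sketch (not part of the original correction), each k-means cluster can be renamed after the HAC cluster it overlaps the most, so both partitions use the same numbering:

# Map each k-means label to the HAC label it most often co-occurs with
overlap = pandas.crosstab(kmeans3.labels_, hac3.labels_)
relabel = overlap.idxmax(axis = 1)          # k-means label -> dominant HAC label
kmeans_aligned = numpy.array([relabel[v] for v in kmeans3.labels_])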

In [14]:
wine2.assign(classe = kmeans3.labels_).groupby("classe").mean().round(2)
Out[14]:
Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
classe
0 13.13 3.31 2.42 21.24 98.67 1.68 0.82 0.45 1.15 7.23 0.69 1.70 619.06
1 13.68 2.00 2.47 17.46 107.97 2.85 3.00 0.29 1.92 5.45 1.07 3.16 1100.23
2 12.25 1.90 2.23 20.06 92.74 2.25 2.05 0.36 1.62 2.97 1.06 2.80 510.17
In [15]:
g = seaborn.catplot(
    data = wine2.assign(classe = [str(v) for v in kmeans3.labels_]) \
                .melt(id_vars = "classe"),
    x = "value", y = "classe",
    kind = "box",
    col = "variable", col_wrap = 2, sharex = False,
    height = 2, aspect = 5
)
g.set_titles(col_template = "{col_name}", fontweight = "bold", size = 24)
g.set_xticklabels(size = 18)
g.tight_layout()
g.set_axis_labels(x_var = "")
plt.show()
[Figure: box plots of each variable by k-means cluster (classe 0, 1, 2)]
In [16]:
pandas.crosstab(kmeans3.labels_, wine["class"])
Out[16]:
class 1 2 3
row_0
0 0 3 48
1 59 3 0
2 0 65 0

Comparing the two partitions

In [17]:
tab = pandas.crosstab(kmeans3.labels_, hac3.labels_)
tab.index.name = "Kmeans"
tab.columns.name = "HAC"
tab
Out[17]:
HAC 0 1 2
Kmeans
0 0 51 0
1 1 0 61
2 57 5 3
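As a complement (not in the original correction), the agreement between the two partitions can also be summarized by scikit-learn's adjusted Rand index, which is invariant to label permutations and equals 1 for identical partitions:

# ARI between the HAC and k-means partitions; values close to 1 mean the two
# clusterings group the wines almost identically
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(hac3.labels_, kmeans3.labels_)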

Representation on the PCA factorial plane

In [18]:
from sklearn.decomposition import PCA

pca = PCA(n_components = 2)
pca.fit(scale(wine2))
Out[18]:
PCA(n_components=2)
In [19]:
wine_pca_row = pandas.DataFrame(pca.transform(scale(wine2)), columns = ["Dim1", "Dim2"])
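Before interpreting the 2-D scatter plots, it can be worth checking how much of the total (standardized) variance the first factorial plane retains; a quick check, not part of the original correction:

# Share of the variance carried by the first two principal components;
# their sum tells how faithful the 2-D view is
pca.explained_variance_ratio_.round(3)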
In [20]:
g_pca = seaborn.lmplot(
    data = wine_pca_row.assign(classe = hac3.labels_), 
    x = "Dim1", y = "Dim2", hue = "classe", fit_reg = False, 
    height = 6, aspect = 2)

g_pca.fig.suptitle("Avec CAH")

plt.show()
[Figure: observations on the first PCA plane (Dim1, Dim2), colored by HAC cluster]
In [21]:
g_pca = seaborn.lmplot(
    data = wine_pca_row.assign(classe = kmeans3.labels_), 
    x = "Dim1", y = "Dim2", hue = "classe", fit_reg = False, 
    height = 6, aspect = 2)

g_pca.fig.suptitle("Avec k-means")

plt.show()
[Figure: observations on the first PCA plane (Dim1, Dim2), colored by k-means cluster]