Session 7 - solutions
In [1]:
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
In [2]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine = pandas.read_csv(url, header = None, sep = ",")
wine.columns = ["class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash", "Magnesium",
                "Total phenols", "Flavanoids", "Nonflavanoid phenols", "Proanthocyanins",
                "Color intensity", "Hue", "OD280/OD315 of diluted wines", "Proline"]
wine
Out[2]:
class | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173 | 3 | 13.71 | 5.65 | 2.45 | 20.5 | 95 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740 |
174 | 3 | 13.40 | 3.91 | 2.48 | 23.0 | 102 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750 |
175 | 3 | 13.27 | 4.28 | 2.26 | 20.0 | 120 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835 |
176 | 3 | 13.17 | 2.59 | 2.37 | 20.0 | 120 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840 |
177 | 3 | 14.13 | 4.10 | 2.74 | 24.5 | 96 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560 |
178 rows × 14 columns
In [3]:
wine2 = wine.drop(columns = "class")
wine2
Out[3]:
Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
2 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
3 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
4 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173 | 13.71 | 5.65 | 2.45 | 20.5 | 95 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740 |
174 | 13.40 | 3.91 | 2.48 | 23.0 | 102 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750 |
175 | 13.27 | 4.28 | 2.26 | 20.0 | 120 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835 |
176 | 13.17 | 2.59 | 2.37 | 20.0 | 120 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840 |
177 | 14.13 | 4.10 | 2.74 | 24.5 | 96 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560 |
178 rows × 13 columns
Agglomerative hierarchical clustering (CAH)
In [4]:
hac = AgglomerativeClustering(distance_threshold = 0, n_clusters = None)
hac.fit(scale(wine2))
Out[4]:
AgglomerativeClustering(distance_threshold=0, n_clusters=None)
In [5]:
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram
    # create the counts of samples under each node
    counts = numpy.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = numpy.column_stack([model.children_, model.distances_, counts]).astype(float)
    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)
In [6]:
plt.figure(figsize = (16, 8))
plt.title("CAH (Ward)")
plot_dendrogram(hac)
plt.axhline(y = 20, linewidth = .5, color = "dimgray", linestyle = "--")
plt.show()
The dendrogram suggests 3 clusters (cut marked by the dashed line at height 20).
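As a quick cross-check (a sketch not in the original cells; the names Z and labels_cut are illustrative), the same Ward tree can be cut at 3 clusters directly with SciPy:

from scipy.cluster.hierarchy import linkage, fcluster
# Rebuild the Ward linkage on the standardized data and cut the tree at 3 clusters
Z = linkage(scale(wine2), method = "ward")
labels_cut = fcluster(Z, t = 3, criterion = "maxclust")  # labels in {1, 2, 3}
numpy.bincount(labels_cut)[1:]  # size of each cluster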
In [7]:
hac3 = AgglomerativeClustering(n_clusters = 3)
hac3.fit(scale(wine2))
Out[7]:
AgglomerativeClustering(n_clusters=3)
We can then characterize the clusters from the group means below (keeping only the salient points):
- 0: low alcohol, low magnesium, low color intensity and low Proline values
- 1: (very) high malic acid, low phenols, flavanoids and proanthocyanins, pale hue, high color intensity, and (very) low OD280/OD315 values
- 2: low alcalinity of ash, high phenols, flavanoids and proanthocyanins, high Proline values
In [8]:
wine2.assign(classe = hac3.labels_).groupby("classe").mean().round(2)
Out[8]:
Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
classe | |||||||||||||
0 | 12.20 | 1.94 | 2.22 | 20.21 | 92.55 | 2.26 | 2.09 | 0.36 | 1.69 | 2.90 | 1.06 | 2.86 | 501.43 |
1 | 13.06 | 3.17 | 2.41 | 21.00 | 99.86 | 1.69 | 0.85 | 0.45 | 1.13 | 6.85 | 0.72 | 1.73 | 624.95 |
2 | 13.67 | 1.97 | 2.46 | 17.53 | 106.16 | 2.85 | 3.01 | 0.29 | 1.91 | 5.45 | 1.07 | 3.16 | 1076.05 |
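As a complement (a sketch not in the original cells; wine2_std is an illustrative name), computing the same group means on the standardized variables shows directly which features deviate from the overall mean, which is what the characterization above relies on:

# Group means in standard-deviation units (0 = overall mean of the variable)
wine2_std = pandas.DataFrame(scale(wine2), columns = wine2.columns)
wine2_std.assign(classe = hac3.labels_).groupby("classe").mean().round(2)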
In [9]:
g = seaborn.catplot(
    data = wine2.assign(classe = [str(v) for v in hac3.labels_]) \
                .melt(id_vars = "classe"),
    x = "value", y = "classe",
    kind = "box",
    col = "variable", col_wrap = 2, sharex = False,
    height = 2, aspect = 5
)
g.set_titles(col_template = "{col_name}", fontweight = "bold", size = 24)
g.set_xticklabels(size = 18)
g.tight_layout()
g.set_axis_labels(x_var = "")
plt.show()
In [10]:
pandas.crosstab(hac3.labels_, wine["class"])
Out[10]:
class | 1 | 2 | 3 |
---|---|---|---|
row_0 | |||
0 | 0 | 58 | 0 |
1 | 0 | 8 | 48 |
2 | 59 | 5 | 0 |
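The cross-tabulation shows that the CAH partition closely matches the original `class` variable. A single-number summary, invariant to how the clusters are numbered, is the adjusted Rand index (a sketch not in the original cells):

from sklearn.metrics import adjusted_rand_score
# 1 = identical partitions, values near 0 = agreement expected by chance
adjusted_rand_score(wine["class"], hac3.labels_)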
Performing $k$-means
In [11]:
inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters = k, init = "random", n_init = 20).fit(scale(wine2))
    inertia = inertia + [kmeans.inertia_]
rsquare = [(inertia[0] - i) / inertia[0] for i in inertia]
criteres = pandas.DataFrame({
    "k": range(1, 11),
    "inertia": inertia,
    "rsquare": rsquare,
    "pseudof": [(rsquare[k-1] / (k - 1)) / ((1 - rsquare[k-1]) / (wine2.shape[0] - k)) if k > 1 else None for k in range(1, 11)]
})
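For reference, with $I_k$ the within-cluster inertia returned by `kmeans.inertia_` for $k$ clusters and $n = 178$ observations, the criteria computed in this cell are

$$R^2(k) = \frac{I_1 - I_k}{I_1}, \qquad \text{pseudo-}F(k) = \frac{R^2(k) / (k - 1)}{\bigl(1 - R^2(k)\bigr) / (n - k)}.$$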
In [12]:
g = seaborn.FacetGrid(data = criteres.melt(id_vars = "k"), col = "variable", sharey = False,
                      height = 5, aspect = 1)
g.map_dataframe(seaborn.lineplot, x = "k", y = "value")
g.map(plt.axvline, x = 3, linewidth = .5, color = "dimgray", linestyle = "--")
g.add_legend()
plt.show()
Here too, we choose 3 clusters (dashed line at $k = 3$).
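Another criterion that could be examined (a sketch not in the original cells; X is an illustrative name) is the mean silhouette score, computed here for a few values of $k$ (higher is better):

from sklearn.metrics import silhouette_score
X = scale(wine2)
for k in range(2, 7):
    labels = KMeans(n_clusters = k, n_init = 20).fit_predict(X)
    print(k, round(silhouette_score(X, labels), 3))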
In [13]:
kmeans3 = KMeans(n_clusters = 3, n_init = 20)
kmeans3.fit(scale(wine2))
Out[13]:
KMeans(n_clusters=3, n_init=20)
We recover essentially the same clusters (not necessarily numbered in the same order).
In [14]:
wine2.assign(classe = kmeans3.labels_).groupby("classe").mean().round(2)
Out[14]:
Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
classe | |||||||||||||
0 | 13.13 | 3.31 | 2.42 | 21.24 | 98.67 | 1.68 | 0.82 | 0.45 | 1.15 | 7.23 | 0.69 | 1.70 | 619.06 |
1 | 13.68 | 2.00 | 2.47 | 17.46 | 107.97 | 2.85 | 3.00 | 0.29 | 1.92 | 5.45 | 1.07 | 3.16 | 1100.23 |
2 | 12.25 | 1.90 | 2.23 | 20.06 | 92.74 | 2.25 | 2.05 | 0.36 | 1.62 | 2.97 | 1.06 | 2.80 | 510.17 |
In [15]:
g = seaborn.catplot(
    data = wine2.assign(classe = [str(v) for v in kmeans3.labels_]) \
                .melt(id_vars = "classe"),
    x = "value", y = "classe",
    kind = "box",
    col = "variable", col_wrap = 2, sharex = False,
    height = 2, aspect = 5
)
g.set_titles(col_template = "{col_name}", fontweight = "bold", size = 24)
g.set_xticklabels(size = 18)
g.tight_layout()
g.set_axis_labels(x_var = "")
plt.show()
In [16]:
pandas.crosstab(kmeans3.labels_, wine["class"])
Out[16]:
class | 1 | 2 | 3 |
---|---|---|---|
row_0 | |||
0 | 0 | 3 | 48 |
1 | 59 | 3 | 0 |
2 | 0 | 65 | 0 |
Comparing the two partitions
In [17]:
tab = pandas.crosstab(kmeans3.labels_, hac3.labels_)
tab.index.name = "Kmeans"
tab.columns.name = "HAC"
tab
Out[17]:
HAC | 0 | 1 | 2 |
---|---|---|---|
Kmeans | |||
0 | 0 | 51 | 0 |
1 | 1 | 0 | 61 |
2 | 57 | 5 | 3 |
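The two partitions coincide up to a relabelling of the clusters. As above, the adjusted Rand index (a sketch not in the original cells) summarizes this agreement independently of the cluster numbering:

from sklearn.metrics import adjusted_rand_score
# Agreement between the k-means and CAH partitions, regardless of label order
adjusted_rand_score(kmeans3.labels_, hac3.labels_)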
Representation on the first PCA plane
In [18]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
pca.fit(scale(wine2))
Out[18]:
PCA(n_components=2)
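Before reading the projections, it can be worth checking how much variance the first two components retain (a quick sketch, not in the original cells):

# Proportion of variance explained by each of the two retained components
pca.explained_variance_ratio_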
In [19]:
wine_pca_row = pandas.DataFrame(pca.transform(scale(wine2)), columns = ["Dim1", "Dim2"])
In [20]:
g_pca = seaborn.lmplot(
    data = wine_pca_row.assign(classe = hac3.labels_),
    x = "Dim1", y = "Dim2", hue = "classe", fit_reg = False,
    height = 6, aspect = 2)
g_pca.fig.suptitle("Avec CAH")
plt.show()
In [21]:
g_pca = seaborn.lmplot(
    data = wine_pca_row.assign(classe = kmeans3.labels_),
    x = "Dim1", y = "Dim2", hue = "classe", fit_reg = False,
    height = 6, aspect = 2)
g_pca.fig.suptitle("Avec k-means")
plt.show()