import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# Load the UCI Wine data set. The raw file has no header row, so the column
# names are supplied explicitly; the first column is named "class".
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine_column_names = [
    "class",
    "Alcohol",
    "Malic acid",
    "Ash",
    "Alcalinity of ash",
    "Magnesium",
    "Total phenols",
    "Flavanoids",
    "Nonflavanoid phenols",
    "Proanthocyanins",
    "Color intensity",
    "Hue",
    "OD280/OD315 of diluted wines",
    "Proline",
]
wine = pandas.read_csv(url, sep=",", header=None, names=wine_column_names)
wine

# Bar chart of the number of rows for each value of the "class" column.
seaborn.countplot(data=wine, x="class")
plt.show()

# One horizontal boxplot per variable. The table is first reshaped to long
# format (columns "class", "variable", "value") so that seaborn can facet on
# "variable"; sharex=False gives every panel its own x scale.
wine_long = wine.melt(id_vars="class")
g = seaborn.catplot(
    data=wine_long,
    x="value",
    col="variable",
    col_wrap=2,
    kind="box",
    height=2,
    aspect=5,
    sharex=False,
)
g.set_titles(col_template="{col_name}", fontweight="bold", size=24)
g.set_xticklabels(size=18)
g.tight_layout()
g.set_axis_labels(x_var="")
plt.show()

# Pairwise correlations between the variables (the "class" column is
# excluded), drawn as a heatmap on a fixed [-1, 1] colour scale.
df_cor = wine.drop(columns="class").corr()
seaborn.heatmap(df_cor, vmin=-1, vmax=1, cmap="RdYlGn")
plt.show()

# List the strongly correlated pairs (|r| > 0.5), strongest positive first.
# Keeping only the rows whose first name sorts before the second removes the
# diagonal and one copy of each symmetric pair explicitly. This avoids relying
# on the diagonal being exactly 1.0 or on mirrored pairs ending up adjacent
# after sorting.
cor_pairs = df_cor.reset_index().melt(id_vars="index")
cor_pairs = cor_pairs[cor_pairs["index"] < cor_pairs["variable"]]
cor_pairs[cor_pairs["value"].abs() > .5].sort_values("value", ascending=False)

# Scale the explanatory variables (everything except "class") with
# sklearn's `scale`, then fit a PCA that keeps all 13 components.
explanatory = wine.drop(columns="class")
wine_bis = scale(explanatory)
pca = PCA(n_components=13)
pca.fit(wine_bis)

PCA(n_components=13)

PCA(n_components=13)

# Summary table of the fitted PCA: for each component, its eigenvalue, its
# share of the variance (rounded to a whole percent) and the cumulative share.
# The labels are generated from the number of components actually fitted, so
# the table stays consistent if `n_components` is changed.
n_dimensions = len(pca.explained_variance_)
eig = pandas.DataFrame(
    {
        "Dimension": [f"Dim{k + 1}" for k in range(n_dimensions)],
        "Valeur propre": pca.explained_variance_,
        "% variance expliquée": numpy.round(pca.explained_variance_ratio_ * 100),
        "% cum. var. expliquée": numpy.round(numpy.cumsum(pca.explained_variance_ratio_) * 100),
    },
    columns=["Dimension", "Valeur propre", "% variance expliquée", "% cum. var. expliquée"],
)
eig

# Scree plot: percentage of variance explained by each dimension.
plt.figure(figsize=(16, 6))
g_eig = seaborn.barplot(
    data=eig,
    x="Dimension",
    y="% variance expliquée",
    color="lightseagreen",
)
# Reference threshold: the share each dimension would explain if the variance
# were spread evenly (100 / number of dimensions). It is derived from the
# table rather than hard-coded, and labelled above the last bar.
n_dims = len(eig)
seuil = 100 / n_dims
plt.text(n_dims - 1, seuil + 1, "Seuil")
plt.axhline(y=seuil, linewidth=.5, color="dimgray", linestyle="--")
g_eig.set(ylabel="Variance expliquée (%)")
g_eig.figure.suptitle("Variance expliquée par dimension")

plt.show()

# Coordinates of the rows on the 13 principal components ("Dim1".."Dim13"),
# with the original "class" column attached as "Class" for colouring.
dim_labels = ["Dim" + str(k) for k in range(1, 14)]
wine_pca_df = pandas.DataFrame(pca.transform(wine_bis), columns=dim_labels)
wine_pca_df["Class"] = wine["class"]

# Scatter plot of the rows on the first two dimensions, coloured by class.
fig, axes = plt.subplots(figsize=(16, 8))
fig.suptitle("Premier plan factoriel")

# Dashed light-gray reference lines through the origin.
origin_line_style = {"color": "lightgray", "linestyle": "--", "linewidth": 1}
for draw_line in (axes.axvline, axes.axhline):
    draw_line(0, **origin_line_style)

g_pca = seaborn.scatterplot(
    data=wine_pca_df,
    x="Dim1",
    y="Dim2",
    hue="Class",
    palette="Set1",
    s=100,
    ax=axes,
)
g_pca.set(xlabel="Dimension 1", ylabel="Dimension 2")

plt.show()

# Coordinates of the variables on the components: each component's vector
# from `pca.components_` multiplied by the square root of its explained
# variance. Rows are the variables, columns are "PC1".."PC13"; these are the
# coordinates plotted on the correlation circles below.
component_std = numpy.sqrt(pca.explained_variance_)
coordvar = pca.components_.T * component_std
coordvar_df = pandas.DataFrame(
    coordvar,
    index=wine.columns[1:],
    columns=[f"PC{k}" for k in range(1, 14)],
)

# Correlation circle for the first two components: every variable is drawn as
# a dashed segment from the origin to its (PC1, PC2) coordinates, labelled
# with its name, inside a unit circle.
fig, axes = plt.subplots(figsize=(10, 10))
fig.suptitle("Cercle des corrélations")
axes.set(xlim=(-1, 1), ylim=(-1, 1))
axes.axvline(x=0, color="lightgray", linestyle="--", linewidth=1)
axes.axhline(y=0, color="lightgray", linestyle="--", linewidth=1)

variable_coords = zip(coordvar_df.index, coordvar_df["PC1"], coordvar_df["PC2"])
for var_name, x_coord, y_coord in variable_coords:
    axes.text(x_coord, y_coord, var_name, size=25)
    axes.plot([0, x_coord], [0, y_coord], color="gray", linestyle="dashed")

unit_circle = plt.Circle((0, 0), 1, color="blue", fill=False)
axes.add_artist(unit_circle)

plt.show()

def cercle(ax, d1, d2):
    """Draw the variables of a correlation-circle plot on ``ax``.

    Both axes are limited to [-1, 1] and dashed light-gray reference lines are
    drawn through the origin. Each variable is then drawn as a dashed gray
    segment from the origin to its coordinates, with its label written at the
    tip. The unit circle itself is not drawn.

    Args:
        ax: Axes-like object providing ``set_xlim``, ``set_ylim``,
            ``axvline``, ``axhline``, ``text`` and ``plot``.
        d1: pandas Series of horizontal coordinates; its index supplies the
            variable labels.
        d2: Vertical coordinates, matched to ``d1`` by position.
    """
    ax.set_xlim(-1, 1)
    ax.set_ylim(-1, 1)
    ax.axvline(x=0, color="lightgray", linestyle="--", linewidth=1)
    ax.axhline(y=0, color="lightgray", linestyle="--", linewidth=1)
    # Iterate labels and coordinates together instead of indexing by position.
    for label, x, y in zip(d1.index, d1, d2):
        ax.text(x, y, label)
        ax.plot([0, x], [0, y], color="gray", linestyle="dashed")

# Correlation circles for every pair (i, j), i < j, among the first
# `nb_composantes` components, laid out on a single figure.
plt.figure(figsize=(16, 16))

nb_composantes = 3

for i in range(1, nb_composantes):
    for j in range(i + 1, nb_composantes + 1):
        # Pair (i, j) goes to row i-1, column j-2 of an (n-1) x (n-1) grid,
        # so the panels fill the upper triangle of the grid.
        ax = plt.subplot2grid((nb_composantes - 1, nb_composantes - 1), (i - 1, j - 2))
        cercle(ax, coordvar_df["PC" + str(i)], coordvar_df["PC" + str(j)])
        ax.text(-.9, -.9, "Dim: " + str(i) + " - " + str(j))

# Display the figure, as every other plotting section of this file does;
# without it the figure is never shown when the code is run as a script.
plt.show()

# Scatter plots of the rows for every pair (i, j), i < j, among the first
# `nb_composantes` dimensions, using the same grid layout as the circles above.
plt.figure(figsize=(16, 16))

for i in range(1, nb_composantes):
    for j in range(i + 1, nb_composantes + 1):
        ax = plt.subplot2grid((nb_composantes - 1, nb_composantes - 1), (i - 1, j - 2))
        ax.axvline(x=0, color="lightgray", linestyle="--", linewidth=1)
        ax.axhline(y=0, color="lightgray", linestyle="--", linewidth=1)
        # Bind the scatter plot to this subplot explicitly rather than relying
        # on it being the current axes.
        seaborn.scatterplot(
            x="Dim" + str(i),
            y="Dim" + str(j),
            hue="Class",
            data=wine_pca_df,
            palette="Set1",
            legend=False,
            ax=ax,
        )

# Display the figure, as every other plotting section of this file does;
# without it the figure is never shown when the code is run as a script.
plt.show()

# Five smallest "Ash" values, with their row labels.
wine["Ash"].sort_values(ascending=True).head(5)

59     1.36
66     1.70
100    1.70
76     1.71
69     1.75
Name: Ash, dtype: float64

# Five largest "Ash" values, with their row labels.
wine["Ash"].sort_values(ascending=False).head(5)

121    3.23
25     3.22
112    2.92
4      2.87
169    2.86
Name: Ash, dtype: float64

# Boxplot of "Ash" on its own figure. Opening a new figure keeps the plot from
# being drawn onto whatever axes were current before, and plt.show() displays
# it when the code is run as a script.
plt.figure()
seaborn.boxplot(data=wine, x="Ash")
plt.show()

<Axes: xlabel='Ash'>

# Five smallest "Alcalinity of ash" values, with their row labels.
wine["Alcalinity of ash"].sort_values(ascending=True).head(5)

59    10.6
1     11.2
13    11.4
14    12.0
50    12.4
Name: Alcalinity of ash, dtype: float64

# Five largest "Alcalinity of ash" values, with their row labels.
wine["Alcalinity of ash"].sort_values(ascending=False).head(5)

73     30.0
121    28.5
127    28.5
157    27.0
122    26.5
Name: Alcalinity of ash, dtype: float64

# Boxplot of "Alcalinity of ash" on its own figure. Opening a new figure keeps
# the plot from being drawn onto whatever axes were current before, and
# plt.show() displays it when the code is run as a script.
plt.figure()
seaborn.boxplot(data=wine, x="Alcalinity of ash")
plt.show()

<Axes: xlabel='Alcalinity of ash'>

	class	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
0	1	14.23	1.71	2.43	15.6	127	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065
1	1	13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050
2	1	13.16	2.36	2.67	18.6	101	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185
3	1	14.37	1.95	2.50	16.8	113	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480
4	1	13.24	2.59	2.87	21.0	118	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
173	3	13.71	5.65	2.45	20.5	95	1.68	0.61	0.52	1.06	7.70	0.64	1.74	740
174	3	13.40	3.91	2.48	23.0	102	1.80	0.75	0.43	1.41	7.30	0.70	1.56	750
175	3	13.27	4.28	2.26	20.0	120	1.59	0.69	0.43	1.35	10.20	0.59	1.56	835
176	3	13.17	2.59	2.37	20.0	120	1.65	0.68	0.53	1.46	9.30	0.60	1.62	840
177	3	14.13	4.10	2.74	24.5	96	2.05	0.76	0.56	1.35	9.20	0.61	1.60	560

	Dimension	Valeur propre	% variance expliquée	% cum. var. expliquée
0	Dim1	4.732437	36.0	36.0
1	Dim2	2.511081	19.0	55.0
2	Dim3	1.454242	11.0	67.0
3	Dim4	0.924166	7.0	74.0
4	Dim5	0.858049	7.0	80.0
5	Dim6	0.645282	5.0	85.0
6	Dim7	0.554141	4.0	89.0
7	Dim8	0.350466	3.0	92.0
8	Dim9	0.290512	2.0	94.0
9	Dim10	0.252320	2.0	96.0
10	Dim11	0.227064	2.0	98.0
11	Dim12	0.169724	1.0	99.0
12	Dim13	0.103962	1.0	100.0

Séance 3 - correction

Données `Wine`

Décrire les données

Réaliser une ACP

Choix du nombre d'axes

Création des graphiques

Nuage de points sur le premier plan factoriel

Cercle de corrélations

Et si on voulait regarder plus loin, sur les axes 3, 4 et 5 aussi

Pour les variables

Pour les individus

	index	variable	value
71	Flavanoids	Total phenols	0.864564
89	OD280/OD315 of diluted wines	Flavanoids	0.787194
148	Total phenols	OD280/OD315 of diluted wines	0.699949
86	Proanthocyanins	Flavanoids	0.652692
156	Alcohol	Proline	0.643720
73	Proanthocyanins	Total phenols	0.612413
153	Hue	OD280/OD315 of diluted wines	0.565468
9	Color intensity	Alcohol	0.546364
136	Flavanoids	Hue	0.543479
115	OD280/OD315 of diluted wines	Proanthocyanins	0.519067
150	Nonflavanoid phenols	OD280/OD315 of diluted wines	-0.503270
139	Color intensity	Hue	-0.521813
85	Nonflavanoid phenols	Flavanoids	-0.537900
23	Hue	Malic acid	-0.561296

Séance 3 - correction

Données Wine

Décrire les données

Réaliser une ACP

Choix du nombre d'axes

Création des graphiques

Nuage de points sur le premier plan factoriel

Cercle de corrélations

Et si on voulait regarder plus loin, sur les axes 3, 4 et 5 aussi

Pour les variables

Pour les individus

Données `Wine`