import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale


# Download the Wine data set. The raw file has no header row, so the column
# labels are supplied here: the class label first, then the measured variables.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine = pandas.read_csv(
    url,
    sep=",",
    header=None,
    names=[
        "class",
        "Alcohol",
        "Malic acid",
        "Ash",
        "Alcalinity of ash",
        "Magnesium",
        "Total phenols",
        "Flavanoids",
        "Nonflavanoid phenols",
        "Proanthocyanins",
        "Color intensity",
        "Hue",
        "OD280/OD315 of diluted wines",
        "Proline",
    ],
)
# Bare expression: displays the table when run as a notebook cell.
wine


# Bar chart of the number of rows for each value of the "class" column.
seaborn.countplot(data=wine, x="class")
plt.show()


# One horizontal box plot per variable. The table is melted to long format
# ("variable" / "value" columns, keeping "class" as identifier) so that
# catplot can draw one facet per variable, two facets per row, each with its
# own x scale.
g = seaborn.catplot(
    data=wine.melt(id_vars="class"),
    x="value",
    col="variable",
    col_wrap=2,
    sharex=False,
    kind="box",
    height=2,
    aspect=5,
)
# Facet title = variable name only.
g.set_titles(col_template="{col_name}", fontweight="bold", size=24)
g.set_xticklabels(size=18)
g.tight_layout()
# Drop the x-axis label; the facet title already names the variable.
g.set_axis_labels(x_var="")
plt.show()


# Pairwise correlation matrix of all columns except the class label, rounded
# to one decimal (bare expression: displayed when run as a notebook cell).
wine.drop(columns=["class"]).corr().round(decimals=1)


# Standardise every variable (class label excluded) before the PCA so that
# variables measured on large scales do not dominate the components.
wine_bis = scale(wine.drop(columns="class"))

# Keep as many components as there are variables; the count is read from the
# data instead of being hard-coded. The stray `PCA(n_components=13)` lines that
# followed were pasted cell output (the estimator's repr) and only built
# throw-away, unfitted estimators, so they are removed.
pca = PCA(n_components=wine_bis.shape[1])
pca.fit(wine_bis)


# Summary table of the fitted PCA, one row per component: its label, its
# eigenvalue, the share of variance it explains and the cumulative share
# (both in percent, rounded to the nearest integer).
# The labels are generated from the number of eigenvalues actually returned
# by the model, so the table stays consistent (no length-mismatch error) if
# the number of components kept by the PCA changes.
eig = pandas.DataFrame(
    {
        "Dimension": [
            "Dim" + str(k)
            for k in range(1, len(pca.explained_variance_) + 1)
        ],
        "Valeur propre": pca.explained_variance_,
        "% variance expliquée": numpy.round(
            pca.explained_variance_ratio_ * 100
        ),
        "% cum. var. expliquée": numpy.round(
            numpy.cumsum(pca.explained_variance_ratio_) * 100
        ),
    },
    columns=[
        "Dimension",
        "Valeur propre",
        "% variance expliquée",
        "% cum. var. expliquée",
    ],
)
# Bare expression: displays the table when run as a notebook cell.
eig


# Scree plot: percentage of variance explained by each dimension, with a
# dashed reference line at 100/13 % (an equal share for each of 13 dimensions).
plt.figure(figsize=(16, 6))
g_eig = seaborn.barplot(
    data=eig,
    x="Dimension",
    y="% variance expliquée",
    color="lightseagreen",
)
# Label placed just above the reference line, over the last bar.
g_eig.text(12, 100 / 13 + 1, "Seuil")
g_eig.axhline(y=100 / 13, linewidth=.5, color="dimgray", linestyle="--")
g_eig.set_ylabel("Variance expliquée (%)")
plt.suptitle("Variance expliquée par dimension")

plt.show()


# Coordinates of every observation on the 13 principal components
# (columns Dim1..Dim13), with the class label attached as a "Class" column.
wine_pca_df = pandas.DataFrame(
    pca.transform(wine_bis),
    columns=[f"Dim{k}" for k in range(1, 14)],
)
wine_pca_df["Class"] = wine["class"]


# Scatter plot of the observations on the first two principal components,
# coloured by class, with dashed light-gray lines marking both zero axes.
fig, axes = plt.subplots(figsize=(16, 8))
fig.suptitle("Premier plan factoriel")
axes.axvline(x=0, linewidth=1, linestyle='--', color='lightgray')
axes.axhline(y=0, linewidth=1, linestyle='--', color='lightgray')

g_pca = seaborn.scatterplot(
    data=wine_pca_df,
    x="Dim1",
    y="Dim2",
    hue="Class",
    palette="Set1",
    s=100,
    ax=axes,
)
g_pca.set_xlabel("Dimension 1")
g_pca.set_ylabel("Dimension 2")

plt.show()


# Correlation between every original variable (rows) and every principal
# component (columns), used as coordinates on the correlation circle.
#
# corr(X_j, PC_k) = loading_jk * sqrt(eigenvalue_k) / std(X_j).
# scikit-learn's PCA estimates the eigenvalues with an (n - 1) denominator,
# whereas `scale` standardises with the population standard deviation
# (denominator n). The columns of `wine_bis` therefore do not have a sample
# standard deviation of exactly 1, and omitting the division overstates every
# correlation by a factor sqrt(n / (n - 1)). Dividing by the sample standard
# deviation (ddof=1) of each column gives the exact correlations.
coordvar = (
    pca.components_.T
    * numpy.sqrt(pca.explained_variance_)
    / numpy.std(wine_bis, axis=0, ddof=1)[:, numpy.newaxis]
)
# One "PCk" column per component actually present in the array, indexed by
# the variable names (every column of `wine` except the leading class label).
coordvar_df = pandas.DataFrame(
    coordvar,
    columns=["PC" + str(k + 1) for k in range(coordvar.shape[1])],
    index=wine.columns[1:],
)


# Correlation circle: every variable is drawn as a labelled dashed segment
# from the origin to its (PC1, PC2) coordinates, inside the unit circle.
fig, axes = plt.subplots(figsize=(10, 10))
fig.suptitle("Cercle des corrélations")
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)
axes.axvline(x=0, linewidth=1, linestyle='--', color='lightgray')
axes.axhline(y=0, linewidth=1, linestyle='--', color='lightgray')
for var_name, pc1, pc2 in zip(coordvar_df.index,
                              coordvar_df["PC1"],
                              coordvar_df["PC2"]):
    # Variable name at the tip of the segment, then the segment itself.
    axes.text(pc1, pc2, var_name, size=25)
    axes.plot([0, pc1], [0, pc2], color="gray", linestyle='dashed')
# Unit circle as the visual reference for the segments.
axes.add_artist(plt.Circle((0, 0), 1, color='blue', fill=False))

plt.show()

	class	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
0	1	14.23	1.71	2.43	15.6	127	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065
1	1	13.20	1.78	2.14	11.2	100	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050
2	1	13.16	2.36	2.67	18.6	101	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185
3	1	14.37	1.95	2.50	16.8	113	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480
4	1	13.24	2.59	2.87	21.0	118	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
173	3	13.71	5.65	2.45	20.5	95	1.68	0.61	0.52	1.06	7.70	0.64	1.74	740
174	3	13.40	3.91	2.48	23.0	102	1.80	0.75	0.43	1.41	7.30	0.70	1.56	750
175	3	13.27	4.28	2.26	20.0	120	1.59	0.69	0.43	1.35	10.20	0.59	1.56	835
176	3	13.17	2.59	2.37	20.0	120	1.65	0.68	0.53	1.46	9.30	0.60	1.62	840
177	3	14.13	4.10	2.74	24.5	96	2.05	0.76	0.56	1.35	9.20	0.61	1.60	560

	Alcohol	Malic acid	Ash	Alcalinity of ash	Magnesium	Total phenols	Flavanoids	Nonflavanoid phenols	Proanthocyanins	Color intensity	Hue	OD280/OD315 of diluted wines	Proline
Alcohol	1.0	0.1	0.2	-0.3	0.3	0.3	0.2	-0.2	0.1	0.5	-0.1	0.1	0.6
Malic acid	0.1	1.0	0.2	0.3	-0.1	-0.3	-0.4	0.3	-0.2	0.2	-0.6	-0.4	-0.2
Ash	0.2	0.2	1.0	0.4	0.3	0.1	0.1	0.2	0.0	0.3	-0.1	0.0	0.2
Alcalinity of ash	-0.3	0.3	0.4	1.0	-0.1	-0.3	-0.4	0.4	-0.2	0.0	-0.3	-0.3	-0.4
Magnesium	0.3	-0.1	0.3	-0.1	1.0	0.2	0.2	-0.3	0.2	0.2	0.1	0.1	0.4
Total phenols	0.3	-0.3	0.1	-0.3	0.2	1.0	0.9	-0.4	0.6	-0.1	0.4	0.7	0.5
Flavanoids	0.2	-0.4	0.1	-0.4	0.2	0.9	1.0	-0.5	0.7	-0.2	0.5	0.8	0.5
Nonflavanoid phenols	-0.2	0.3	0.2	0.4	-0.3	-0.4	-0.5	1.0	-0.4	0.1	-0.3	-0.5	-0.3
Proanthocyanins	0.1	-0.2	0.0	-0.2	0.2	0.6	0.7	-0.4	1.0	-0.0	0.3	0.5	0.3
Color intensity	0.5	0.2	0.3	0.0	0.2	-0.1	-0.2	0.1	-0.0	1.0	-0.5	-0.4	0.3
Hue	-0.1	-0.6	-0.1	-0.3	0.1	0.4	0.5	-0.3	0.3	-0.5	1.0	0.6	0.2
OD280/OD315 of diluted wines	0.1	-0.4	0.0	-0.3	0.1	0.7	0.8	-0.5	0.5	-0.4	0.6	1.0	0.3
Proline	0.6	-0.2	0.2	-0.4	0.4	0.5	0.5	-0.3	0.3	0.3	0.2	0.3	1.0

	Dimension	Valeur propre	% variance expliquée	% cum. var. expliquée
0	Dim1	4.732437	36.0	36.0
1	Dim2	2.511081	19.0	55.0
2	Dim3	1.454242	11.0	67.0
3	Dim4	0.924166	7.0	74.0
4	Dim5	0.858049	7.0	80.0
5	Dim6	0.645282	5.0	85.0
6	Dim7	0.554141	4.0	89.0
7	Dim8	0.350466	3.0	92.0
8	Dim9	0.290512	2.0	94.0
9	Dim10	0.252320	2.0	96.0
10	Dim11	0.227064	2.0	98.0
11	Dim12	0.169724	1.0	99.0
12	Dim13	0.103962	1.0	100.0

Séance 3 - correction¶

Données `Wine`¶

Décrire les données¶

Réaliser une ACP¶

Choix du nombre d'axes¶

Création des graphiques¶

Nuage de points sur le premier plan factoriel¶

Cercle de corrélations¶

Séance 3 - correction¶

Données Wine¶

Décrire les données¶

Réaliser une ACP¶

Choix du nombre d'axes¶

Création des graphiques¶

Nuage de points sur le premier plan factoriel¶

Cercle de corrélations¶

Données `Wine`¶