import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
Données sur des iris, disponibles à l'adresse suivante : https://fxjollois.github.io/donnees/Iris.txt
# Iris data set: tab-separated file fetched from the course web site.
iris_url = "https://fxjollois.github.io/donnees/Iris.txt"
iris = pandas.read_table(iris_url, sep="\t")
iris.head()
Sepal Length | Sepal Width | Petal Length | Petal Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
Species
ACP uniquement sur des variables quantitatives
# PCA only handles quantitative variables: drop the categorical
# "Species" column, keeping the four measurements.
iris2 = iris.drop(columns="Species")
iris2.head()
Sepal Length | Sepal Width | Petal Length | Petal Width | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
Ici, `scale()` permet donc de normaliser (centrer et réduire) chaque variable.
# Standardize each variable (center + reduce), then fit a PCA keeping
# all 4 components.
iris2_std = scale(iris2)
pca = PCA(n_components=4)
pca.fit(iris2_std)
PCA(n_components=4)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
PCA(n_components=4)
pca.components_
array([[ 0.52106591, -0.26934744, 0.5804131 , 0.56485654], [ 0.37741762, 0.92329566, 0.02449161, 0.06694199], [-0.71956635, 0.24438178, 0.14212637, 0.63427274], [-0.26128628, 0.12350962, 0.80144925, -0.52359713]])
pca.singular_values_
array([20.92306556, 11.7091661 , 4.69185798, 1.76273239])
pca.mean_
array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])
# Eigenvalues (variance carried by each component) and the proportion of
# total variance each one explains.
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)
[2.93808505 0.9201649 0.14774182 0.02085386] [0.72962445 0.22850762 0.03668922 0.00517871]
# Eigenvalue summary table: one row per principal dimension with its
# eigenvalue, the % of variance explained and the cumulative %.
n_dim = len(pca.explained_variance_)
pct_var = numpy.round(pca.explained_variance_ratio_ * 100)
pct_cum = numpy.round(numpy.cumsum(pca.explained_variance_ratio_) * 100)
eig = pandas.DataFrame(
    {
        "Dimension": ["Dim" + str(d) for d in range(1, n_dim + 1)],
        "Valeur propre": pca.explained_variance_,
        "% variance expliquée": pct_var,
        "% cum. var. expliquée": pct_cum,
    },
    columns=["Dimension", "Valeur propre",
             "% variance expliquée", "% cum. var. expliquée"],
)
eig
Dimension | Valeur propre | % variance expliquée | % cum. var. expliquée | |
---|---|---|---|---|
0 | Dim1 | 2.938085 | 73.0 | 73.0 |
1 | Dim2 | 0.920165 | 23.0 | 96.0 |
2 | Dim3 | 0.147742 | 4.0 | 99.0 |
3 | Dim4 | 0.020854 | 1.0 | 100.0 |
# Bar chart of the variance explained per dimension, with a reference
# line at 100 / (number of dimensions) = 25 %.
plt.figure(figsize=(16, 6))
g_eig = seaborn.barplot(x="Dimension", y="% variance expliquée",
                        color="lightseagreen", data=eig)
# threshold line used when choosing how many dimensions to keep
plt.axhline(y=25, linewidth=.5, color="dimgray", linestyle="--")  # 25 = 100 / 4 (nb dimensions)
plt.text(3.25, 26, "25%")
g_eig.set(ylabel="Variance expliquée (%)")
g_eig.figure.suptitle("Variance expliquée par dimension")
plt.show()
# Line chart comparing, per dimension, the explained variance and its
# cumulative sum (reshaped to long format with melt()).
plt.figure(figsize=(16, 6))
kept_cols = ["Dimension", "% variance expliquée", "% cum. var. expliquée"]
eig2 = eig.filter(kept_cols).melt(id_vars="Dimension")
g_eig2 = seaborn.lineplot(x="Dimension", y="value", hue="variable", data=eig2)
g_eig2.set(ylabel="Variance expliquée (%)")
g_eig2.figure.suptitle("Variance expliquée par dimension")
plt.show()
Récupération des dimensions avec les espèces pour la représentation
# Individual coordinates on the principal components, joined with the
# species label for the factorial-plane plots below.
iris_pca = pca.transform(scale(iris[iris.columns[:4]]))
iris_pca_df = pandas.DataFrame({
    "Dim1": iris_pca[:, 0],
    "Dim2": iris_pca[:, 1],
    "Species": iris["Species"],
})
iris_pca_df.head()
Dim1 | Dim2 | Species | |
---|---|---|---|
0 | -2.264703 | 0.480027 | setosa |
1 | -2.080961 | -0.674134 | setosa |
2 | -2.364229 | -0.341908 | setosa |
3 | -2.299384 | -0.597395 | setosa |
4 | -2.389842 | 0.646835 | setosa |
# Scatter plot of the individuals on the first factorial plane.
g_pca = seaborn.lmplot(x="Dim1", y="Dim2", data=iris_pca_df,
                       fit_reg=False, height=4, aspect=3)
g_pca.set(xlabel="Dimension 1 (73%)", ylabel="Dimension 2 (23 %)")
g_pca.fig.suptitle("Premier plan factoriel")
plt.show()
Obligation de faire un calcul pour avoir les coordonnées des variables
pandas.DataFrame(pca.components_.T, columns=['PC'+str(i) for i in range(1, 5)], index=iris.columns[:4])
PC1 | PC2 | PC3 | PC4 | |
---|---|---|---|---|
Sepal Length | 0.521066 | 0.377418 | -0.719566 | -0.261286 |
Sepal Width | -0.269347 | 0.923296 | 0.244382 | 0.123510 |
Petal Length | 0.580413 | 0.024492 | 0.142126 | 0.801449 |
Petal Width | 0.564857 | 0.066942 | 0.634273 | -0.523597 |
# Variable coordinates (correlations with the components): each loading
# scaled by the standard deviation of its component.
coordvar = pca.components_.T * numpy.sqrt(pca.explained_variance_)
coordvar_df = pandas.DataFrame(coordvar,
                               columns=["PC" + str(i) for i in range(1, 5)],
                               index=iris.columns[:4])
coordvar_df
PC1 | PC2 | PC3 | PC4 | |
---|---|---|---|---|
Sepal Length | 0.893151 | 0.362039 | -0.276581 | -0.037732 |
Sepal Width | -0.461684 | 0.885673 | 0.093934 | 0.017836 |
Petal Length | 0.994877 | 0.023494 | 0.054629 | 0.115736 |
Petal Width | 0.968212 | 0.064214 | 0.243797 | -0.075612 |
# Correlation circle: each variable drawn as a dashed segment from the
# origin to its (PC1, PC2) coordinates, inside the unit circle.
fig, axes = plt.subplots(figsize=(10, 10))
fig.suptitle("Cercle des corrélations")
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)
axes.axvline(x=0, color='lightgray', linestyle='--', linewidth=1)
axes.axhline(y=0, color='lightgray', linestyle='--', linewidth=1)
for var_name, row in coordvar_df.iterrows():
    axes.text(row["PC1"], row["PC2"], var_name, size=25)
    axes.plot([0, row["PC1"]], [0, row["PC2"]],
              color="gray", linestyle='dashed')
plt.gca().add_artist(plt.Circle((0, 0), 1, color='blue', fill=False))
plt.show()
Analyse conjointe des deux nuages (individus et variables)
# Joint view of both clouds: individuals on the first plane with the
# variable names overlaid (variable coordinates stretched by 3 for
# readability).
g_pca = seaborn.lmplot(x="Dim1", y="Dim2", data=iris_pca_df,
                       fit_reg=False, height=6, aspect=2)
g_pca.set(xlabel="Dimension 1 (73%)", ylabel="Dimension 2 (23 %)")
g_pca.fig.suptitle("Premier plan factoriel")
axes = g_pca.axes[0, 0]
for var_name, row in coordvar_df.iterrows():
    axes.text(3 * row["PC1"], 3 * row["PC2"], var_name, size=25)
plt.axvline(x=iris_pca_df.Dim1.mean(), linewidth=.5, color="dimgray", linestyle="--")
plt.axhline(y=iris_pca_df.Dim2.mean(), linewidth=.5, color="dimgray", linestyle="--")
plt.show()
Gros intérêt de l'ACP : représenter une variable qualitative sur le plan factoriel
# Same factorial plane, with the individuals colored by species.
g_pca = seaborn.lmplot(x="Dim1", y="Dim2", hue="Species",
                       data=iris_pca_df, fit_reg=False,
                       height=4, aspect=3)
g_pca.set(xlabel="Dimension 1 (73%)", ylabel="Dimension 2 (23 %)")
g_pca.fig.suptitle("Premier plan factoriel")
plt.show()
# One facet per species on the first factorial plane.
g_pca2 = seaborn.lmplot(x = "Dim1", y = "Dim2", hue = "Species", col = "Species",
                        data = iris_pca_df, fit_reg = False,
                        height = 4, aspect = 1.1)
g_pca2.set(xlabel = "Dimension 1 (73%)", ylabel = "Dimension 2 (23 %)")
# Bug fix: the title was set on g_pca (the previous figure) instead of
# this figure, so the faceted plot was left untitled.
g_pca2.fig.suptitle("Premier plan factoriel")
plt.show()
Nous allons travailler sur des données concernant 3 types de vin. Elles sont disponibles sur cette page de l'UCI MLR. Il s'agit de 178 vins, répartis en 3 classes donc, et décrits par 13 variables quantitatives (lire la description dans le fichier wine.names
pour plus d'informations).
Le code suivant permet de charger les données, et de nommer correctement les variables.
# Wine data (UCI ML repository): 178 wines in 3 classes described by 13
# quantitative variables. The raw file has no header row, so the columns
# are named explicitly after loading.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine = pandas.read_csv(url, header=None, sep=",")
wine.columns = ["class", "Alcohol", "Malic acid", "Ash", "Alcalinity of ash",
                "Magnesium", "Total phenols", "Flavanoids",
                "Nonflavanoid phenols", "Proanthocyanins", "Color intensity",
                "Hue", "OD280/OD315 of diluted wines", "Proline"]
wine
class | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173 | 3 | 13.71 | 5.65 | 2.45 | 20.5 | 95 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740 |
174 | 3 | 13.40 | 3.91 | 2.48 | 23.0 | 102 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750 |
175 | 3 | 13.27 | 4.28 | 2.26 | 20.0 | 120 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835 |
176 | 3 | 13.17 | 2.59 | 2.37 | 20.0 | 120 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840 |
177 | 3 | 14.13 | 4.10 | 2.74 | 24.5 | 96 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560 |
178 rows × 14 columns
Vous devez donc réaliser les étapes suivantes :
La banque mondiale fournit un grand nombre de données, dont des indicateurs de gouvernance au niveau mondial (voir ici). Le code ci-dessous importe les données 2019 présentes dans le fichier WGI_Data.csv (que vous pouvez donc télécharger). Les informations concernant la définition des indicateurs sont les suivantes :
- CC : Control of Corruption
- GE : Government Effectiveness
- PV : Political Stability and Absence of Violence/Terrorism
- RQ : Regulatory Quality
- RL : Rule of Law
- VA : Voice and Accountability

wgi = pandas.read_csv("https://fxjollois.github.io/donnees/WGI/wgi2019.csv")
wgi
Country | Code | Voice and Accountability | Political Stability and Absence of Violence/Terrorism | Government Effectiveness | Regulatory Quality | Rule of Law | Control of Corruption | |
---|---|---|---|---|---|---|---|---|
0 | Aruba | ABW | 1.294189 | 1.357372 | 1.029933 | 0.857360 | 1.263128 | 1.217238 |
1 | Andorra | ADO | 1.139154 | 1.615139 | 1.908749 | 1.228176 | 1.579939 | 1.234392 |
2 | Afghanistan | AFG | -0.988032 | -2.649407 | -1.463875 | -1.120555 | -1.713527 | -1.401076 |
3 | Angola | AGO | -0.777283 | -0.311101 | -1.117144 | -0.893871 | -1.054343 | -1.054683 |
4 | Anguilla | AIA | NaN | 1.367357 | 0.815824 | 0.846231 | 0.355737 | 1.234392 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
209 | Serbia | SRB | 0.026626 | -0.091665 | 0.019079 | 0.113867 | -0.119070 | -0.445551 |
210 | South Africa | ZAF | 0.670388 | -0.217931 | 0.367380 | 0.156172 | -0.076408 | 0.084924 |
211 | Congo, Dem. Rep. | ZAR | -1.365966 | -1.808007 | -1.627429 | -1.509667 | -1.786088 | -1.538931 |
212 | Zambia | ZMB | -0.286199 | -0.102216 | -0.675215 | -0.554269 | -0.462069 | -0.640345 |
213 | Zimbabwe | ZWE | -1.141875 | -0.920179 | -1.205337 | -1.463199 | -1.257009 | -1.238796 |
214 rows × 8 columns
Vous devez donc réaliser les étapes suivantes :