import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
Données sur des iris disponibles ici
# Load the tab-separated iris measurements and preview the first five rows.
iris = pandas.read_table(
    "https://fxjollois.github.io/donnees/Iris.txt",
    sep="\t",
)
iris.head(5)
Sepal Length | Sepal Width | Petal Length | Petal Width | Species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
Species
ACP uniquement sur des variables quantitatives
# Keep only the quantitative columns: the PCA cannot use the species label.
iris2 = iris.drop(columns="Species")
iris2.head(5)
Sepal Length | Sepal Width | Petal Length | Petal Width | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
Ici, `scale()` permet donc de normaliser (centrer et réduire) chaque variable.
# Fit a PCA that keeps four components on the standardised variables
# (scale() is applied first so that every variable is normalised).
pca = PCA(n_components=4)
pca.fit(X=scale(iris2))
PCA(n_components=4)
# Principal axes of the fitted PCA: one row per component, one column per
# original variable (the transpose is used later to tabulate them by variable).
pca.components_
array([[ 0.52106591, -0.26934744, 0.5804131 , 0.56485654], [ 0.37741762, 0.92329566, 0.02449161, 0.06694199], [-0.71956635, 0.24438178, 0.14212637, 0.63427274], [-0.26128628, 0.12350962, 0.80144925, -0.52359713]])
# Singular values of the fitted PCA, one per retained component.
pca.singular_values_
array([20.92306556, 11.7091661 , 4.69185798, 1.76273239])
# Per-variable means seen during fitting; they are ~0 here because the data
# given to fit() had already been standardised with scale().
pca.mean_
array([-4.73695157e-16, -7.81597009e-16, -4.26325641e-16, -4.73695157e-16])
# Show the eigenvalues, then the share of total variance carried by each
# component, one array per line.
print(pca.explained_variance_, pca.explained_variance_ratio_, sep="\n")
[2.93808505 0.9201649 0.14774182 0.02085386] [0.72962445 0.22850762 0.03668922 0.00517871]
# Summary table of the decomposition: one row per dimension with its
# eigenvalue, its rounded percentage of explained variance and the rounded
# cumulative percentage. Column order follows the insertion order of the dict.
eig = pandas.DataFrame({
    "Dimension": [
        f"Dim{rang}"
        for rang, _ in enumerate(pca.explained_variance_, start=1)
    ],
    "Valeur propre": pca.explained_variance_,
    "% variance expliquée": (pca.explained_variance_ratio_ * 100).round(),
    "% cum. var. expliquée": (pca.explained_variance_ratio_.cumsum() * 100).round(),
})
eig
Dimension | Valeur propre | % variance expliquée | % cum. var. expliquée | |
---|---|---|---|---|
0 | Dim1 | 2.938085 | 73.0 | 73.0 |
1 | Dim2 | 0.920165 | 23.0 | 96.0 |
2 | Dim3 | 0.147742 | 4.0 | 99.0 |
3 | Dim4 | 0.020854 | 1.0 | 100.0 |
# Scree plot: percentage of variance explained by each dimension.
plt.figure(figsize=(16, 6))
# A single bar colour is requested with `color`; passing `palette` without
# `hue` is deprecated in recent seaborn releases.
g_eig = seaborn.barplot(
    x="Dimension",
    y="% variance expliquée",
    color="lightseagreen",
    data=eig,
)
# Reference line for the dimension-selection threshold: the share each
# dimension would explain if all of them contributed equally, i.e.
# 100 / number of dimensions (25 % with four dimensions).
threshold = 100 / len(eig)
plt.axhline(y=threshold, linewidth=.5, color="dimgray", linestyle="--")
# Label placed just above the line, near the right edge of the last bar.
plt.text(len(eig) - 0.75, threshold + 1, f"{threshold:.0f}%")
g_eig.set(ylabel="Variance expliquée (%)")
g_eig.figure.suptitle("Variance expliquée par dimension")
plt.show()
# Line chart of the per-dimension and cumulative explained variance.
plt.figure(figsize=(16, 6))
# Reshape to long format (one row per dimension and measure) so that each
# measure becomes its own line through the `hue` grouping.
eig2 = pandas.melt(
    eig[["Dimension", "% variance expliquée", "% cum. var. expliquée"]],
    id_vars="Dimension",
)
g_eig2 = seaborn.lineplot(
    data=eig2,
    x="Dimension",
    y="value",
    hue="variable",
)
g_eig2.set(ylabel="Variance expliquée (%)")
g_eig2.figure.suptitle("Variance expliquée par dimension")
plt.show()
Récupération des dimensions avec les espèces pour la représentation
# Project the standardised measurements (first four columns of `iris`) onto
# the principal components, then keep the first two coordinates next to the
# species label for plotting.
iris_pca = pca.transform(scale(iris.iloc[:, :4]))
iris_pca_df = pandas.DataFrame(
    iris_pca[:, :2],
    columns=["Dim1", "Dim2"],
    index=iris.index,
)
iris_pca_df["Species"] = iris["Species"]
iris_pca_df.head(5)
Dim1 | Dim2 | Species | |
---|---|---|---|
0 | -2.264703 | 0.480027 | setosa |
1 | -2.080961 | -0.674134 | setosa |
2 | -2.364229 | -0.341908 | setosa |
3 | -2.299384 | -0.597395 | setosa |
4 | -2.389842 | 0.646835 | setosa |
# Scatter plot of the individuals on the first factorial plane
# (lmplot with the regression line switched off).
g_pca = seaborn.lmplot(
    data=iris_pca_df,
    x="Dim1",
    y="Dim2",
    fit_reg=False,
    height=4,
    aspect=3,
)
g_pca.set(xlabel="Dimension 1 (73%)", ylabel="Dimension 2 (23 %)")
g_pca.fig.suptitle("Premier plan factoriel")
plt.show()
Obligation de faire un calcul pour avoir les coordonnées des variables
# Principal axes tabulated by variable: one row per original variable,
# one column per principal component.
pandas.DataFrame(
    pca.components_.T,
    index=iris.columns[:4],
    columns=[f"PC{k}" for k in range(1, 5)],
)
PC1 | PC2 | PC3 | PC4 | |
---|---|---|---|---|
Sepal Length | 0.521066 | 0.377418 | -0.719566 | -0.261286 |
Sepal Width | -0.269347 | 0.923296 | 0.244382 | 0.123510 |
Petal Length | 0.580413 | 0.024492 | 0.142126 | 0.801449 |
Petal Width | 0.564857 | 0.066942 | 0.634273 | -0.523597 |
# Coordinates of the variables: each component's column of the transposed
# axes is multiplied by the square root of that component's eigenvalue
# (the 1-D array broadcasts across the rows).
coordvar = numpy.multiply(pca.components_.T, numpy.sqrt(pca.explained_variance_))
coordvar_df = pandas.DataFrame(
    coordvar,
    index=iris.columns[:4],
    columns=[f"PC{k}" for k in range(1, 5)],
)
coordvar_df
PC1 | PC2 | PC3 | PC4 | |
---|---|---|---|---|
Sepal Length | 0.893151 | 0.362039 | -0.276581 | -0.037732 |
Sepal Width | -0.461684 | 0.885673 | 0.093934 | 0.017836 |
Petal Length | 0.994877 | 0.023494 | 0.054629 | 0.115736 |
Petal Width | 0.968212 | 0.064214 | 0.243797 | -0.075612 |
# Correlation circle: every variable is drawn at its coordinates on the first
# two components, inside a unit circle centred on the origin.
fig, axes = plt.subplots(figsize=(10, 10))
fig.suptitle("Cercle des corrélations")
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)
# Light dashed lines marking the two axes through the origin.
axes.axvline(x=0, color="lightgray", linestyle="--", linewidth=1)
axes.axhline(y=0, color="lightgray", linestyle="--", linewidth=1)
# Iterate over (label, x, y) directly instead of indexing the label-indexed
# columns with an integer position: positional access through `Series[int]`
# on a non-integer index is deprecated in pandas.
for var_name, x, y in zip(coordvar_df.index, coordvar_df["PC1"], coordvar_df["PC2"]):
    axes.text(x, y, var_name, size=25)
    # Dashed segment from the origin to the variable's position.
    axes.plot([0, x], [0, y], color="gray", linestyle="dashed")
plt.gca().add_artist(plt.Circle((0, 0), 1, color="blue", fill=False))
plt.show()
Analyse conjointe des deux nuages (individus et variables)
# Joint view: the individuals on the first factorial plane, with the variable
# names overlaid at (a multiple of) their coordinates.
g_pca = seaborn.lmplot(
    x="Dim1",
    y="Dim2",
    data=iris_pca_df,
    fit_reg=False,
    height=4,
    aspect=3,
)
g_pca.set(xlabel="Dimension 1 (73%)", ylabel="Dimension 2 (23 %)")
g_pca.fig.suptitle("Premier plan factoriel")
axes = g_pca.axes[0, 0]
# The variable coordinates are multiplied by 3 before plotting, presumably so
# the labels spread out to the scale of the individuals' cloud — confirm.
# Values are read by iteration rather than `Series[int]`, since positional
# access on a label-indexed Series is deprecated in pandas.
for var_name, x, y in zip(coordvar_df.index, coordvar_df["PC1"], coordvar_df["PC2"]):
    axes.text(3 * x, 3 * y, var_name, size=25)
# Dashed reference lines through the mean of each plotted dimension.
plt.axvline(x=iris_pca_df.Dim1.mean(), linewidth=.5, color="dimgray", linestyle="--")
plt.axhline(y=iris_pca_df.Dim2.mean(), linewidth=.5, color="dimgray", linestyle="--")
plt.show()
Gros intérêt de l'ACP : représenter une variable qualitative sur le plan factoriel
# Same factorial plane, with the points coloured by species.
g_pca = seaborn.lmplot(
    data=iris_pca_df,
    x="Dim1",
    y="Dim2",
    hue="Species",
    fit_reg=False,
    height=4,
    aspect=3,
)
g_pca.set(xlabel="Dimension 1 (73%)", ylabel="Dimension 2 (23 %)")
g_pca.fig.suptitle("Premier plan factoriel")
plt.show()
# One panel per species (column facets), each coloured by species.
g_pca2 = seaborn.lmplot(
    x="Dim1",
    y="Dim2",
    hue="Species",
    col="Species",
    data=iris_pca_df,
    fit_reg=False,
    height=4,
    aspect=1.1,
)
g_pca2.set(xlabel="Dimension 1 (73%)", ylabel="Dimension 2 (23 %)")
# The title belongs on this faceted grid; the original set it on `g_pca`,
# the figure of the previous plot, leaving this one untitled.
g_pca2.fig.suptitle("Premier plan factoriel")
plt.show()
Nous allons travailler sur des données concernant 3 types de vin. Elles sont disponibles sur cette page de l'UCI MLR. Il s'agit de 178 vins, répartis donc en 3 classes, et décrits par 13 variables quantitatives (lire la description dans le fichier `wine.names` pour plus d'informations).
Le code suivant permet de charger les données, et de nommer correctement les variables.
# Download the wine data. The file is read without a header row, so the
# column names are supplied while reading.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"
wine = pandas.read_csv(
    url,
    sep=",",
    header=None,
    names=[
        "class",
        "Alcohol",
        "Malic acid",
        "Ash",
        "Alcalinity of ash",
        "Magnesium",
        "Total phenols",
        "Flavanoids",
        "Nonflavanoid phenols",
        "Proanthocyanins",
        "Color intensity",
        "Hue",
        "OD280/OD315 of diluted wines",
        "Proline",
    ],
)
wine
class | Alcohol | Malic acid | Ash | Alcalinity of ash | Magnesium | Total phenols | Flavanoids | Nonflavanoid phenols | Proanthocyanins | Color intensity | Hue | OD280/OD315 of diluted wines | Proline | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 |
1 | 1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 |
2 | 1 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 |
3 | 1 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 |
4 | 1 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
173 | 3 | 13.71 | 5.65 | 2.45 | 20.5 | 95 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740 |
174 | 3 | 13.40 | 3.91 | 2.48 | 23.0 | 102 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750 |
175 | 3 | 13.27 | 4.28 | 2.26 | 20.0 | 120 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835 |
176 | 3 | 13.17 | 2.59 | 2.37 | 20.0 | 120 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840 |
177 | 3 | 14.13 | 4.10 | 2.74 | 24.5 | 96 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560 |
178 rows × 14 columns
Vous devez donc réaliser les étapes suivantes :