import numpy
import pandas
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white") # replaces seaborn's default plot style with the "white" one

%matplotlib inline


# Load the WGI table (file wgi2019.csv), then discard every row with a
# missing value: dropna() removes the countries for which some information
# is not available.
WGI_complet = pandas.read_csv(
    "https://fxjollois.github.io/donnees/WGI/wgi2019.csv"
)
WGI_complet = WGI_complet.dropna()
WGI_complet


# Correlation matrix of the indicators.
# WGI_complet still contains the text columns "Country" and "Code"; recent
# pandas versions no longer drop non-numeric columns silently in corr() and
# raise instead, so the numeric columns are selected explicitly first.
WGI_complet.select_dtypes("number").corr()


# Heatmap of the correlation matrix, with the coefficients printed in the cells.
# The text columns ("Country", "Code") are excluded explicitly: recent pandas
# versions raise in corr() when non-numeric columns are present.
seaborn.heatmap(WGI_complet.select_dtypes("number").corr(), annot = True, cmap = "Blues")

<AxesSubplot:>


from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# Keep only the numeric indicators: the text columns "Country" and "Code"
# are removed before fitting.
# NOTE(review): `scale` is imported but never applied in the visible code, so
# the PCA is fitted on the values as loaded — confirm this is intended.
WGI_num = WGI_complet.drop(["Country", "Code"], axis = "columns")

# Default PCA() keeps all components.
pca = PCA()
pca.fit(WGI_num)

PCA()


# Print the variance carried by each component, then the same quantity
# expressed as a fraction of the total (one array per line).
print(pca.explained_variance_, pca.explained_variance_ratio_, sep = "\n")

[5.04959218 0.41467898 0.30759716 0.12165034 0.04965011 0.04069217]
[0.84386857 0.06929957 0.05140446 0.02032974 0.00829734 0.00680032]


# Summary table of the components: label ("Dim1", "Dim2", ...), variance,
# share of variance and cumulative share (both in %, rounded to integers).
eig = pandas.DataFrame(
    {"Dimension" : ["Dim{}".format(rang) for rang in range(1, WGI_num.shape[1] + 1)]}
)
eig["Variance expliquée"] = pca.explained_variance_
eig["% variance expliquée"] = (pca.explained_variance_ratio_ * 100).round()
eig["% cum. var. expliquée"] = (pca.explained_variance_ratio_.cumsum() * 100).round()
eig


# Scree plot: share of variance per dimension, with a dashed reference line
# at 100 / (number of variables) and its rounded value written at the right.
plt.figure(figsize = (15, 5)) # sets the size of the figure
seaborn.pointplot(x = "Dimension", y = "% variance expliquée", data = eig)
seuil = 100 / len(WGI_num.columns)
plt.axhline(y = seuil, linestyle = "--", color = "dimgray", linewidth = .5)
plt.text(len(WGI_num.columns) - 1, seuil + 1, "{}%".format(round(seuil)))
plt.show() # this call suppresses the useless text output printed before the figure


# Coordinates of every country on the principal components.
WGI_pca = pca.transform(WGI_num)


# Wrap the first two coordinates in a pandas DataFrame. The index of
# WGI_complet is reused so that each row lines up with its country name.
WGI_pca_df = pandas.DataFrame(
    WGI_pca[:, :2],
    columns = ["Dim1", "Dim2"],
    index = WGI_complet.index
)
WGI_pca_df["Country"] = WGI_complet["Country"]

# Preview (first rows)
WGI_pca_df.head()


# First factorial plane: scatter of the countries on the first two components.
# The percentages shown in the labels are computed from the fitted PCA so that
# they always agree with the variance table; hard-coded values can (and did)
# drift from the computed shares.
part_dim1, part_dim2 = pca.explained_variance_ratio_[:2] * 100

plt.figure(figsize = (15, 8))
plt.axvline(x = 0, linewidth = .5, color = "dimgray", linestyle = "--")
plt.axhline(y = 0, linewidth = .5, color = "dimgray", linestyle = "--")
# fit_reg = False: only the points are drawn, no regression line
seaborn.regplot(data = WGI_pca_df, x = "Dim1", y = "Dim2", fit_reg = False)
plt.xlabel("Dimension 1 ({:.0f}%)".format(part_dim1)) # X axis label
plt.ylabel("Dimension 2 ({:.0f}%)".format(part_dim2)) # Y axis label
plt.suptitle("Premier plan factoriel ({:.0f}%)".format(part_dim1 + part_dim2)) # overall title
plt.show()


# First factorial plane with the most extreme countries labelled.
# The percentages shown in the labels are computed from the fitted PCA so that
# they always agree with the variance table instead of being hard-coded.
part_dim1, part_dim2 = pca.explained_variance_ratio_[:2] * 100

# subplots() is used because annotating the plot requires the Axes object
fig, ax = plt.subplots(figsize=(15,8))
ax.axvline(x = 0, linewidth = .5, color = "dimgray", linestyle = "--")
ax.axhline(y = 0, linewidth = .5, color = "dimgray", linestyle = "--")

# passing ax puts the points and the text on the same plot
seaborn.regplot(data = WGI_pca_df, x = "Dim1", y = "Dim2", fit_reg = False, ax = ax)

# loop over the countries
for _, pays in WGI_pca_df.iterrows():
    # annotate only when the absolute coordinate is large on one of the two
    # dimensions (thresholds chosen empirically)
    if abs(pays["Dim1"]) > 3.5 or abs(pays["Dim2"]) > 1.5:
        ax.annotate(pays["Country"], (pays["Dim1"], pays["Dim2"]), fontsize = 9, ha = "center")

ax.set_xlabel("Dimension 1 ({:.0f}%)".format(part_dim1))
ax.set_ylabel("Dimension 2 ({:.0f}%)".format(part_dim2))
fig.suptitle("Premier plan factoriel ({:.0f}%)".format(part_dim1 + part_dim2))
plt.show()


# Display the principal axes of the fitted PCA. Per the scikit-learn
# documentation the array is laid out as [component, variable]: row k holds
# the coefficients of component k over the columns of WGI_num.
pca.components_

array([[-0.37750239, -0.37087719, -0.42378854, -0.41732608, -0.43430217,
        -0.42138568],
       [ 0.48844396,  0.65583402, -0.37286018, -0.40542468, -0.13257594,
        -0.10165618],
       [-0.76537562,  0.60350415,  0.16241413, -0.12927521,  0.06087085,
         0.05645548],
       [-0.05176049, -0.23361868, -0.14986293, -0.55342169,  0.18333703,
         0.76183741],
       [-0.13050531, -0.02085703, -0.61015461,  0.17529628,  0.73828359,
        -0.18561589],
       [ 0.11577491, -0.11446349,  0.51014965, -0.5547552 ,  0.45980959,
        -0.44052629]])


# Correlation circle: each variable is placed at its correlation with the
# first two principal components.
#
# pca.components_ holds the eigenvectors, not correlations. The correlation
# between variable j and component k is
#     components_[k, j] * sqrt(explained_variance_[k]) / std(variable j)
# where std uses ddof = 1, matching the n - 1 variance estimate documented
# for scikit-learn's explained_variance_.
coord_var = (
    pca.components_[:2].T
    * numpy.sqrt(pca.explained_variance_[:2])
    / WGI_num.std(ddof = 1).values[:, numpy.newaxis]
)

# Empty figure (axes limited to [-1, 1], plus the title)
fig, axes = plt.subplots(figsize = (10, 10))
fig.suptitle("Cercle des corrélations")
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)

# Horizontal and vertical axes through the origin
axes.axvline(x = 0, color = 'lightgray', linestyle = '--', linewidth = 1)
axes.axhline(y = 0, color = 'lightgray', linestyle = '--', linewidth = 1)

# One arrow and one label per variable
for nom, (x, y) in zip(WGI_num.columns, coord_var):
    axes.arrow(0, 0, x, y, width = .001, color = "darkblue", alpha = .25)
    # put the label on the outer side of the arrow tip
    axes.text(x, y, nom, ha = "center", va = "bottom" if y > 0 else "top")

# Unit circle
axes.add_artist(plt.Circle((0,0), 1, color = 'gray', fill = False))

plt.show()


# Read the HadCRUT4 file without a header row; only column 0 of the result
# is used, each entry being one raw text line.
had = pandas.read_csv("https://crudata.uea.ac.uk/cru/data/temperature/HadCRUT4-gl.dat", header=None)

# Keep every other line starting from the first one, split it on spaces,
# ignore the empty fragments produced by repeated spaces and convert the rest
# to floats.
# NOTE(review): presumably the skipped lines hold other information — verify
# against the file format.
donnees = pandas.DataFrame(
    [
        [float(champ) for champ in ligne.split(" ") if champ != ""]
        for ligne in had[0][::2]
    ],
    columns = "Year Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec Annual".split()
)
donnees.tail()

	Country	Code	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law	Control of Corruption
0	Aruba	ABW	1.294189	1.357372	1.029933	0.857360	1.263128	1.217238
1	Andorra	ADO	1.139154	1.615139	1.908749	1.228176	1.579939	1.234392
2	Afghanistan	AFG	-0.988032	-2.649407	-1.463875	-1.120555	-1.713527	-1.401076
3	Angola	AGO	-0.777283	-0.311101	-1.117144	-0.893871	-1.054343	-1.054683
5	Albania	ALB	0.151805	0.118570	-0.061331	0.274380	-0.411179	-0.528758
...	...	...	...	...	...	...	...	...
209	Serbia	SRB	0.026626	-0.091665	0.019079	0.113867	-0.119070	-0.445551
210	South Africa	ZAF	0.670388	-0.217931	0.367380	0.156172	-0.076408	0.084924
211	Congo, Dem. Rep.	ZAR	-1.365966	-1.808007	-1.627429	-1.509667	-1.786088	-1.538931
212	Zambia	ZMB	-0.286199	-0.102216	-0.675215	-0.554269	-0.462069	-0.640345
213	Zimbabwe	ZWE	-1.141875	-0.920179	-1.205337	-1.463199	-1.257009	-1.238796

	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law	Control of Corruption
Voice and Accountability	1.000000	0.701400	0.699666	0.741906	0.782504	0.770459
Political Stability and Absence of Violence/Terrorism	0.701400	1.000000	0.726165	0.666708	0.783312	0.762406
Government Effectiveness	0.699666	0.726165	1.000000	0.938556	0.934389	0.909035
Regulatory Quality	0.741906	0.666708	0.938556	1.000000	0.916644	0.865925
Rule of Law	0.782504	0.783312	0.934389	0.916644	1.000000	0.940869
Control of Corruption	0.770459	0.762406	0.909035	0.865925	0.940869	1.000000

	Dimension	Variance expliquée	% variance expliquée	% cum. var. expliquée
0	Dim1	5.049592	84.0	84.0
1	Dim2	0.414679	7.0	91.0
2	Dim3	0.307597	5.0	96.0
3	Dim4	0.121650	2.0	98.0
4	Dim5	0.049650	1.0	99.0
5	Dim6	0.040692	1.0	100.0

	Dim1	Dim2	Country
0	-2.921523	0.503627	Aruba
1	-3.630601	0.075195	Andorra
2	3.704414	-0.846353	Afghanistan
3	2.083839	0.446336	Angola
5	0.137824	0.175896	Albania

	Year	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec	Annual
167	2017.0	0.739	0.845	0.873	0.737	0.659	0.641	0.651	0.714	0.557	0.571	0.554	0.600	0.677
168	2018.0	0.554	0.528	0.615	0.627	0.587	0.573	0.594	0.586	0.598	0.678	0.590	0.638	0.597
169	2019.0	0.738	0.662	0.874	0.780	0.610	0.708	0.706	0.719	0.713	0.752	0.693	0.880	0.736
170	2020.0	0.982	1.001	1.017	0.800	0.714	0.682	0.695	0.735	0.714	0.617	0.761	0.516	0.768
171	2021.0	0.539	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.539

Analyse de données sous `Python`

ACP

Variance expliquée

Représentation des individus

Représentation des variables

À faire

Températures mondiales (anomalies)

Analyse de données sous Python

ACP

Variance expliquée

Représentation des individus

Représentation des variables

À faire

Températures mondiales (anomalies)

Analyse de données sous `Python`