import numpy
import pandas
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white") # change le style par défaut des graphiques seaborn

%matplotlib inline


# le dropna() permet de supprimer les pays pour lesquels il manque des informations
WGI_complet = pandas.read_csv("https://fxjollois.github.io/donnees/WGI/wgi2019.csv").dropna()
WGI_complet


from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# suppression des colonnes non numériques
WGI_num = WGI_complet.drop(columns = ["Country", "Code"])
pca = PCA()
pca.fit(WGI_num)

PCA()


print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

[4.17153223 0.41073498 0.30677259 0.06485507 0.04785222]
[0.83401503 0.0821183  0.06133309 0.01296648 0.0095671 ]


eig = pandas.DataFrame(
    {
        "Dimension" : ["Dim" + str(x + 1) for x in range(5)], 
        "Variance expliquée" : pca.explained_variance_,
        "% variance expliquée" : numpy.round(pca.explained_variance_ratio_ * 100),
        "% cum. var. expliquée" : numpy.round(numpy.cumsum(pca.explained_variance_ratio_) * 100)
    }
)
eig


plt.figure(figsize = (15, 5)) # modifie la taille du graphique
seaborn.barplot(data = eig, x = "Dimension", y = "% variance expliquée")
plt.text(5, 18, "17%")
plt.axhline(y = 17, linewidth = .5, color = "dimgray", linestyle = "--") # ligne 17 = 100 / 6 (nb dimensions)
plt.show() # cette ligne supprime l'affichage inutile en amont du graphique


WGI_pca = pca.transform(WGI_num)


# Transformation en DataFrame pandas
WGI_pca_df = pandas.DataFrame({
    "Dim1" : WGI_pca[:,0], 
    "Dim2" : WGI_pca[:,1],
    "Country" : WGI_complet["Country"]
})

# Résultat (premières lignes)
WGI_pca_df.head()


plt.figure(figsize = (15, 8))
plt.axvline(x = 0, linewidth = .5, color = "dimgray", linestyle = "--")
plt.axhline(y = 0, linewidth = .5, color = "dimgray", linestyle = "--")
seaborn.regplot(data = WGI_pca_df, x = "Dim1", y = "Dim2", fit_reg = False)
plt.xlabel("Dimension 1 (83%)") # modification du nom de l'axe X
plt.ylabel("Dimension 2 (8 %)") # idem pour axe Y
plt.suptitle("Premier plan factoriel (91%)") # titre général
plt.show()


# utilisation de subplots nécessaire car annotation du graphique
fig, ax = plt.subplots(figsize=(15,8))
plt.axvline(x = 0, linewidth = .5, color = "dimgray", linestyle = "--")
plt.axhline(y = 0, linewidth = .5, color = "dimgray", linestyle = "--")

# l'option ax permet de placer les points et le texte sur le même graphique
seaborn.regplot(data = WGI_pca_df, x = "Dim1", y = "Dim2", fit_reg = False, ax = ax)

# boucle sur chaque pays
for k in WGI_pca_df.iterrows():
    # annotation uniquement si valeur absolue sur une de 2 dimensions importantes (valeurs choisies empiriquement)
    if (abs(k[1]['Dim1']) > 3.5) | (abs(k[1]['Dim2']) > 1.5):
        ax.annotate(k[1]["Country"], (k[1]['Dim1'], k[1]['Dim2']), fontsize = 9, ha = "center")
plt.xlabel("Dimension 1 (83%)") 
plt.ylabel("Dimension 2 (8 %)")
plt.suptitle("Premier plan factoriel (91%)")
plt.show()


n = WGI_num.shape[0] # nb individus
p = WGI_num.shape[1] # nb variables
eigval = (n-1) / n * pca.explained_variance_ # valeurs propres
sqrt_eigval = numpy.sqrt(eigval) # racine carrée des valeurs propres
corvar = numpy.zeros((p,p)) # matrice vide pour avoir les coordonnées
for k in range(p):
    corvar[:,k] = pca.components_[k,:] * sqrt_eigval[k]
# on modifie pour avoir un dataframe
coordvar = pandas.DataFrame({'id': WGI_num.columns, 'COR_1': corvar[:,0], 'COR_2': corvar[:,1]})
coordvar


# Création d'une figure vide (avec des axes entre -1 et 1 + le titre)
fig, axes = plt.subplots(figsize = (10, 10))
fig.suptitle("Cercle des corrélations")
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)
# Ajout des axes
axes.axvline(x = 0, color = 'lightgray', linestyle = '--', linewidth = 1)
axes.axhline(y = 0, color = 'lightgray', linestyle = '--', linewidth = 1)
# Ajout des noms des variables
for j in range(p):
    axes.text(coordvar["COR_1"][j],coordvar["COR_2"][j], coordvar["id"][j])
# Ajout du cercle
plt.gca().add_artist(plt.Circle((0,0),1,color='blue',fill=False))

plt.show()


from sklearn.cluster import AgglomerativeClustering

hac = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
hac.fit(WGI_num)

AgglomerativeClustering(distance_threshold=0, n_clusters=None)


from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = numpy.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = numpy.column_stack([model.children_, model.distances_, counts]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)


plt.title("CAH (Ward) sur les pays")
# plot the top three levels of the dendrogram
plot_dendrogram(hac)
plt.axhline(y = 25, linewidth = .5, color = "dimgray", linestyle = "--")
plt.axhline(y = 12, linewidth = .5, color = "dimgray", linestyle = "--")
plt.show()


from sklearn.cluster import KMeans

kmeans2 = KMeans(n_clusters = 2)
kmeans2.fit(WGI_num)

KMeans(n_clusters=2)


pandas.Series(kmeans2.labels_).value_counts()

0    113
1     89
dtype: int64


kmeans2.cluster_centers_

array([[-0.65390611, -0.69609133, -0.70131998, -0.67725302, -0.7361161 ],
       [ 0.80913972,  0.78274272,  0.82320404,  0.8019086 ,  0.86262377]])


WGI_k2 = WGI_num.assign(classe = kmeans2.labels_)
WGI_k2.groupby("classe").mean()


WGI_pca_k2 = WGI_pca_df.assign(classe = kmeans2.labels_)
seaborn.lmplot(data = WGI_pca_k2, x = "Dim1", y = "Dim2", hue = "classe", 
               fit_reg = False, height = 5, aspect = 2)
plt.show()


kmeans3 = KMeans(n_clusters = 3)
kmeans3.fit(WGI_num)

KMeans(n_clusters=3)


pandas.Series(kmeans3.labels_).value_counts()

2    87
0    68
1    47
dtype: int64


WGI_k3 = WGI_num.assign(classe = kmeans3.labels_)
WGI_k3.groupby("classe").mean()


WGI_pca_k3 = WGI_pca_df.assign(classe = kmeans3.labels_)
seaborn.lmplot(data = WGI_pca_k3, x = "Dim1", y = "Dim2", hue = "classe", 
               fit_reg = False, height = 5, aspect = 2)
plt.show()


inertia = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters = k, init = "random", n_init = 20).fit(WGI_num)
    inertia = inertia + [kmeans.inertia_]
inertia = pandas.DataFrame({"k": range(1, 11), "inertia": inertia})
seaborn.lineplot(data = inertia, x = "k", y = "inertia")
plt.scatter(2, inertia.query('k == 2')["inertia"], c = "red")
plt.scatter(3, inertia.query('k == 3')["inertia"], c = "red")
plt.show()


had = pandas.read_csv("https://crudata.uea.ac.uk/cru/data/temperature/HadCRUT4-gl.dat", header=None)
donnees = pandas.DataFrame(
    [list(map(lambda v: float(v), filter(lambda v: v!= "", h.split(" ")))) for h in had[0][::2]],
    columns = ["Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Annual"]
)
donnees.tail()

	Country	Code	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law
0	Aruba	ABW	1.294189	1.357372	1.029933	0.857360	1.263128
1	Andorra	ADO	1.139154	1.615139	1.908749	1.228176	1.579939
2	Afghanistan	AFG	-0.988032	-2.649407	-1.463875	-1.120555	-1.713527
3	Angola	AGO	-0.777283	-0.311101	-1.117144	-0.893871	-1.054343
5	Albania	ALB	0.151805	0.118570	-0.061331	0.274380	-0.411179
...	...	...	...	...	...	...	...
209	Serbia	SRB	0.026626	-0.091665	0.019079	0.113867	-0.119070
210	South Africa	ZAF	0.670388	-0.217931	0.367380	0.156172	-0.076408
211	Congo, Dem. Rep.	ZAR	-1.365966	-1.808007	-1.627429	-1.509667	-1.786088
212	Zambia	ZMB	-0.286199	-0.102216	-0.675215	-0.554269	-0.462069
213	Zimbabwe	ZWE	-1.141875	-0.920179	-1.205337	-1.463199	-1.257009

	Dimension	Variance expliquée	% variance expliquée	% cum. var. expliquée
0	Dim1	4.171532	83.0	83.0
1	Dim2	0.410735	8.0	92.0
2	Dim3	0.306773	6.0	98.0
3	Dim4	0.064855	1.0	99.0
4	Dim5	0.047852	1.0	100.0

	Dim1	Dim2	Country
0	-2.638757	0.502308	Aruba
1	-3.410131	0.039344	Andorra
2	3.453682	-0.829503	Afghanistan
3	1.824034	0.440028	Angola
5	-0.077668	0.116001	Albania

	id	COR_1	COR_2
0	Voice and Accountability	-0.853816	0.293912
1	Political Stability and Absence of Violence/Te...	-0.838199	0.412244
2	Government Effectiveness	-0.947230	-0.252506
3	Regulatory Quality	-0.939528	-0.282021
4	Rule of Law	-0.969197	-0.095275

	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law
classe
0	-0.653906	-0.696091	-0.701320	-0.677253	-0.736116
1	0.809140	0.782743	0.823204	0.801909	0.862624

Analyse de données sous `Python`¶

ACP¶

Variance expliquée¶

Représentation des individus¶

Représentation des variables¶

Classification¶

Classification Ascendante Hiérarchique (CAH)¶

$k$-means¶

2 classes¶

3 classes¶

Choix du nombre de classes avec $k$-means¶

A faire¶

Températures mondiales (anomalies)¶

	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law
classe
0	-1.005108	-1.030116	-1.005825	-0.982840	-1.034689
1	1.060408	0.882350	1.359792	1.373400	1.363597
2	0.191151	0.225094	-0.017219	-0.033062	-0.001584

	Year	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec	Annual
167	2017.0	0.739	0.845	0.873	0.737	0.659	0.641	0.651	0.714	0.557	0.571	0.554	0.600	0.677
168	2018.0	0.554	0.528	0.615	0.627	0.587	0.573	0.594	0.586	0.598	0.678	0.590	0.638	0.597
169	2019.0	0.738	0.662	0.874	0.780	0.610	0.708	0.706	0.719	0.713	0.752	0.693	0.880	0.736
170	2020.0	0.982	1.001	1.017	0.800	0.714	0.682	0.695	0.735	0.714	0.617	0.761	0.516	0.768
171	2021.0	0.539	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.000	0.539

Analyse de données sous Python¶

ACP¶

Variance expliquée¶

Représentation des individus¶

Représentation des variables¶

Classification¶

Classification Ascendante Hiérarchique (CAH)¶

$k$-means¶

2 classes¶

3 classes¶

Choix du nombre de classes avec $k$-means¶

A faire¶

Températures mondiales (anomalies)¶

Analyse de données sous `Python`¶