import numpy
import pandas
import matplotlib.pyplot as plt
import seaborn
# Switch seaborn's default plot style to "white".
seaborn.set_style(style="white")

%matplotlib inline


# Load the WGI 2019 table; dropna() removes the countries for which some
# information is missing.
wgi_url = "https://fxjollois.github.io/donnees/WGI/wgi2019.csv"
WGI_complet = pandas.read_csv(wgi_url).dropna()
WGI_complet


from sklearn.cluster import AgglomerativeClustering

# Keep only the indicator columns: the two identifier columns are dropped.
WGI_num = WGI_complet.drop(["Country", "Code"], axis="columns")
# NOTE(review): per scikit-learn's documentation, n_clusters=None together with
# distance_threshold=0 builds the whole tree and exposes `distances_`, which
# plot_dendrogram reads -- confirm for the installed version.
hac = AgglomerativeClustering(n_clusters=None, distance_threshold=0)
hac.fit(WGI_num)

AgglomerativeClustering(distance_threshold=0, n_clusters=None)


from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    A SciPy-style linkage matrix is assembled from the model's merge tree and
    handed to ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``children_`` (2-D array, one row of two
        child indices per merge), ``distances_`` (one value per merge) and
        ``labels_`` (one entry per sample).
    **kwargs
        Forwarded unchanged to ``dendrogram`` (e.g. ``truncate_mode``, ``p``,
        ``no_plot``).

    Returns
    -------
    dict
        Whatever ``dendrogram`` returns, so callers can reuse the computed
        layout. Earlier versions of this helper returned ``None``.
    """
    n_samples = len(model.labels_)

    # counts[i] = number of original samples gathered under the i-th merge.
    counts = numpy.zeros(model.children_.shape[0])
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                # Indices below n_samples designate a single sample (a leaf).
                current_count += 1
            else:
                # Larger indices designate the earlier merge number
                # (child_idx - n_samples), whose size is already known.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    # One row per merge: [child_a, child_b, distance, sample_count].
    linkage_matrix = numpy.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram and expose its layout to the caller.
    return dendrogram(linkage_matrix, **kwargs)


# Dendrogram of the hierarchical clustering fitted above.
plt.figure(figsize=(16, 6))
plt.title("CAH (Ward) sur les pays")
# No truncation argument is passed, so the whole tree is drawn.
plot_dendrogram(hac)
# Dashed horizontal guides (presumably the heights considered for cutting
# the tree -- confirm against the accompanying text).
for cut_height in (27, 15, 9):
    plt.axhline(y=cut_height, linewidth=.5, color="dimgray", linestyle="--")
plt.show()


# Hierarchical clustering of the same indicators, asking for 2 classes.
hac2 = AgglomerativeClustering(n_clusters=2)
hac2.fit(WGI_num)

AgglomerativeClustering()


# Size of each class (number of rows carrying each label).
pandas.DataFrame({"Classe": hac2.labels_}).groupby("Classe").size().to_frame("Effectif")


# Mean of every indicator within each class.
WGI_num.assign(Classe=hac2.labels_).groupby("Classe").mean()


# Box plot of each indicator, split by class (long format: one row per
# (observation, indicator) pair).
plt.figure(figsize=(16, 8))
df = WGI_num.assign(Classe=hac2.labels_).melt(id_vars="Classe")
seaborn.boxplot(data=df, x="value", y="variable", hue="Classe")
plt.show()


# Hierarchical clustering of the same indicators, asking for 3 classes.
hac3 = AgglomerativeClustering(n_clusters=3)
hac3.fit(WGI_num)

AgglomerativeClustering(n_clusters=3)


# Size of each class (number of rows carrying each label).
pandas.DataFrame({"Classe": hac3.labels_}).groupby("Classe").size().to_frame("Effectif")


# Mean of every indicator within each class.
WGI_num.assign(Classe=hac3.labels_).groupby("Classe").mean()


# Box plot of each indicator, split by class (long format: one row per
# (observation, indicator) pair).
plt.figure(figsize=(16, 8))
df = WGI_num.assign(Classe=hac3.labels_).melt(id_vars="Classe")
seaborn.boxplot(data=df, x="value", y="variable", hue="Classe")
plt.show()


from sklearn.cluster import KMeans

# k-means on the same indicators with 2 classes.
# NOTE(review): no random_state is set, so label numbering may differ from one
# run to the next.
kmeans2 = KMeans(n_clusters=2)
kmeans2.fit(WGI_num)

KMeans(n_clusters=2)


# Size of each class (number of rows carrying each label).
pandas.DataFrame({"Classe": kmeans2.labels_}).groupby("Classe").size().to_frame("Effectif")


# Coordinates of the fitted class centres.
kmeans2.cluster_centers_

array([[ 0.83555058,  0.81293581,  0.90273293,  0.87084453,  0.94783308,
         0.92581433],
       [-0.58660919, -0.63045795, -0.66673415, -0.63807482, -0.70108263,
        -0.6995669 ]])


# Mean of every indicator within each k-means class.
WGI_num.assign(Classe=kmeans2.labels_).groupby("Classe").mean()


# Box plot of each indicator, split by class (long format: one row per
# (observation, indicator) pair).
plt.figure(figsize=(16, 8))
df = WGI_num.assign(Classe=kmeans2.labels_).melt(id_vars="Classe")
seaborn.boxplot(data=df, x="value", y="variable", hue="Classe")
plt.show()


# Contingency table: hierarchical labels in rows, k-means labels in columns.
pandas.crosstab(index=hac2.labels_, columns=kmeans2.labels_)


# k-means on the same indicators with 3 classes.
# NOTE(review): no random_state is set, so label numbering may differ from one
# run to the next.
kmeans3 = KMeans(n_clusters=3)
kmeans3.fit(WGI_num)

KMeans(n_clusters=3)


# Size of each class (number of rows carrying each label).
pandas.DataFrame({"Classe": kmeans3.labels_}).groupby("Classe").size().to_frame("Effectif")


# Mean of every indicator within each k-means class.
WGI_num.assign(Classe=kmeans3.labels_).groupby("Classe").mean()


# Box plot of each indicator, split by class (long format: one row per
# (observation, indicator) pair).
plt.figure(figsize=(16, 8))
df = WGI_num.assign(Classe=kmeans3.labels_).melt(id_vars="Classe")
seaborn.boxplot(data=df, x="value", y="variable", hue="Classe")
plt.show()


# Contingency table: hierarchical labels in rows, k-means labels in columns.
pandas.crosstab(index=hac3.labels_, columns=kmeans3.labels_)


# Elbow curve: k-means inertia for k = 1..10, used to choose the number of
# classes.
plt.figure(figsize=(16, 6))
k_values = range(1, 11)
# Collected in a separate list so that `inertia` only ever names the final
# DataFrame, and appended in place instead of rebuilding the list each turn.
inertia_values = []
for k in k_values:
    # Random initialisation, restarted 20 times for each k.
    kmeans = KMeans(n_clusters=k, init="random", n_init=20).fit(WGI_num)
    inertia_values.append(kmeans.inertia_)
inertia = pandas.DataFrame({"k": k_values, "inertia": inertia_values})
seaborn.lineplot(data=inertia, x="k", y="inertia")
# Mark in red the two class counts examined earlier (k = 2 and k = 3).
for k_marked in (2, 3):
    plt.scatter(k_marked, inertia.loc[inertia["k"] == k_marked, "inertia"], c="red")
plt.show()


# HadCRUT5 global temperature file, read as raw text lines: with header=None
# and the default separator, each line is expected to land in column 0.
had = pandas.read_csv("https://crudata.uea.ac.uk/cru/data/temperature/HadCRUT5.0Analysis_gl.txt", header=None)
# Only every other line is kept ([::2]).
# NOTE(review): the skipped lines presumably hold non-temperature information
# (e.g. coverage) -- confirm against the file format description.
# str.split() with no argument splits on any run of whitespace, so no empty
# tokens need to be filtered out before converting to float.
donnees = pandas.DataFrame(
    [[float(value) for value in line.split()] for line in had[0][::2]],
    columns=["Year", "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Annual"],
).query("Year < 2022")  # rows for 2022 and later are dropped
donnees.tail()

	Country	Code	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law	Control of Corruption
0	Aruba	ABW	1.294189	1.357372	1.029933	0.857360	1.263128	1.217238
1	Andorra	ADO	1.139154	1.615139	1.908749	1.228176	1.579939	1.234392
2	Afghanistan	AFG	-0.988032	-2.649407	-1.463875	-1.120555	-1.713527	-1.401076
3	Angola	AGO	-0.777283	-0.311101	-1.117144	-0.893871	-1.054343	-1.054683
5	Albania	ALB	0.151805	0.118570	-0.061331	0.274380	-0.411179	-0.528758
...	...	...	...	...	...	...	...	...
209	Serbia	SRB	0.026626	-0.091665	0.019079	0.113867	-0.119070	-0.445551
210	South Africa	ZAF	0.670388	-0.217931	0.367380	0.156172	-0.076408	0.084924
211	Congo, Dem. Rep.	ZAR	-1.365966	-1.808007	-1.627429	-1.509667	-1.786088	-1.538931
212	Zambia	ZMB	-0.286199	-0.102216	-0.675215	-0.554269	-0.462069	-0.640345
213	Zimbabwe	ZWE	-1.141875	-0.920179	-1.205337	-1.463199	-1.257009	-1.238796

	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law	Control of Corruption
Classe
0	-0.498394	-0.536321	-0.615752	-0.603835	-0.633716	-0.641430
1	0.854999	0.824537	1.006136	0.996369	1.032080	1.023469

	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law	Control of Corruption
Classe
0	0.854999	0.824537	1.006136	0.996369	1.032080	1.023469
1	-0.959242	-1.062068	-1.052668	-0.990906	-1.041426	-1.029118
2	-0.015602	0.014461	-0.158030	-0.198332	-0.206592	-0.235281

	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law	Control of Corruption
Classe
0	0.835551	0.812936	0.902733	0.870845	0.947833	0.925814
1	-0.586609	-0.630458	-0.666734	-0.638075	-0.701083	-0.699567

col_0	0	1
row_0
0	9	120
1	73	0

Classification sous `Python`¶

Classification¶

Classification Ascendante Hiérarchique (CAH)¶

Avec 2 classes¶

Avec 3 classes¶

$k$-means¶

2 classes¶

3 classes¶

Choix du nombre de classes avec $k$-means¶

À faire¶

Températures mondiales (anomalies)¶

	Voice and Accountability	Political Stability and Absence of Violence/Terrorism	Government Effectiveness	Regulatory Quality	Rule of Law	Control of Corruption
Classe
0	-0.992739	-1.075007	-1.030913	-0.981857	-1.049120	-1.033342
1	1.020859	0.889478	1.344081	1.338705	1.327762	1.272946
2	0.156560	0.211170	-0.043721	-0.068433	-0.025587	-0.025345

	Year	Jan	Feb	Mar	Apr	May	Jun	Jul	Aug	Sep	Oct	Nov	Dec	Annual
167	2017.0	0.952	1.067	1.065	0.846	0.780	0.658	0.805	0.811	0.729	0.809	0.806	0.815	0.845
168	2018.0	0.711	0.796	0.790	0.822	0.713	0.738	0.733	0.735	0.676	0.869	0.745	0.824	0.763
169	2019.0	0.800	0.844	1.076	0.939	0.778	0.809	0.857	0.858	0.803	0.956	0.937	1.037	0.891
170	2020.0	1.069	1.113	1.094	1.063	0.908	0.825	0.816	0.801	0.867	0.811	1.013	0.693	0.923
171	2021.0	0.701	0.565	0.726	0.760	0.706	0.713	0.792	0.799	0.867	0.907	0.854	0.751	0.762

	Effectif
Classe
0	129
1	73

	Effectif
Classe
0	73
1	66
2	63

	Effectif
Classe
0	66
1	49
2	87

Classification sous Python¶

Classification¶

Classification Ascendante Hiérarchique (CAH)¶

Avec 2 classes¶

Avec 3 classes¶

$k$-means¶

2 classes¶

3 classes¶

Choix du nombre de classes avec $k$-means¶

À faire¶

Températures mondiales (anomalies)¶

Classification sous `Python`¶