import numpy
import pandas
import matplotlib.pyplot as plt
import seaborn

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    A SciPy-style linkage matrix (one row per merge: the two merged node
    ids, the merge distance and the number of samples under the new node)
    is rebuilt from the model's attributes and handed to
    ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``children_`` (one pair of node ids per
        merge), ``distances_`` (one value per merge) and ``labels_`` (one
        entry per sample); these are the only attributes read here.
    **kwargs
        Forwarded unchanged to ``dendrogram`` (e.g. ``ax``, ``no_plot``).

    Returns
    -------
    dict
        Whatever ``dendrogram`` returns, so callers can reuse the leaf
        order or the plotted coordinates. Previously the result was dropped.
    """
    children = numpy.asarray(model.children_)
    n_samples = len(model.labels_)

    # Number of original samples under each merge node. Node ids below
    # n_samples are leaves; id n_samples + j refers to the node created by
    # merge j, whose count has already been stored in counts[j].
    counts = numpy.zeros(children.shape[0])
    for i, merge in enumerate(children):
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    linkage_matrix = numpy.column_stack(
        [children, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram and hand its description back.
    return dendrogram(linkage_matrix, **kwargs)
    
%matplotlib inline


# Download the two "pendigits" files from the UCI repository and stack them
# into a single frame (presumably .tes = test split and .tra = training
# split; verify against the dataset description).
url_base = "http://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/"
pen_tes = pandas.read_csv(url_base + "pendigits.tes", header=None)
pen_tra = pandas.read_csv(url_base + "pendigits.tra", header=None)
# DataFrame.append() was removed in pandas 2.0; concat() produces the same
# stacked frame (rows of pen_tes first) with a fresh 0..n-1 index and leaves
# both inputs untouched.
pen = pandas.concat([pen_tes, pen_tra], ignore_index=True)
print("Dimensions des données : ", pen.shape)
pen.head()

Dimensions des données :  (10992, 17)


# Column labels for the frame: x1, y1, x2, y2, ..., x8, y8, then the digit
# label column "chiffre".
a = [f"{axe}{k}" for k in range(1, 9) for axe in "xy"] + ["chiffre"]
print(a)

['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6', 'x7', 'y7', 'x8', 'y8', 'chiffre']


# Replace the default integer column labels with the names built in `a`.
pen.columns = a
# Last expression of the cell: preview of the first rows with the new labels.
pen.head()


# Names of the eight x-coordinate columns: x1 .. x8.
xN = [f"x{k}" for k in range(1, 9)]
print(xN)

['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']


# Names of the eight y-coordinate columns: y1 .. y8.
yN = [f"y{k}" for k in range(1, 9)]
print(yN)

['y1', 'y2', 'y3', 'y4', 'y5', 'y6', 'y7', 'y8']


# Names of all sixteen coordinate columns, interleaved: x1, y1, ..., x8, y8.
# Even positions take the "x" prefix, odd ones "y"; the point number advances
# every two positions.
xyN = ["xy"[k % 2] + str(k // 2 + 1) for k in range(16)]
print(xyN)

['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6', 'x7', 'y7', 'x8', 'y8']


# Take the first row of the data: its x coordinates, its y coordinates and
# its digit label, then draw the tracing as a line with the digit as title.
x, y, chiffre = (pen.loc[0, colonnes] for colonnes in (xN, yN, "chiffre"))
plt.plot(x, y)
plt.title(f"Chiffre : {chiffre}")
plt.show()


def dessin(p, x, y, chiffre):
    """Draw one tracing on the axes-like object *p*.

    The points (x, y) are joined by a line, the title shows the digit
    *chiffre*, the axis decorations are switched off and both axes are
    limited to [-1, 101].
    """
    bas, haut = -1, 101
    p.plot(x, y)
    p.set_title(f"Chiffre : {chiffre}")
    p.axis("off")
    p.set_xlim([bas, haut])
    p.set_ylim([bas, haut])

# Draw the tracing held in x / y / chiffre through dessin(), on a dedicated
# figure with a single axes.
fig, ax = plt.subplots()
dessin(ax, x, y, chiffre)


# One sub-frame per digit 0..9, each re-indexed from 0 so that its first row
# can be addressed with the label 0.
sub = [
    pen[pen["chiffre"] == valeur].reset_index(drop=True)
    for valeur in range(10)
]


# For every digit: [x coordinates, y coordinates, digit label] of the first
# row of its sub-frame.
sub_first_xyc = [
    [s.loc[0, colonnes] for colonnes in (xN, yN, "chiffre")]
    for s in sub
]


# 2 x 5 grid: one panel per digit, showing the first tracing of that digit.
fig = plt.figure(figsize=(15, 5))
for i in range(10):
    ax = fig.add_subplot(2, 5, i + 1)
    # Each entry is [x, y, chiffre], i.e. exactly dessin()'s remaining arguments.
    dessin(ax, *sub_first_xyc[i])


# Mean of every coordinate column per digit, rounded to 2 decimals; the
# resulting frame is indexed by the values of the "chiffre" column.
cmoy = pen.groupby("chiffre").mean().round(2)
# Last expression of the cell: display the table of mean coordinates.
cmoy


def dessin(p, x, y, chiffre, pos = False, titre = "Chiffre"):
    """Draw one tracing on the axes-like object *p*.

    Parameters
    ----------
    p : axes-like
        Object providing ``plot``, ``text``, ``set_title``, ``axis``,
        ``set_xlim`` and ``set_ylim`` (the methods called below).
    x, y : iterables of numbers
        Coordinates of the successive points of the tracing. Lists, arrays
        and pandas Series (whatever their index labels) are all accepted.
    chiffre : object
        Value shown after the title prefix (converted with ``str``).
    pos : bool, default False
        When true, write the 1-based rank of each point at its location.
    titre : str, default "Chiffre"
        Title prefix.
    """
    p.plot(x, y)
    if pos:
        # Iterate over the values themselves rather than indexing x[i] / y[i]:
        # integer indexing of a Series labelled "x1", "x2", ... relies on a
        # positional fallback that pandas has deprecated, and a hard-coded
        # range(8) breaks for tracings with another number of points.
        for rang, (xi, yi) in enumerate(zip(x, y), start=1):
            p.text(xi, yi, str(rang),
                   va = "center", ha = "center", weight = "bold", size = "x-large")
    p.set_title(titre + " : " + str(chiffre))
    p.axis("off")
    p.set_xlim([-1, 101])
    p.set_ylim([-1, 101])


# 2 x 5 grid: one panel per digit, showing its mean tracing with the rank of
# each point written on the plot.
fig = plt.figure(figsize=(15, 5))
for i in range(10):
    moyenne = cmoy.loc[i]  # row of mean coordinates for digit i
    ax = fig.add_subplot(2, 5, i + 1)
    dessin(ax, moyenne[xN], moyenne[yN], str(i), pos=True)


# PCA fitted on the raw (unscaled) coordinate columns; only the first two
# components are kept for plotting, next to the digit label of each row.
pca_original = PCA().fit(pen[xyN])
pen_original_pca = pca_original.transform(pen[xyN])
pen_original_df = pandas.DataFrame(
    dict(
        Dim1=pen_original_pca[:, 0],
        Dim2=pen_original_pca[:, 1],
        Chiffre=pen["chiffre"],
    )
)


# Scatter of all tracings on the first two components (raw data), coloured
# by digit; fit_reg = False disables the regression fit of lmplot.
seaborn.lmplot(data = pen_original_df, x = "Dim1", y = "Dim2", hue = "Chiffre",
              fit_reg = False, height = 7, aspect = 1.5)
plt.show()


# Same scatter, split into one panel per digit (col = "Chiffre"), wrapped
# after 5 panels per row, with a bold title on each panel.
g = seaborn.lmplot(data = pen_original_df, x = "Dim1", y = "Dim2", hue = "Chiffre", 
               col = "Chiffre", col_wrap = 5, fit_reg = False)
g.set_titles(col_template = "chiffre : {col_name}", fontweight = "bold", size = 24)
plt.show()


# PCA on the standardised coordinate columns. The scaled matrix is computed
# once and reused for both fit and transform instead of calling scale() twice
# on the same data.
pen_xy_scale = scale(pen.loc[:,xyN])
pca_scale = PCA()
pca_scale.fit(pen_xy_scale)
pen_scale_pca = pca_scale.transform(pen_xy_scale)
# Keep the first two components next to the digit label of each row.
pen_scale_df = pandas.DataFrame({
    "Dim1" : pen_scale_pca[:,0], 
    "Dim2" : pen_scale_pca[:,1],
    "Chiffre" : pen["chiffre"]
})


# Scatter of all tracings on the first two components (standardised data),
# coloured by digit; fit_reg = False disables the regression fit of lmplot.
seaborn.lmplot(data = pen_scale_df, x = "Dim1", y = "Dim2", hue = "Chiffre",
              fit_reg = False, height = 7, aspect = 1)
plt.show()


# Same scatter, split into one panel per digit (col = "Chiffre"), wrapped
# after 5 panels per row, with a bold title on each panel.
g = seaborn.lmplot(data = pen_scale_df, x = "Dim1", y = "Dim2", hue = "Chiffre", 
               col = "Chiffre", col_wrap = 5, fit_reg = False)
g.set_titles(col_template = "chiffre : {col_name}", fontweight = "bold", size = 24)
plt.show()


def recherche(chiffre, k_max = 10):
    """Explore how many writing styles exist for one digit.

    Draws a figure with two panels: on the left the dendrogram of a
    hierarchical clustering of the digit's tracings, on the right the k-means
    inertia for k = 1 .. k_max.

    Parameters
    ----------
    chiffre : int
        Digit whose rows are selected from the global frame ``pen``.
    k_max : int, default 10
        Largest number of clusters tried with k-means (the original code
        hard-coded 10).
    """
    # Restrict to the rows of interest and standardise the coordinates before
    # both clusterings.
    pen_chiffre = pen.query("chiffre == " + str(chiffre)).drop(columns = "chiffre")
    pen_chiffre_scale = scale(pen_chiffre)

    # Hierarchical clustering, shown as a dendrogram in the left panel.
    # plot_dendrogram() reads hac.distances_; NOTE(review): the
    # distance_threshold=0 / n_clusters=None combination is presumably what
    # makes scikit-learn build the full tree and store those distances.
    fig = plt.figure(figsize = (15, 5))
    hac = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
    hac.fit(pen_chiffre_scale)
    plot_dendrogram(hac, ax = fig.add_subplot(1, 2, 1))

    # k-means for each k, keeping the within-cluster inertia of each fit;
    # built with a comprehension instead of repeated list concatenation.
    ks = range(1, k_max + 1)
    inertia = [
        KMeans(n_clusters = k, init = "random", n_init = 20)
        .fit(pen_chiffre_scale)
        .inertia_
        for k in ks
    ]
    ax = fig.add_subplot(1, 2, 2)
    ax.plot(ks, inertia)


def application(chiffre, nb_classes):
    """Cluster the tracings of one digit and display the resulting classes.

    Runs k-means with ``nb_classes`` clusters on the standardised tracings of
    ``chiffre``, then draws (1) the tracings on the first two components of
    the global ``pen_scale_df``, one panel per class, and (2) the mean
    tracing of every class, titled with the class size.

    Parameters
    ----------
    chiffre : int
        Digit whose rows are selected from the global frames ``pen`` and
        ``pen_scale_df``.
    nb_classes : int
        Number of k-means clusters.
    """
    # Restrict to the rows of interest and standardise the coordinates.
    pen_chiffre = pen.query("chiffre == " + str(chiffre)).drop(columns = "chiffre")
    pen_chiffre_scale = scale(pen_chiffre)

    kmeans = KMeans(n_clusters = nb_classes)
    kmeans.fit(pen_chiffre_scale)

    # Both queries filter the same digit in the same row order, so the labels
    # can be attached positionally to the PCA coordinates.
    pca_chiffre = pen_scale_df.query("Chiffre == " + str(chiffre)) \
                              .assign(classe = kmeans.labels_)
    g = seaborn.lmplot(data = pca_chiffre, x = "Dim1", y = "Dim2", hue = "classe", 
                   col = "classe", fit_reg = False, height = 4)
    g.set(xlim=(-4, 5), ylim=(-4,5))

    # Mean tracing of each class, computed on the original (unscaled)
    # coordinates so it can be drawn in the 0..100 frame used by dessin().
    km_centres = pen_chiffre.assign(classe = kmeans.labels_).groupby("classe").mean()
    # Size of every class in one vectorised pass, instead of scanning all the
    # labels once per class in Python.
    effectifs = numpy.bincount(kmeans.labels_, minlength = nb_classes)
    fig = plt.figure(figsize = (18, 4))
    for k in range(nb_classes):
        ax = fig.add_subplot(1, nb_classes, k + 1)
        dessin(ax, km_centres.loc[k,xN], km_centres.loc[k,yN], str(k), 
               pos = True, titre = "Classe (nb : " + str(effectifs[k]) + ")")


# For each digit: first the exploratory plots (dendrogram + inertia curve),
# then the clustering with the number of classes retained for that digit.
classes_retenues = [
    (0, 4),
    (1, 4),
    (2, 1),
    (3, 1),
    (4, 3),
    (5, 2),
    (6, 2),
    (7, 2),
    (8, 8),
    (9, 4),
]
for numero, nb in classes_retenues:
    recherche(numero)
    application(numero, nb)

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8	chiffre
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8
chiffre
0	35.37	86.06	11.58	58.31	14.94	19.60	51.17	7.29	85.94	31.30	89.29	68.49	59.01	89.31	22.10	75.24
1	14.70	61.39	44.35	77.94	69.86	89.51	77.50	79.80	67.64	54.06	47.80	32.66	44.60	16.16	59.91	1.38
2	18.39	76.95	42.13	99.39	67.46	79.76	51.28	46.05	19.83	19.38	11.64	9.09	53.06	5.25	98.71	4.17
3	24.78	84.06	56.66	99.52	86.64	84.69	64.53	60.59	82.13	43.22	90.88	17.26	50.01	2.28	3.47	6.24
4	42.96	99.54	22.13	79.38	5.75	51.16	42.83	40.47	85.10	49.56	86.30	59.72	70.99	31.45	62.60	0.00
5	41.24	90.94	42.60	75.83	57.31	59.18	36.46	29.36	26.18	33.15	37.64	50.24	42.83	57.69	59.46	60.31
6	87.52	98.72	51.75	86.72	20.71	58.48	6.94	26.93	32.61	3.14	81.11	11.02	61.57	30.54	11.00	23.35
7	3.50	91.01	45.37	98.25	78.85	80.76	71.27	47.47	52.73	14.93	33.60	18.47	39.51	33.80	81.14	34.31
8	56.95	82.08	39.83	79.62	51.81	51.93	50.56	24.22	35.25	17.07	39.93	36.90	67.78	68.49	49.00	81.40
9	69.26	81.32	52.79	83.26	45.45	81.28	56.57	82.96	79.06	71.09	89.78	43.23	61.48	14.34	18.15	4.54

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8	chiffre
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8	chiffre
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

Etude de cas¶

Données¶

Objectifs¶

Librairies et fonction plot_dendrogram()¶

Importation des données¶

Création du vecteur des noms de variables¶

Renommage des colonnes avec ce vecteur¶

Création de vecteurs utiles pour la suite¶

Représentation graphique d'un tracé¶

Création d'une fonction dessin()¶

Création d'une liste¶

Représentation du premier tracé de chaque chiffre¶

Représentation du premier tracé de chaque chiffre¶

Représentation du premier tracé de chaque chiffre¶

Calcul des coordonnées moyennes¶

Récriture de la fonction dessin()¶

Représentation des chiffres moyens¶

Représentation des tracés sur un plan en 2D¶

sur données originales¶

Représentation des tracés sur un plan en 2D¶

sur données originales¶

Représentation des tracés sur un plan en 2D¶

sur données originales¶

Représentation des tracés sur un plan en 2D¶

sur données standardisées¶

Représentation des tracés sur un plan en 2D¶

sur données standardisées¶

Représentation des tracés sur un plan en 2D¶

sur données standardisées¶

Recherche des différentes manières d'écrire chaque chiffre¶

Définition de 2 fonctions¶

recherche(chiffre)¶

application(chiffre, nb_classes)¶

Fonction recherche(chiffre)¶

Fonction application(chiffre, nb_classes)¶

Chiffre 0¶

Recherche¶

Chiffre 0¶

Application¶

Chiffre 1¶

Recherche¶

Chiffre 1¶

Application¶

Chiffre 2¶

Recherche¶

Chiffre 2¶

Application¶

Chiffre 3¶

Recherche¶

Chiffre 3¶

Application¶

Chiffre 4¶

Recherche¶

Chiffre 4¶

Application¶

Chiffre 5¶

Recherche¶

Chiffre 5¶

Application¶

Chiffre 6¶

Recherche¶

Chiffre 6¶

Application¶

Chiffre 7¶

Recherche¶

Chiffre 7¶

Application¶

Chiffre 8¶

Recherche¶

Chiffre 8¶

Application¶

Chiffre 9¶

Recherche¶

Chiffre 9¶

Application¶

Conclusion¶

Mais que faire de ces informations?¶

Librairies et fonction `plot_dendrogram()`¶

Création d'une fonction `dessin()`¶

Récriture de la fonction `dessin()`¶

`recherche(chiffre)`¶

`application(chiffre, nb_classes)`¶

Fonction `recherche(chiffre)`¶

Fonction `application(chiffre, nb_classes)`¶

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8	chiffre
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9