import numpy
import pandas
import matplotlib.pyplot as plt

%matplotlib inline


# Load the two "pendigits" files; header=None makes pandas assign integer
# column labels (0..16) instead of consuming the first data row as a header.
pen_tes = pandas.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tes",
    header=None,
)
pen_tra = pandas.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/pendigits/pendigits.tra",
    header=None,
)
# Stack both tables (test rows first) under a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pandas.concat is the supported
# replacement and already returns a new frame, so no defensive copy is needed.
pen = pandas.concat([pen_tes, pen_tra], ignore_index=True)
print(pen.shape)
pen.head()

(10992, 17)


# Column labels: an (x, y) pair for each of the 8 points, then the digit label.
a = [axe + str(point) for point in range(1, 9) for axe in ("x", "y")]
a.append("chiffre")
print(a)

['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6', 'x7', 'y7', 'x8', 'y8', 'chiffre']


# Replace the integer column labels with the names built in `a`.
pen.columns = a
# Bare expression: previews the first rows of the renamed table.
pen.head()


# Column-name helpers: x columns only, y columns only, then both interleaved
# in the order x1, y1, ..., x8, y8.
xN = [f"x{k}" for k in range(1, 9)]
print(xN)
yN = [f"y{k}" for k in range(1, 9)]
print(yN)
xyN = [nom for paire in zip(xN, yN) for nom in paire]
print(xyN)

['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8']
['y1', 'y2', 'y3', 'y4', 'y5', 'y6', 'y7', 'y8']
['x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'x4', 'y4', 'x5', 'y5', 'x6', 'y6', 'x7', 'y7', 'x8', 'y8']


# Draw the stroke stored in row 0, titled with its digit label.
x, y = pen.loc[0, xN], pen.loc[0, yN]
chiffre = pen.at[0, "chiffre"]
plt.plot(x, y)
plt.title(f"Chiffre : {chiffre}")
plt.show()


def dessin(p, x, y, chiffre):
    """Draw one stroke on the axes-like object `p`.

    Plots `y` against `x`, sets the title to "Chiffre : <chiffre>", turns the
    axis decorations off and fixes both axis limits to [-1, 101].
    """
    p.plot(x, y)
    p.set_title(f"Chiffre : {chiffre}")
    p.axis("off")
    # Same fixed window on both axes so every stroke is drawn at the same scale.
    for fixer_limites in (p.set_xlim, p.set_ylim):
        fixer_limites([-1, 101])

# Redraw the row-0 stroke (x, y, chiffre defined earlier) through dessin()
# on a dedicated figure/axes pair.
fig, ax = plt.subplots()
dessin(ax, x, y, chiffre)


# One sub-table per digit 0..9, each re-indexed from 0.
sub = [pen[pen["chiffre"] == d].reset_index(drop=True) for d in range(10)]


# For every digit: [x coordinates, y coordinates, label] of its first stroke.
sub_first_xyc = [
    [premier[xN], premier[yN], premier["chiffre"]]
    for premier in (s.loc[0] for s in sub)
]


# 2 x 5 grid showing that first stroke for each digit.
fig = plt.figure(figsize=(15, 5))
for i in range(10):
    ax = fig.add_subplot(2, 5, i + 1)  # add a subplot at position i + 1
    dessin(ax, *sub_first_xyc[i])


# Mean of every coordinate column per digit, rounded to 2 decimals
# (bare expression: the resulting table is only displayed, not stored).
pen.groupby("chiffre").mean().round(2)


def dessin(p, x, y, chiffre, pos=False, titre="Chiffre"):
    """Draw one stroke on the axes-like object `p`, optionally numbering its points.

    Parameters
    ----------
    p : object exposing plot, text, set_title, axis, set_xlim and set_ylim
        (used here with matplotlib axes).
    x, y : iterables of coordinates of equal length (lists, arrays or Series).
    chiffre : value shown after the title prefix.
    pos : when true, write the rank (1, 2, ...) of each point at its location.
    titre : title prefix; the title is "<titre> : <chiffre>".
    """
    p.plot(x, y)
    if pos:
        # Iterate over the values rather than indexing with x[i]: callers pass
        # label-indexed Series (e.g. cmoy.loc[i, xN]), for which an integer
        # lookup is not a reliable positional access. This also labels however
        # many points are given instead of assuming exactly 8.
        for rang, (xi, yi) in enumerate(zip(x, y), start=1):
            p.text(xi, yi, str(rang), va="center", ha="center", weight="bold", size="x-large")
    p.set_title(titre + " : " + str(chiffre))
    p.axis("off")
    # Fixed window on both axes so all strokes are drawn at the same scale.
    p.set_xlim([-1, 101])
    p.set_ylim([-1, 101])


# Mean stroke of each digit: per-digit average of the 16 coordinate columns,
# rounded to 2 decimals.
cmoy = pen.groupby("chiffre")[xyN].mean().round(2)

# 2 x 5 grid of the mean strokes, with the rank of each point written on it.
fig = plt.figure(figsize=(15, 5))
for i in range(10):
    ax = fig.add_subplot(2, 5, i + 1)  # add a subplot at position i + 1
    dessin(ax, x=cmoy.loc[i, xN], y=cmoy.loc[i, yN], chiffre=str(i), pos=True)


from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# PCA fitted on the raw (unscaled) coordinate columns; the first two
# components are kept next to the digit label for plotting.
pca_original = PCA().fit(pen[xyN])
pen_original_pca = pca_original.transform(pen[xyN])
pen_original_df = pandas.DataFrame(
    dict(
        Dim1=pen_original_pca[:, 0],
        Dim2=pen_original_pca[:, 1],
        Chiffre=pen["chiffre"],
    )
)


import seaborn

# Scatter of the first two components of the unscaled PCA, one colour per
# digit; fit_reg = False turns off lmplot's regression line.
seaborn.lmplot(data = pen_original_df, x = "Dim1", y = "Dim2", hue = "Chiffre",
              fit_reg = False, height = 8, aspect = 1)
plt.show()


import seaborn

# Same scatter, split into one panel per digit (col = "Chiffre"), wrapped
# after 5 panels per row.
seaborn.lmplot(data = pen_original_df, x = "Dim1", y = "Dim2", hue = "Chiffre", 
               col = "Chiffre", col_wrap = 5,
              fit_reg = False)
plt.show()


# PCA on the standardized coordinate columns. The scaled matrix is computed
# once and reused for both fit and transform instead of being rebuilt twice.
pen_xy_scaled = scale(pen.loc[:, xyN])
pca_scale = PCA()
pca_scale.fit(pen_xy_scaled)
pen_scale_pca = pca_scale.transform(pen_xy_scaled)
# Keep the first two components next to the digit label for plotting.
pen_scale_df = pandas.DataFrame({
    "Dim1" : pen_scale_pca[:,0],
    "Dim2" : pen_scale_pca[:,1],
    "Chiffre" : pen["chiffre"]
})


import seaborn

# Scatter of the first two components of the PCA on standardized data, one
# colour per digit; fit_reg = False turns off lmplot's regression line.
seaborn.lmplot(data = pen_scale_df, x = "Dim1", y = "Dim2", hue = "Chiffre",
              fit_reg = False, height = 8, aspect = 1)
plt.show()


import seaborn

# Same scatter on standardized data, split into one panel per digit
# (col = "Chiffre"), wrapped after 5 panels per row.
seaborn.lmplot(data = pen_scale_df, x = "Dim1", y = "Dim2", hue = "Chiffre", 
               col = "Chiffre", col_wrap = 5,
              fit_reg = False)
plt.show()


from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Build a linkage matrix from a fitted clustering model and draw its dendrogram.

    `model` must expose `children_` (one row of two node ids per merge),
    `distances_` (one value per merge) and `labels_` (one entry per sample).
    Extra keyword arguments are forwarded unchanged to scipy's `dendrogram`.
    """
    merges = model.children_
    n_leaves = len(model.labels_)

    # Number of original samples under each merge node: ids below n_leaves are
    # single samples, larger ids refer to the merge at row (id - n_leaves).
    sizes = numpy.zeros(merges.shape[0])
    for row, pair in enumerate(merges):
        sizes[row] = sum(
            1 if node < n_leaves else sizes[node - n_leaves] for node in pair
        )

    # Columns: the two merged node ids, the merge distance, the sample count.
    table = numpy.column_stack([merges, model.distances_, sizes]).astype(float)
    dendrogram(table, **kwargs)


def recherche(chiffre):
    """Show two diagnostics for choosing a number of classes for one digit.

    Works on the rows of `pen` whose label equals `chiffre`, standardized
    column-wise. The left panel shows the dendrogram of a hierarchical
    clustering; the right panel shows the k-means inertia for k = 1..10.
    """
    # Keep only this digit's strokes, drop the label, then standardize.
    traces = scale(pen.query(f"chiffre == {chiffre}").drop(columns="chiffre"))

    fig = plt.figure(figsize=(15, 5))

    # Hierarchical clustering; these settings are passed so the fitted model
    # carries the distances_ that plot_dendrogram reads.
    arbre = AgglomerativeClustering(distance_threshold=0, n_clusters=None).fit(traces)
    plot_dendrogram(arbre, ax=fig.add_subplot(1, 2, 1))

    # Within-class inertia of k-means for each candidate number of classes.
    candidats = range(1, 11)
    inerties = [
        KMeans(n_clusters=k, init="random", n_init=20).fit(traces).inertia_
        for k in candidats
    ]
    fig.add_subplot(1, 2, 2).plot(candidats, inerties)


def application(chiffre, nb_classes):
    """Cluster one digit's strokes with k-means and display the result.

    Prints the size of each of the `nb_classes` classes, plots the strokes on
    the first two dimensions stored in `pen_scale_df` (one panel per class)
    and draws the mean stroke of every class.
    """
    # Keep only this digit's strokes and drop the label; clustering is run on
    # the standardized coordinates.
    pen_chiffre = pen.query(f"chiffre == {chiffre}").drop(columns="chiffre")
    etiquettes = KMeans(n_clusters=nb_classes).fit(scale(pen_chiffre)).labels_

    print("Effectifs des classes")
    for k in range(nb_classes):
        print(f"Classe {k} : {numpy.sum(etiquettes == k)}")

    # Strokes of this digit on the factorial plane, one panel per class.
    pca_chiffre = pen_scale_df.query(f"Chiffre == {chiffre}").assign(classe=etiquettes)
    grille = seaborn.lmplot(
        data=pca_chiffre,
        x="Dim1",
        y="Dim2",
        hue="classe",
        col="classe",
        fit_reg=False,
    )
    grille.set(xlim=(-4, 5), ylim=(-4, 5))

    # Mean stroke of each class, computed on the original (unscaled) coordinates.
    km_centres = pen_chiffre.assign(classe=etiquettes).groupby("classe").mean()
    fig = plt.figure(figsize=(15, 5))
    for k in range(nb_classes):
        ax = fig.add_subplot(1, nb_classes, k + 1)  # add a subplot at position k + 1
        centre = km_centres.loc[k]
        dessin(ax, centre[xN], centre[yN], str(k), pos=True, titre="Classe")


# Digit 0: show the dendrogram and inertia curve, then run k-means with 4 classes.
recherche(0)


application(0, 4)

Effectifs des classes
Classe 0 : 470
Classe 1 : 61
Classe 2 : 260
Classe 3 : 352


# Digit 1: show the dendrogram and inertia curve, then run k-means with 4 classes.
recherche(1)


application(1, 4)

Effectifs des classes
Classe 0 : 373
Classe 1 : 349
Classe 2 : 95
Classe 3 : 326


# Digit 2: show the dendrogram and inertia curve, then run k-means with a single class.
recherche(2)


application(2, 1)

Effectifs des classes
Classe 0 : 1144


# Digit 3: show the dendrogram and inertia curve, then run k-means with a single class.
recherche(3)


application(3, 1)

Effectifs des classes
Classe 0 : 1055


# Digit 4: show the dendrogram and inertia curve, then run k-means with 3 classes.
recherche(4)


application(4, 3)

Effectifs des classes
Classe 0 : 323
Classe 1 : 509
Classe 2 : 312


# Digit 5: show the dendrogram and inertia curve, then run k-means with 2 classes.
recherche(5)


application(5, 2)

Effectifs des classes
Classe 0 : 627
Classe 1 : 428


# Digit 6: show the dendrogram and inertia curve, then run k-means with 2 classes.
recherche(6)


application(6, 2)

Effectifs des classes
Classe 0 : 630
Classe 1 : 426


# Digit 7: show the dendrogram and inertia curve, then run k-means with 2 classes.
recherche(7)


application(7, 2)

Effectifs des classes
Classe 0 : 984
Classe 1 : 158


# Digit 8: show the dendrogram and inertia curve, then run k-means with 8 classes.
recherche(8)


application(8, 8)

Effectifs des classes
Classe 0 : 244
Classe 1 : 205
Classe 2 : 166
Classe 3 : 168
Classe 4 : 54
Classe 5 : 48
Classe 6 : 37
Classe 7 : 133


# Digit 9: show the dendrogram and inertia curve, then run k-means with 4 classes.
recherche(9)


application(9, 4)

Effectifs des classes
Classe 0 : 221
Classe 1 : 576
Classe 2 : 25
Classe 3 : 233

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8
chiffre
0	35.37	86.06	11.58	58.31	14.94	19.60	51.17	7.29	85.94	31.30	89.29	68.49	59.01	89.31	22.10	75.24
1	14.70	61.39	44.35	77.94	69.86	89.51	77.50	79.80	67.64	54.06	47.80	32.66	44.60	16.16	59.91	1.38
2	18.39	76.95	42.13	99.39	67.46	79.76	51.28	46.05	19.83	19.38	11.64	9.09	53.06	5.25	98.71	4.17
3	24.78	84.06	56.66	99.52	86.64	84.69	64.53	60.59	82.13	43.22	90.88	17.26	50.01	2.28	3.47	6.24
4	42.96	99.54	22.13	79.38	5.75	51.16	42.83	40.47	85.10	49.56	86.30	59.72	70.99	31.45	62.60	0.00
5	41.24	90.94	42.60	75.83	57.31	59.18	36.46	29.36	26.18	33.15	37.64	50.24	42.83	57.69	59.46	60.31
6	87.52	98.72	51.75	86.72	20.71	58.48	6.94	26.93	32.61	3.14	81.11	11.02	61.57	30.54	11.00	23.35
7	3.50	91.01	45.37	98.25	78.85	80.76	71.27	47.47	52.73	14.93	33.60	18.47	39.51	33.80	81.14	34.31
8	56.95	82.08	39.83	79.62	51.81	51.93	50.56	24.22	35.25	17.07	39.93	36.90	67.78	68.49	49.00	81.40
9	69.26	81.32	52.79	83.26	45.45	81.28	56.57	82.96	79.06	71.09	89.78	43.23	61.48	14.34	18.15	4.54

Etude de cas¶

Calcul des coordonnées moyennes¶

Récriture de la fonction `dessin()`¶

Représentation des chiffres moyens¶

Représentation des tracés sur un plan en 2D¶

sur données originales¶

sur données standardisées¶

Recherche des différentes manières d'écrire chaque chiffre¶

Importation des éléments de `scikit-learn` pour l'utilisation¶

Définition des fonctions¶

Chiffre 0¶

Chiffre 1¶

Chiffre 2¶

Chiffre 3¶

Chiffre 4¶

Chiffre 5¶

Chiffre 6¶

Chiffre 7¶

Chiffre 8¶

Chiffre 9¶

Conclusion¶

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8	chiffre
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8	chiffre
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

Etude de cas¶

Calcul des coordonnées moyennes¶

Récriture de la fonction dessin()¶

Représentation des chiffres moyens¶

Représentation des tracés sur un plan en 2D¶

sur données originales¶

sur données standardisées¶

Recherche des différentes manières d'écrire chaque chiffre¶

Importation des éléments de scikit-learn pour l'utilisation¶

Définition des fonctions¶

Chiffre 0¶

Chiffre 1¶

Chiffre 2¶

Chiffre 3¶

Chiffre 4¶

Chiffre 5¶

Chiffre 6¶

Chiffre 7¶

Chiffre 8¶

Chiffre 9¶

Conclusion¶

Récriture de la fonction `dessin()`¶

Importation des éléments de `scikit-learn` pour l'utilisation¶

	0	1	2	3	4	5	6	7	8	9	10	11	12	13	14	15	16
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9

	x1	y1	x2	y2	x3	y3	x4	y4	x5	y5	x6	y6	x7	y7	x8	y8	chiffre
0	88	92	2	99	16	66	94	37	70	0	0	24	42	65	100	100	8
1	80	100	18	98	60	66	100	29	42	0	0	23	42	61	56	98	8
2	0	94	9	57	20	19	7	0	20	36	70	68	100	100	18	92	8
3	95	82	71	100	27	77	77	73	100	80	93	42	56	13	0	0	9
4	68	100	6	88	47	75	87	82	85	56	100	29	75	6	0	0	9