import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")

from prince import CA


# Load the contingency table from CSV, using the "Year" column as the row
# index, then label both axes (rows = "Year", columns = "Month").
df = pandas.read_csv("USAccDeaths.csv", index_col="Year")
df.columns.name = "Month"
df.index.name = "Year"
df


# Fit a correspondence analysis with 6 components on the contingency table.
# NOTE(review): the recorded output shows a sixth eigenvalue of ~4e-33, so
# the last component appears to carry no inertia - confirm whether 5
# components would be enough before changing n_components (the eigenvalue
# table built afterwards assumes 6 dimensions).
# The stray `CA(n_components=6)` line that followed was the pasted repr of
# the fitted estimator; it only built and discarded a second, unused CA
# object, so it has been dropped.
ca = CA(n_components=6)
ca.fit(df)


# Print the eigenvalues, the total inertia, and the share of inertia
# explained by each dimension.
print(ca.eigenvalues_)
print(ca.total_inertia_)
print(ca.explained_inertia_)

# Output recorded from a previous run. It was pasted into the script as bare
# literal expressions (no-op statements); it is kept here as a comment.
# [0.00037628345587226565, 0.0001856700837173522, 7.578318526200408e-05, 3.768944468224173e-05, 2.457913831520349e-05, 4.207611728329211e-33]
# 0.0007000053078490677
# [0.5375437181019181, 0.2652409655119153, 0.10826087232804832, 0.053841655569800584, 0.03511278848831693, 6.010828319656741e-30]


# Summary table of the inertia carried by each dimension: eigenvalue,
# percentage of explained inertia, and cumulative percentage.
# - The dimension labels are derived from the number of eigenvalues instead
#   of a hard-coded 6, so the table stays consistent with whatever
#   n_components the CA was fitted with.
# - Values are scaled to percent *before* rounding to 2 decimals, which
#   avoids floating-point noise such as 53.74999999999999.
# - numpy.multiply / numpy.cumsum turn the inputs into arrays first, so a
#   plain Python list is scaled element-wise rather than repeated.
eig = pandas.DataFrame(
    {
        "Dimension": ["Dim" + str(k + 1) for k in range(len(ca.eigenvalues_))],
        "Valeur propre": ca.eigenvalues_,
        "% variance expliquée": numpy.round(
            numpy.multiply(ca.explained_inertia_, 100), 2
        ),
        "% variance expliquée cumulée": numpy.round(
            numpy.cumsum(ca.explained_inertia_) * 100, 2
        ),
    }
)
eig


# Scree plot: percentage of explained inertia per dimension.
plt.figure(figsize=(16, 6))
# A single uniform bar colour is requested through `color`. The previous
# one-element `palette` list had no `hue` to map onto, which seaborn treats
# as deprecated usage (and warns that the palette has to be cycled).
g_eig = seaborn.barplot(
    x="Dimension",
    y="% variance expliquée",
    color="lightseagreen",
    data=eig,
)
g_eig.set(ylabel="Variance expliquée (%)")
g_eig.figure.suptitle("Variance expliquée par dimension")

plt.show()


# Row coordinates from the fitted CA; the first two components are renamed
# "Dim1" and "Dim2", the remaining columns keep their integer labels.
df_row = pandas.DataFrame(ca.row_coordinates(df))
df_row = df_row.rename(columns={0: "Dim1", 1: "Dim2"})
df_row


# Scatter the rows on the (Dim1, Dim2) plane, without a regression line,
# and write each row's index label next to its point.
g_row = seaborn.lmplot(
    data=df_row,
    x="Dim1",
    y="Dim2",
    fit_reg=False,
    height=4,
    aspect=3,
)
g_row.fig.suptitle("Modalités en lignes")
for label, x_pos, y_pos in zip(df_row.index, df_row["Dim1"], df_row["Dim2"]):
    plt.text(x_pos, y_pos, label, size="xx-large")
plt.show()


# Column coordinates from the fitted CA; the first two components are
# renamed "Dim1" and "Dim2", the remaining columns keep their integer labels.
df_col = pandas.DataFrame(ca.column_coordinates(df))
df_col = df_col.rename(columns={0: "Dim1", 1: "Dim2"})
df_col


# Scatter the columns on the (Dim1, Dim2) plane, without a regression line,
# and write each column's label next to its point.
g_col = seaborn.lmplot(
    data=df_col,
    x="Dim1",
    y="Dim2",
    fit_reg=False,
    height=4,
    aspect=3,
)
g_col.fig.suptitle("Modalités en colonnes")
for label, x_pos, y_pos in zip(df_col.index, df_col["Dim1"], df_col["Dim2"]):
    plt.text(x_pos, y_pos, label, size="xx-large")
plt.show()


# Plot the coordinates with the CA object's own plotting helper on a 16x8
# figure. The trailing semicolon is the notebook idiom for hiding the
# returned object's repr.
ca.plot_coordinates(df, figsize=(16, 8));


# Joint display of rows and columns on the (Dim1, Dim2) plane, with both
# axes clamped to [-0.05, 0.05].
fig = plt.figure(figsize=(16, 8))
plt.xlim(-.05, .05)
plt.ylim(-.05, .05)

# Rows are labelled in dark blue, columns in dark red; every point is drawn
# as a faint black marker with its label centred on it. Rows are drawn
# first, then columns.
for coords, label_color in ((df_row, "darkblue"), (df_col, "darkred")):
    for label, x_pos, y_pos in zip(coords.index, coords["Dim1"], coords["Dim2"]):
        plt.scatter(x_pos, y_pos, alpha=.25, c="black")
        plt.text(x_pos, y_pos, label, size="xx-large", color=label_color, ha="center")

fig.suptitle("Représentation conjointe")
plt.show()


# Bar chart of the CA row masses, one bar per row of the table.
plt.figure(figsize=(16, 4)).gca().bar(df.index, ca.row_masses_, color="darkblue")
plt.show()


# Bar chart of the CA column masses, one bar per column of the table.
plt.figure(figsize=(16, 4)).gca().bar(df.columns, ca.col_masses_, color="darkred")
plt.show()


# Row profiles: each cell divided by its row total, rounded to 4 decimals
# and then expressed as a percentage.
p_row = df.div(df.sum(axis="columns"), axis="index")
p_row = p_row.round(4) * 100
p_row


# Stacked bar chart of the row profiles: one bar per row of p_row, with one
# stacked segment per column.
fig = plt.figure(figsize=(16, 8))
# Running top of each stack. It is sized from the table itself rather than a
# hard-coded 6, so the plot keeps working if the number of rows changes.
base = numpy.zeros(len(p_row.index))
for m in p_row.columns:
    plt.bar(p_row.index, p_row[m], bottom=base, label=m)
    base = base + p_row[m]
# Extra vertical margin, presumably to leave room for the legend above the bars.
plt.margins(0.05, 0.15)
# "upper center" is the named form of the numeric location code 9.
plt.legend(ncol=6, loc="upper center")
plt.show()


# Column profiles: each cell divided by its column total, rounded to
# 4 decimals and then expressed as a percentage.
p_col = df.div(df.sum(axis="index"), axis="columns")
p_col = p_col.round(4) * 100
p_col


# Stacked bar chart of the column profiles: one bar per column of p_col,
# with one stacked segment per row.
fig = plt.figure(figsize=(16, 8))
# Running top of each stack. It is sized from the table itself rather than a
# hard-coded 12, so the plot keeps working if the number of columns changes.
base = numpy.zeros(len(p_col.columns))
for m in p_col.index:
    plt.bar(p_col.columns, p_col.loc[m], bottom=base, label=m)
    base = base + p_col.loc[m]
# Extra vertical margin, presumably to leave room for the legend above the bars.
plt.margins(0.05, 0.15)
# "upper center" is the named form of the numeric location code 9.
plt.legend(ncol=6, loc="upper center")
plt.show()

Month	January	February	March	April	May	June	July	August	September	October	November	December
Year
1973	9007	8106	8928	9137	10017	10826	11317	10744	9713	9938	9161	8927
1974	7750	6981	8038	8422	8714	9512	10120	9823	8743	9129	8710	8680
1975	8162	7306	8124	7870	9387	9556	10093	9620	8285	8466	8160	8034
1976	7717	7461	7767	7925	8623	8945	10078	9179	8037	8488	7874	8647
1977	7792	6957	7726	8106	8890	9299	10625	9302	8314	8850	8265	8796
1978	7836	6892	7791	8192	9115	9434	10484	9827	9110	9070	8633	9240

	Dimension	Valeur propre	% variance expliquée	% variance expliquée cumulée
0	Dim1	3.762835e-04	53.75	53.75
1	Dim2	1.856701e-04	26.52	80.28
2	Dim3	7.578319e-05	10.83	91.10
3	Dim4	3.768944e-05	5.38	96.49
4	Dim5	2.457914e-05	3.51	100.00
5	Dim6	4.207612e-33	0.00	100.00

	Dim1	Dim2	2	3	4	5
1973	0.017407	-0.013220	0.004260	0.004290	0.006992	-0.485789
1974	-0.013806	-0.014447	0.011227	-0.001709	-0.006420	-0.485789
1975	0.030026	-0.000779	-0.009876	-0.003899	-0.005081	-0.485789
1976	0.003461	0.024649	0.009397	-0.004836	0.002169	-0.485789
1977	-0.011677	0.010810	-0.004949	0.011591	-0.002571	-0.485789
1978	-0.026633	-0.004477	-0.010296	-0.005888	0.004086	-0.485789

	Dim1	Dim2	2	3	4	5
January	0.023375	0.003002	-0.004639	0.001429	0.001994	1.0
February	0.029632	0.024436	0.014401	-0.006230	0.005528	1.0
March	0.018686	-0.000595	0.006133	-0.003373	-0.006826	1.0
April	-0.006568	-0.003396	0.013345	0.004948	-0.001407	1.0
May	0.018069	0.002971	-0.018906	-0.002541	-0.001863	1.0
June	0.013585	-0.011738	-0.001110	0.005372	0.000419	1.0
July	-0.006641	0.014813	-0.006718	0.011894	0.000836	1.0
August	0.001480	-0.010823	0.000180	-0.007366	-0.002311	1.0
September	-0.014577	-0.019645	-0.003744	-0.005379	0.011479	1.0
October	-0.013734	-0.007586	0.008094	0.006472	0.002180	1.0
November	-0.015175	-0.012299	0.000979	-0.001664	-0.008809	1.0
December	-0.041467	0.024106	-0.002788	-0.006655	-0.000887	1.0

Month	January	February	March	April	May	June	July	August	September	October	November	December
Year
1973	7.78	7.00	7.71	7.89	8.65	9.35	9.77	9.28	8.39	8.58	7.91	7.71
1974	7.41	6.67	7.68	8.05	8.33	9.09	9.67	9.39	8.36	8.73	8.33	8.30
1975	7.92	7.09	7.88	7.64	9.11	9.27	9.79	9.33	8.04	8.21	7.92	7.80
1976	7.66	7.41	7.71	7.87	8.56	8.88	10.00	9.11	7.98	8.43	7.82	8.58
1977	7.57	6.76	7.51	7.88	8.64	9.03	10.32	9.04	8.08	8.60	8.03	8.55
1978	7.42	6.53	7.38	7.76	8.63	8.93	9.93	9.30	8.62	8.59	8.17	8.75

Analyse Factorielle des Correspondances Simples (AFC)

Mastère ESD - Introduction au Machine Learning

Librairies utilisées

Données utilisées

Calcul de l'AFC

Valeurs propres

Choix des facteurs

Représentation des lignes

Représentation des colonnes

Représentation simultanée

Importance de chaque modalité

AFC - Profils lignes

AFC - Profils colonnes

Month	January	February	March	April	May	June	July	August	September	October	November	December
Year
1973	18.66	18.55	18.46	18.40	18.30	18.80	18.04	18.37	18.61	18.42	18.03	17.06
1974	16.06	15.97	16.62	16.96	15.92	16.52	16.14	16.79	16.75	16.92	17.14	16.59
1975	16.91	16.72	16.79	15.85	17.15	16.60	16.09	16.45	15.87	15.69	16.06	15.35
1976	15.99	17.07	16.06	15.96	15.75	15.54	16.07	15.69	15.40	15.74	15.50	16.53
1977	16.14	15.92	15.97	16.33	16.24	16.15	16.94	15.90	15.93	16.41	16.27	16.81
1978	16.24	15.77	16.11	16.50	16.65	16.39	16.72	16.80	17.45	16.81	16.99	17.66