import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")

from prince import CA


df = pandas.read_csv("USAccDeaths.csv", index_col = "Year")
df.columns.rename("Month", inplace = True)
df.index.rename("Year", inplace = True)
df


ca = CA(n_components = 5) # minimum du nombre de modalités - 1
ca.fit(df)

<prince.ca.CA at 0x10ca5cbb0>


print(ca.eigenvalues_)
print(ca.total_inertia_)
print(ca.percentage_of_variance_) # CHANGEMENT ICI

[3.76283456e-04 1.85670084e-04 7.57831853e-05 3.76894447e-05
 2.45791383e-05]
0.0007000053078490678
[53.75437181 26.52409655 10.82608723  5.38416556  3.51127885]


eig = pandas.DataFrame(
    { 
        "Dimension" : ["Dim" + str(x + 1) for x in range(5)],
        "% variance expliquée": numpy.round(ca.percentage_of_variance_, 4) * 100,
        "% variance expliquée cumulée": numpy.round(numpy.cumsum(ca.percentage_of_variance_), 4) * 100,
    }
)
eig


plt.figure(figsize=(16, 6))
g_eig = seaborn.barplot(x = "Dimension", 
                        y = "% variance expliquée",
                        palette = ["lightseagreen"],
                        data = eig)
g_eig.set(ylabel = "Variance expliquée (%)")
g_eig.figure.suptitle("Variance expliquée par dimension")

plt.show()

/usr/local/lib/python3.9/site-packages/seaborn/_core.py:1218: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
  if pd.api.types.is_categorical_dtype(vector):
/usr/local/lib/python3.9/site-packages/seaborn/_core.py:1218: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
  if pd.api.types.is_categorical_dtype(vector):
/usr/local/lib/python3.9/site-packages/seaborn/_core.py:1218: FutureWarning: is_categorical_dtype is deprecated and will be removed in a future version. Use isinstance(dtype, CategoricalDtype) instead
  if pd.api.types.is_categorical_dtype(vector):


df_row = pandas.DataFrame(ca.row_coordinates(df)).rename(columns = {0: "Dim1", 1: "Dim2"})
df_row


g_row = seaborn.lmplot(x = "Dim1", y = "Dim2", data = df_row, fit_reg = False, 
                       height = 4, aspect = 3)
g_row.fig.suptitle("Modalités en lignes")
for i in df_row.index:
    plt.text(df_row.loc[i].Dim1, df_row.loc[i].Dim2, i, size = "xx-large")
plt.show()


df_col = pandas.DataFrame(ca.column_coordinates(df)).rename(columns = {0: "Dim1", 1: "Dim2"})
df_col

/usr/local/lib/python3.9/site-packages/prince/ca.py:206: FutureWarning: is_sparse is deprecated and will be removed in a future version. Check `isinstance(dtype, pd.SparseDtype)` instead.
  is_sparse = X.dtypes.apply(pd.api.types.is_sparse).all()


g_col = seaborn.lmplot(x = "Dim1", y = "Dim2", data = df_col, fit_reg = False, 
                       height = 4, aspect = 3)
g_col.fig.suptitle("Modalités en colonnes")
for i in df_col.index:
    plt.text(df_col.loc[i].Dim1, df_col.loc[i].Dim2, i, size = "xx-large")
plt.show()


fig = plt.figure(figsize = (16,8))
plt.xlim(-.05, .05)
plt.ylim(-.05, .05)

for i in df_row.index:
    plt.scatter(df_row.loc[i].Dim1, df_row.loc[i].Dim2, alpha = .25, c = "black")
    plt.text(df_row.loc[i].Dim1, df_row.loc[i].Dim2, i, size = "xx-large", color = "darkblue", ha = "center")

for i in df_col.index:
    plt.scatter(df_col.loc[i].Dim1, df_col.loc[i].Dim2, alpha = .25, c = "black")
    plt.text(df_col.loc[i].Dim1, df_col.loc[i].Dim2, i, size = "xx-large", color = "darkred", ha = "center")

fig.suptitle("Représentation conjointe")
plt.show()


plt.figure(figsize = (16, 4))
plt.bar(df.index, ca.row_masses_, color = "darkblue")
plt.show()


plt.figure(figsize = (16, 4))
plt.bar(df.columns, ca.col_masses_, color = "darkred")
plt.show()


p_row = df.div(df.sum(axis=1), axis=0).round(4) * 100
p_row


fig = plt.figure(figsize = (16, 8))
base = numpy.zeros(6)
for m in p_row.columns:
    plt.bar(p_row.index, p_row[m], bottom = base, label = m)
    base = base + p_row[m]
plt.margins(0.05, 0.15)
plt.legend(ncol = 6, loc = 9)
plt.show()


p_col = df.div(df.sum(axis=0), axis=1).round(4) * 100
p_col


fig = plt.figure(figsize = (16, 8))
base = numpy.zeros(12)
for m in p_col.index:
    plt.bar(p_col.columns, p_col.loc[m], bottom = base, label = m)
    base = base + p_col.loc[m]
plt.margins(0.05, 0.15)
plt.legend(ncol = 6, loc = 9)
plt.show()

Month	January	February	March	April	May	June	July	August	September	October	November	December
Year
1973	9007	8106	8928	9137	10017	10826	11317	10744	9713	9938	9161	8927
1974	7750	6981	8038	8422	8714	9512	10120	9823	8743	9129	8710	8680
1975	8162	7306	8124	7870	9387	9556	10093	9620	8285	8466	8160	8034
1976	7717	7461	7767	7925	8623	8945	10078	9179	8037	8488	7874	8647
1977	7792	6957	7726	8106	8890	9299	10625	9302	8314	8850	8265	8796
1978	7836	6892	7791	8192	9115	9434	10484	9827	9110	9070	8633	9240

	Dimension	% variance expliquée	% variance expliquée cumulée
0	Dim1	5375.44	5375.44
1	Dim2	2652.41	8027.85
2	Dim3	1082.61	9110.46
3	Dim4	538.42	9648.87
4	Dim5	351.13	10000.00

	Dim1	Dim2	2	3	4
Year
1973	0.017407	-0.013220	0.004260	0.004290	0.006992
1974	-0.013806	-0.014447	0.011227	-0.001709	-0.006420
1975	0.030026	-0.000779	-0.009876	-0.003899	-0.005081
1976	0.003461	0.024649	0.009397	-0.004836	0.002169
1977	-0.011677	0.010810	-0.004949	0.011591	-0.002571
1978	-0.026633	-0.004477	-0.010296	-0.005888	0.004086

	Dim1	Dim2	2	3	4
Month
January	0.023375	0.003002	-0.004639	0.001429	0.001994
February	0.029632	0.024436	0.014401	-0.006230	0.005528
March	0.018686	-0.000595	0.006133	-0.003373	-0.006826
April	-0.006568	-0.003396	0.013345	0.004948	-0.001407
May	0.018069	0.002971	-0.018906	-0.002541	-0.001863
June	0.013585	-0.011738	-0.001110	0.005372	0.000419
July	-0.006641	0.014813	-0.006718	0.011894	0.000836
August	0.001480	-0.010823	0.000180	-0.007366	-0.002311
September	-0.014577	-0.019645	-0.003744	-0.005379	0.011479
October	-0.013734	-0.007586	0.008094	0.006472	0.002180
November	-0.015175	-0.012299	0.000979	-0.001664	-0.008809
December	-0.041467	0.024106	-0.002788	-0.006655	-0.000887

Month	January	February	March	April	May	June	July	August	September	October	November	December
Year
1973	7.78	7.00	7.71	7.89	8.65	9.35	9.77	9.28	8.39	8.58	7.91	7.71
1974	7.41	6.67	7.68	8.05	8.33	9.09	9.67	9.39	8.36	8.73	8.33	8.30
1975	7.92	7.09	7.88	7.64	9.11	9.27	9.79	9.33	8.04	8.21	7.92	7.80
1976	7.66	7.41	7.71	7.87	8.56	8.88	10.00	9.11	7.98	8.43	7.82	8.58
1977	7.57	6.76	7.51	7.88	8.64	9.03	10.32	9.04	8.08	8.60	8.03	8.55
1978	7.42	6.53	7.38	7.76	8.63	8.93	9.93	9.30	8.62	8.59	8.17	8.75

Analyse Factorielle des Correspondances Simples (AFC)¶

Mastère ESD - Introduction au Machine Learning¶

Librairies utilisées¶

Données utilisées¶

Calcul de AFC¶

Valeurs propres¶

Choix des facteurs¶

Représentation des lignes¶

Représentation des colonnes¶

Représentation simultanée¶

Importance de chaque modalité¶

AFC - Profils lignes¶

AFC - Profils colonnes¶

Month	January	February	March	April	May	June	July	August	September	October	November	December
Year
1973	18.66	18.55	18.46	18.40	18.30	18.80	18.04	18.37	18.61	18.42	18.03	17.06
1974	16.06	15.97	16.62	16.96	15.92	16.52	16.14	16.79	16.75	16.92	17.14	16.59
1975	16.91	16.72	16.79	15.85	17.15	16.60	16.09	16.45	15.87	15.69	16.06	15.35
1976	15.99	17.07	16.06	15.96	15.75	15.54	16.07	15.69	15.40	15.74	15.50	16.53
1977	16.14	15.92	15.97	16.33	16.24	16.15	16.94	15.90	15.93	16.41	16.27	16.81
1978	16.24	15.77	16.11	16.50	16.65	16.39	16.72	16.80	17.45	16.81	16.99	17.66