import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
seaborn.set_style("white")

from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans

# Load the Spotify tracks dataset from the course website (needs network access).
spotify = pandas.read_csv("https://fxjollois.github.io/donnees/spotify_dataset.csv")
spotify

# Columns kept for the clustering. The list is deliberately NOT named `vars`:
# that would shadow the Python builtin `vars()` for every later statement.
audio_features = ['danceability', 'energy', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence']
spotify_01 = spotify.filter(audio_features)
spotify_01

# First reduction step: summarise the tracks by 100 k-means classes.
# No random_state is given, so the partition can differ from one run to the next.
kmeans_100 = KMeans(n_clusters = 100, n_init = 30, init = "random").fit(spotify_01)

# One-class k-means: its inertia is used below as the reference (total) inertia.
kmeans_1 = KMeans(n_clusters = 1, n_init = 1).fit(spotify_01)

# Share of the reference inertia explained by the 100-class partition.
part_100 = 1 - kmeans_100.inertia_ / kmeans_1.inertia_

pct_100 = round(part_100 * 100, 2)
print("Part d'inertie expliquée par les 100 classes : " + str(pct_100) + "%")

Part d'inertie expliquée par les 100 classes : 89.15%

# Collect the 100 k-means centroids in a DataFrame that reuses the feature names.
centres_100 = pandas.DataFrame(
    data = kmeans_100.cluster_centers_,
    columns = spotify_01.columns,
)
centres_100

# Hierarchical clustering of the centroids. n_clusters = None together with
# distance_threshold = 0 asks for the full merge tree, and the merge distances
# are then available as `distances_`, which the dendrogram code reads.
hac = AgglomerativeClustering(n_clusters = None, distance_threshold = 0)
hac.fit(centres_100)

AgglomerativeClustering(distance_threshold=0, n_clusters=None)

AgglomerativeClustering(distance_threshold=0, n_clusters=None)

from scipy.cluster.hierarchy import dendrogram

def plot_dendrogram(model, **kwargs):
    """Draw the dendrogram of a fitted hierarchical clustering model.

    A SciPy linkage matrix is rebuilt from the model's merge tree and handed
    to ``scipy.cluster.hierarchy.dendrogram``.

    Parameters
    ----------
    model
        Fitted object exposing ``children_`` (one row of two child indices per
        merge), ``distances_`` (one distance per merge) and ``labels_`` (one
        entry per sample).
    **kwargs
        Forwarded unchanged to ``dendrogram``.

    Returns
    -------
    dict
        The value returned by ``dendrogram``, so callers can inspect what was
        drawn. Callers that ignore the result are unaffected.

    Raises
    ------
    AttributeError
        If ``model`` has no ``distances_`` attribute.
    """
    if not hasattr(model, "distances_"):
        raise AttributeError(
            "model has no 'distances_' attribute; fit it so that merge "
            "distances are computed before plotting the dendrogram"
        )

    children = model.children_
    n_samples = len(model.labels_)

    # Number of original samples found under each merge node.
    counts = numpy.zeros(children.shape[0])
    for i, merge in enumerate(children):
        # An index below n_samples is a leaf (one sample); any other index
        # refers to an earlier merge, whose count is already known.
        counts[i] = sum(
            1 if child_idx < n_samples else counts[child_idx - n_samples]
            for child_idx in merge
        )

    # Linkage rows are [child_a, child_b, distance, sample_count].
    linkage_matrix = numpy.column_stack([children, model.distances_, counts]).astype(float)

    return dendrogram(linkage_matrix, **kwargs)

# Dendrogram of the hierarchical clustering fitted on the 100 centroids.
plt.figure(figsize = (16, 8))
plt.title("CAH (Ward)")
plot_dendrogram(hac)
# Thin dashed guide lines at the two cut heights under consideration.
for cut_height in (4, 2.5):
    plt.axhline(y = cut_height, linewidth = .5, color = "dimgray", linestyle = "--")
plt.show()

# Compare k = 1..10 on the 100 centroids using inertia, R² and pseudo-F.
ks = range(1, 11)
n_obs = centres_100.shape[0]

inertia = []
for k in ks:
    kmeans = KMeans(n_clusters = k, init = "random", n_init = 20).fit(centres_100)
    inertia.append(kmeans.inertia_)

# R²: share of the one-class inertia (inertia[0]) removed by the k-class partition.
rsquare = [(inertia[0] - i) / inertia[0] for i in inertia]

# Pseudo-F (Calinski-Harabasz): (R² / (k - 1)) / ((1 - R²) / (n - k)).
# The between-class term has k - 1 degrees of freedom, hence no value for k = 1.
pseudof = [
    (r2 / (k - 1)) / ((1 - r2) / (n_obs - k)) if k > 1 else None
    for k, r2 in zip(ks, rsquare)
]

criteres = pandas.DataFrame({
    "k": ks,
    "inertia": inertia,
    "rsquare": rsquare,
    "pseudof": pseudof,
})

# One panel per criterion, each with its own y scale; dashed lines mark k = 2 and k = 4.
g = seaborn.FacetGrid(data = criteres.melt(id_vars = "k"), col = "variable", sharey = False,
                      height = 5, aspect = 1)
g.map_dataframe(seaborn.lineplot, x = "k", y = "value")
for k_mark in (2, 4):
    g.map(plt.axvline, x = k_mark, linewidth = .5, color = "dimgray", linestyle = "--")
g.add_legend()
plt.show()

# Final partition: 4-class k-means on all tracks, seeded so that it is reproducible.
kmeans_4 = KMeans(
    n_clusters = 4,
    init = "random",
    n_init = 30,
    random_state = 123456,
).fit(spotify_01)

# Share of the one-class (reference) inertia explained by the 4 classes.
part_4 = 1 - kmeans_4.inertia_ / kmeans_1.inertia_

pct_4 = round(part_4 * 100, 2)
print("Part d'inertie expliquée par les 4 classes : " + str(pct_4) + "%")

Part d'inertie expliquée par les 4 classes : 54.11%

# Centroids of the 4 classes, one row per class, rounded to 2 decimals for display.
centres_4 = pandas.DataFrame(kmeans_4.cluster_centers_, columns = spotify_01.columns)
centres_4.round(2)

# Number of tracks assigned to each class.
labels_4 = pandas.Series(kmeans_4.labels_)
labels_4.value_counts()

3    21712
0    12042
1     3775
2     3570
Name: count, dtype: int64

# Box plots of every clustering feature by class, one facet per feature.
# Class labels are turned into strings, the rows are sorted by class, and the
# table is melted to long form ("variable" / "value") for faceting.
boxplot_data = (
    spotify_01
    .assign(classe = list(map(str, kmeans_4.labels_)))
    .sort_values(by = ["classe"])
    .melt(id_vars = "classe")
)
g = seaborn.catplot(
    data = boxplot_data,
    kind = "box",
    x = "value", y = "classe",
    col = "variable", col_wrap = 2, sharex = False,
    height = 2, aspect = 5,
)
g.set_titles(col_template = "{col_name}", fontweight = "bold", size = 24)
g.set_xticklabels(size = 18)
g.tight_layout()
g.set_axis_labels(x_var = "")
plt.show()

# Project the tracks on the first two principal components and colour them by class.
pca = PCA(n_components = 2).fit(spotify_01)
coords_2d = pca.transform(spotify_01)
spotify_pca_row = pandas.DataFrame(coords_2d, columns = ["Dim1", "Dim2"])

pca_with_class = spotify_pca_row.assign(classe = kmeans_4.labels_)
# fit_reg = False: scatter only, no regression line.
g_pca = seaborn.lmplot(
    data = pca_with_class,
    x = "Dim1", y = "Dim2",
    hue = "classe",
    fit_reg = False,
    height = 6, aspect = 2,
)
plt.show()

# Coordinates of each variable on the principal components: one row per
# variable, one column per component (transpose of pca.components_).
# NOTE(review): these are the raw component coefficients, not variable/component
# correlations - confirm this is what the "correlation circle" should show.
coordvar = pca.components_.T
coordvar_df = pandas.DataFrame(
    coordvar,
    # Column names follow the number of fitted components instead of a fixed 2.
    columns = ['PC' + str(i + 1) for i in range(coordvar.shape[1])],
    index = spotify_01.columns,
)

fig, axes = plt.subplots(figsize = (10, 10))
fig.suptitle("Cercle des corrélations")
axes.set_xlim(-1, 1)
axes.set_ylim(-1, 1)
axes.axvline(x = 0, color = 'lightgray', linestyle = '--', linewidth = 1)
axes.axhline(y = 0, color = 'lightgray', linestyle = '--', linewidth = 1)

# One label and one dashed segment from the origin per variable. Iterating over
# the rows keeps the plot in sync with the feature list (no fixed count of 7).
for row in coordvar_df.itertuples():
    axes.text(row.PC1, row.PC2, row.Index, size = 25)
    axes.plot([0, row.PC1], [0, row.PC2], color = "gray", linestyle = 'dashed')

# Unit circle for reference.
plt.gca().add_artist(plt.Circle((0,0),1,color='blue',fill=False))

plt.show()

# Attach the class of each track to the full dataset (all original columns kept).
spotify_cl = spotify.assign(classe = kmeans_4.labels_)

# Track counts per class, broken down in turn by mode, popularity and decade.
for hue_var in ("mode", "popularity", "decade"):
    plt.figure(figsize = (16,8))
    seaborn.countplot(data = spotify_cl, x = "classe", hue = hue_var, palette = "Set1")
    plt.show()

# Distribution of loudness, then tempo, within each class.
for y_var in ("loudness", "tempo"):
    plt.figure(figsize = (16,8))
    seaborn.boxplot(data = spotify_cl, x = "classe", y = y_var)
    plt.show()

# Track duration (milliseconds) per class, drawn on a logarithmic y axis.
f, ax = plt.subplots(figsize=(16, 8))
ax.set_yscale("log")
# `ax` is the current axes at this point, so naming it explicitly draws on the same axes.
seaborn.boxplot(data = spotify_cl, x = "classe", y = "duration_ms", ax = ax)
plt.show()

# Same information as individual points, with the duration converted to minutes.
spotify_minutes = spotify_cl.assign(duration_min = spotify_cl["duration_ms"] / 1000 / 60)
plt.figure(figsize = (16,8))
seaborn.stripplot(data = spotify_minutes, x = "classe", y = "duration_min")
plt.show()

# Tracks from the '10s decade with popularity == 1, listed class by class
# (track title and artist only).
popular_10s = spotify_cl.query("decade == '10s'").query("popularity == 1")

popular_10s.query("classe == 0").filter(["track", "artist"])

popular_10s.query("classe == 1").filter(["track", "artist"])

popular_10s.query("classe == 2").filter(["track", "artist"])

popular_10s.query("classe == 3").filter(["track", "artist"])

	track	artist	uri	danceability	energy	key	loudness	mode	speechiness	acousticness	instrumentalness	liveness	valence	tempo	duration_ms	time_signature	chorus_hit	sections	popularity	decade
0	Jealous Kind Of Fella	Garland Green	spotify:track:1dtKN6wwlolkM8XZy2y9C1	0.417	0.620	3	-7.727	1	0.0403	0.4900	0.000000	0.0779	0.8450	185.655	173533	3	32.94975	9	1	60s
1	Initials B.B.	Serge Gainsbourg	spotify:track:5hjsmSnUefdUqzsDogisiX	0.498	0.505	3	-12.475	1	0.0337	0.0180	0.107000	0.1760	0.7970	101.801	213613	4	48.82510	10	0	60s
2	Melody Twist	Lord Melody	spotify:track:6uk8tI6pwxxdVTNlNOJeJh	0.657	0.649	5	-13.392	1	0.0380	0.8460	0.000004	0.1190	0.9080	115.940	223960	4	37.22663	12	0	60s
3	Mi Bomba Sonó	Celia Cruz	spotify:track:7aNjMJ05FvUXACPWZ7yJmv	0.590	0.545	7	-12.058	0	0.1040	0.7060	0.024600	0.0610	0.9670	105.592	157907	4	24.75484	8	0	60s
4	Uravu Solla	P. Susheela	spotify:track:1rQ0clvgkzWr001POOPJWx	0.515	0.765	11	-3.515	0	0.1240	0.8570	0.000872	0.2130	0.9060	114.617	245600	4	21.79874	14	0	60s
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
41094	Lotus Flowers	Yolta	spotify:track:4t1TljQWJ6ZuoSY67zVvBI	0.172	0.358	9	-14.430	1	0.0342	0.8860	0.966000	0.3140	0.0361	72.272	150857	4	24.30824	7	0	10s
41095	Calling My Spirit	Kodak Black	spotify:track:2MShy1GSSgbmGUxADNIao5	0.910	0.366	1	-9.954	1	0.0941	0.0996	0.000000	0.2610	0.7400	119.985	152000	4	32.53856	8	1	10s
41096	Teenage Dream	Katy Perry	spotify:track:55qBw1900pZKfXJ6Q9A2Lc	0.719	0.804	10	-4.581	1	0.0355	0.0132	0.000003	0.1390	0.6050	119.999	227760	4	20.73371	7	1	10s
41097	Stormy Weather	Oscar Peterson	spotify:track:4o9npmYHrOF1rUxxTVH8h4	0.600	0.177	7	-16.070	1	0.0561	0.9890	0.868000	0.1490	0.5600	120.030	213387	4	21.65301	14	0	10s
41098	Dust	Hans Zimmer	spotify:track:2khIaVUkbMmDHB596lyMG3	0.121	0.123	4	-23.025	0	0.0443	0.9640	0.696000	0.1030	0.0297	95.182	341396	4	71.05343	15	0	10s

	danceability	energy	speechiness	acousticness	instrumentalness	liveness	valence
0	0.417	0.620	0.0403	0.4900	0.000000	0.0779	0.8450
1	0.498	0.505	0.0337	0.0180	0.107000	0.1760	0.7970
2	0.657	0.649	0.0380	0.8460	0.000004	0.1190	0.9080
3	0.590	0.545	0.1040	0.7060	0.024600	0.0610	0.9670
4	0.515	0.765	0.1240	0.8570	0.000872	0.2130	0.9060
...	...	...	...	...	...	...	...
41094	0.172	0.358	0.0342	0.8860	0.966000	0.3140	0.0361
41095	0.910	0.366	0.0941	0.0996	0.000000	0.2610	0.7400
41096	0.719	0.804	0.0355	0.0132	0.000003	0.1390	0.6050
41097	0.600	0.177	0.0561	0.9890	0.868000	0.1490	0.5600
41098	0.121	0.123	0.0443	0.9640	0.696000	0.1030	0.0297

	danceability	energy	speechiness	acousticness	instrumentalness	liveness	valence
0	0.514304	0.391603	0.145796	0.809552	0.026704	0.769572	0.614737
1	0.689409	0.412183	0.056003	0.566149	0.009662	0.125540	0.802655
2	0.747460	0.473653	0.353398	0.106073	0.006481	0.148948	0.331603
3	0.592500	0.498425	0.047377	0.442836	0.830558	0.147336	0.795416
4	0.578971	0.346405	0.052496	0.539118	0.792518	0.134393	0.359687
...	...	...	...	...	...	...	...
95	0.553835	0.669117	0.053761	0.067242	0.012244	0.146653	0.254909
96	0.660349	0.858944	0.060838	0.068495	0.010545	0.319502	0.848065
97	0.619595	0.288956	0.062491	0.859387	0.017568	0.147778	0.767506
98	0.587148	0.425345	0.057358	0.771470	0.445503	0.144969	0.745464
99	0.461136	0.323450	0.051041	0.824657	0.453120	0.161951	0.335068

	danceability	energy	speechiness	acousticness	instrumentalness	liveness	valence
0	0.51	0.39	0.07	0.70	0.02	0.21	0.50
1	0.37	0.24	0.05	0.86	0.81	0.16	0.28
2	0.48	0.73	0.07	0.11	0.74	0.21	0.46
3	0.60	0.72	0.08	0.13	0.02	0.20	0.62

	track	artist
34713	Break Up In The End	Cole Swindell
34731	Love Don't Run	Steve Holy
34734	You Should See Me In A Crown	Billie Eilish
34768	Mama's Song	Carrie Underwood
34773	Come Join The Murder	The White Buffalo & The Forest Rangers
...	...	...
41033	Noticed	Lil Mosey
41044	Window Seat	Erykah Badu
41045	Skyfall	Adele
41046	10 Freaky Girls	Metro Boomin Featuring 21 Savage
41056	Te Bote	Casper Magico, Nio Garcia, Darell, Nicky Jam, ...

Séance 8 - correction

Importation des données dans python

Création d'un sous-ensemble

Réduction des données à 100 classes avec $k$-means

Part d'inertie expliquée avec 100 classes

Centres des 100 classes

Choix du nombre de classes

Dendrogramme

Et avec $r^2$ et $PseudoF$

Conclusion

Calcul de la partition avec $k$-means

Part d'inertie expliquée avec 4 classes

Description des classes

Représentation sur le plan factoriel

Et les autres variables

Mode

Popularité

Décennie

Niveau sonore

Tempo

Durée

Représentation de la durée en minutes pour chaque classe

Pour les connaisseurs

	track	artist
35094	everything i wanted	Billie Eilish
37228	What Are You So Afraid Of	XXXTENTACION
37543	Jonestown (Interlude)	Post Malone
38450	I'm Not The Only One	Sam Smith
40242	whoa (mind in awe)	XXXTENTACION

	track	artist
35910	Magic	Coldplay
35912	Shadow Days	John Mayer
36762	Rap Saved Me	21 Savage, Offset & Metro Boomin Featuring Quavo
36995	Scary Monsters And Nice Sprites	Skrillex
37085	Major Minus	Coldplay
38344	Better	Khalid
38444	Sail	AWOLNATION
40377	Animals	Martin Garrix
40833	Out The Speakers	A-Trak + Milo & Otis Featuring Rich Kidz
40940	Get Low	Dillon Francis & DJ Snake

	track	artist
34703	Wild Things	Alessia Cara
34705	Love Someone	Lukas Graham
34707	Juju On That Beat (TZ Anthem)	Zay Hilfigerrr & Zayion McCall
34708	Here's To Never Growing Up	Avril Lavigne
34710	Helluva Night	Ludacris
...	...	...
41091	Tear In My Heart	twenty one pilots
41092	Sweater Weather	The Neighbourhood
41093	Untouchable	YoungBoy Never Broke Again
41095	Calling My Spirit	Kodak Black
41096	Teenage Dream	Katy Perry