import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn
# Use seaborn's "white" style for the plots drawn below.
seaborn.set_style("white")
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
# Load the tab-separated Iris data set from its public URL.
IRIS_URL = "https://fxjollois.github.io/donnees/Iris.txt"
iris = pandas.read_csv(IRIS_URL, sep="\t")

# Preview the first rows.
iris.head()
| | Sepal Length | Sepal Width | Petal Length | Petal Width | Species |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Keep only the four measurement columns: the species label is dropped
# before clustering.
iris2 = iris.drop(columns="Species")

# Preview the first rows.
iris2.head()
| | Sepal Length | Sepal Width | Petal Length | Petal Width |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
# Standardise the measurements, then fit a 3-cluster k-means model on them.
# No random_state is set, so the numbering of the clusters may vary between runs.
iris_scaled = scale(iris2)
kmeans = KMeans(n_clusters=3)
kmeans.fit(iris_scaled)
KMeans(n_clusters=3)
# Number of observations assigned to each cluster.
cluster_labels = pandas.Series(kmeans.labels_)
cluster_labels.value_counts()
0    53
2    50
1    47
dtype: int64
# Cluster centroids. The model was fitted on scale(iris2), so these
# coordinates are in the scaled feature space, not the original units.
kmeans.cluster_centers_
array([[-0.05021989, -0.88337647,  0.34773781,  0.2815273 ],
       [ 1.13597027,  0.08842168,  0.99615451,  1.01752612],
       [-1.01457897,  0.85326268, -1.30498732, -1.25489349]])
# Attach each observation's cluster label as a "classe" column, then average
# the original (unscaled) measurements within each cluster.
iris_labelled = iris2.assign(classe=kmeans.labels_)
iris_labelled.groupby("classe").mean()
classe | Sepal Length | Sepal Width | Petal Length | Petal Width |
---|---|---|---|---|
0 | 5.801887 | 2.673585 | 4.369811 | 1.413208 |
1 | 6.780851 | 3.095745 | 5.510638 | 1.972340 |
2 | 5.006000 | 3.428000 | 1.462000 | 0.246000 |
# Elbow method: fit k-means for k = 1..10 on the scaled data and record the
# inertia of each fit.
k_values = range(1, 11)
iris_scaled = scale(iris2)  # loop-invariant, so computed once up front

inertia_values = []
for k in k_values:
    # NOTE: this rebinds the module-level `kmeans`; after the loop it refers
    # to the model fitted with k = 10.
    kmeans = KMeans(n_clusters=k, init="random", n_init=20).fit(iris_scaled)
    inertia_values.append(kmeans.inertia_)

# One row per k, ready for plotting.
inertia = pandas.DataFrame({"k": k_values, "inertia": inertia_values})

# Inertia curve, with the k = 2 and k = 3 points highlighted in red.
seaborn.lineplot(data=inertia, x="k", y="inertia")
plt.scatter(2, inertia.query('k == 2')["inertia"], c="red")
plt.scatter(3, inertia.query('k == 3')["inertia"], c="red")
plt.show()