library(readr)
library(DT)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.2
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(maps)
##
## Attaching package: 'maps'
##
## The following object is masked from 'package:purrr':
##
## map
library(ggplot2)
data = read_csv("103673.csv", show_col_types = FALSE)
On peut représenter les points de collecte de données sur une carte à l’aide du code suivant :
ggplot() +
geom_polygon(data = map_data("world"),
aes(x = long, y = lat, group = group),
fill = "lightgray") +
geom_point(data = data, aes(x = lon, y = lat), size = 1) +
theme_minimal()
On remarque que les concentrations sont très proches de 0, avec beaucoup de valeurs aberrantes/outliers. Il faut donc envisager de faire une transformation sur celles-ci pour pouvoir les étudier. Plusieurs possibilités :
data %>%
select(Acantharea:Trichodesmium) %>%
pivot_longer(everything()) %>%
ggplot(aes(x = value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free", ncol = 5) +
theme_minimal()
On utilise donc la fonction decostand()
du package
vegan
.
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.6-8
data_hellinger = decostand(data %>% select(Acantharea:Trichodesmium), method = "hellinger")
Les concentrations sont un peu mieux réparties.
data_hellinger %>%
pivot_longer(everything()) %>%
ggplot(aes(x = value)) +
geom_boxplot() +
facet_wrap(~name, scales = "free", ncol = 5) +
theme_minimal()
bulk_conc
et
snow_conc
en calculant \(\log(1+x)\) (car valeurs parfois égales Ã
0)Ces trois opérations sont réalisables en un code
data_sup = data %>%
select(temp:kd490) %>%
replace_na(as.list(colMeans(.,na.rm=T))) %>% # étape 1
mutate_at(vars(snow_conc, bulk_conc), list(log1p)) %>% # étape 2
transmute_all(list(scale)) # étape 3
data_epi = data_hellinger %>%
bind_cols(data_sup) %>%
mutate(layer = data$layer) %>%
filter(layer == "epi") %>%
select(-layer)
On ajoute donc les variables explicatives en variables
supplémentaires (cf paramètre quanti.sup
dans la fonction
PCA()
).
library(FactoMineR)
acp_epi = PCA(data_epi,
quanti.sup = 29:51,
graph = FALSE,
scale = FALSE)
Les deux premières composantes expliquent 39% de variance.
datatable(round(acp_epi$eig %>% head(10), 2), options = list(dom = 't'))
ggplot(data.frame(acp_epi$eig) %>% mutate(dim = row_number()),
aes(x = dim, y = percentage.of.variance)) +
geom_line() +
theme_minimal()
La représentation via la fonction plot()
est
difficilement exploitable.
plot(acp_epi, choix = "var", invisible = "quanti.sup")
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot(acp_epi, choix = "var", invisible = "var")
## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggplot(data.frame(acp_epi$ind$coord), aes(x = Dim.1, y = Dim.2)) +
geom_point() +
theme_minimal()
On choisit donc 3 classes.
cah_epi = hclust(dist(data_epi %>% select(Acantharea:Trichodesmium)), method = "ward.D2")
par(mar = c(1, 2, 2, 0) + .1)
plot(cah_epi, hang = -1, labels = FALSE)
km_epi = kmeans(data_epi %>% select(Acantharea:Trichodesmium), 3, iter.max = 30, nstart = 30)
Voici les 4 classes qu’on trouve :
datatable(t(round(km_epi$centers, 2)), options = list(dom = 'tp'))
data.frame(km_epi$centers) %>%
rownames_to_column("cluster") %>%
pivot_longer(!cluster) %>%
ggplot(aes(x = cluster, y = name, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "orange") +
theme_minimal() +
labs(x = "classe", y = "", fill = "")
bulk_conc
et
chla
ze
et epi
temp
et
strat
ggplot(data.frame(acp_epi$ind$coord) %>% mutate(classe = km_epi$cluster),
aes(x = Dim.1, y = Dim.2)) +
geom_point(aes(color = factor(classe)), alpha = .5) +
geom_segment(data = data.frame(acp_epi$quanti.sup$coord) %>% rownames_to_column("var"),
aes(x = 0, y = 0, xend = Dim.1, yend = Dim.2),
arrow = arrow(length = unit(0.5, "cm")), alpha = .5) +
geom_text(data = data.frame(acp_epi$quanti.sup$coord) %>% rownames_to_column("var"),
aes(label = var), size = 5) +
theme_minimal() +
labs(x = "", y = "", color = "Classe")
Difficile de bien distinguer sur ce graphique
ggplot() +
geom_polygon(data = map_data("world"),
aes(x = long, y = lat, group = group),
fill = "lightgray") +
geom_point(data = data %>% filter(layer == "epi") %>% mutate(classe = km_epi$cluster),
aes(x = lon, y = lat, color = factor(classe)), size = 1) +
theme_minimal() +
labs(x = "", y = "", color = "Classe")
ggplot(data = data %>% filter(layer == "epi") %>% mutate(classe = km_epi$cluster),
aes(x = lon, y = lat)) +
geom_point(aes(color = factor(classe)), size = 1) +
geom_polygon(data = map_data("world"),
aes(x = long, y = lat, group = group),
fill = "lightgray")+
facet_wrap(~ classe, ncol = 2) +
theme_minimal() +
labs(x = "", y = "", color = "Classe")
data_meso = data_hellinger %>%
bind_cols(data_sup) %>%
mutate(layer = data$layer) %>%
filter(layer == "mesosup") %>%
select(-layer)
On ajoute donc les variables explicatives en variables
supplémentaires (cf paramètre quanti.sup
dans la fonction
PCA()
).
library(FactoMineR)
acp_meso = PCA(data_meso,
quanti.sup = 29:51,
graph = FALSE,
scale = FALSE)
Les deux premières composantes expliquent 39% de variance.
datatable(round(acp_meso$eig %>% head(10), 2), options = list(dom = 't'))
ggplot(data.frame(acp_meso$eig) %>% mutate(dim = row_number()),
aes(x = dim, y = percentage.of.variance)) +
geom_line() +
theme_minimal()
La représentation via la fonction plot()
est
difficilement exploitable.
plot(acp_meso, choix = "var", invisible = "quanti.sup")
## Warning: ggrepel: 26 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
plot(acp_meso, choix = "var", invisible = "var")
## Warning: ggrepel: 17 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
ggplot(data.frame(acp_meso$ind$coord), aes(x = Dim.1, y = Dim.2)) +
geom_point() +
theme_minimal()
On choisit donc 3 classes.
cah_meso = hclust(dist(data_meso %>% select(Acantharea:Trichodesmium)), method = "ward.D2")
par(mar = c(1, 2, 2, 0) + .1)
plot(cah_meso, hang = -1, labels = FALSE)
km_meso = kmeans(data_meso %>% select(Acantharea:Trichodesmium), 3, iter.max = 30, nstart = 30)
Voici les 4 classes qu’on trouve :
datatable(t(round(km_meso$centers, 2)), options = list(dom = 'tp'))
data.frame(km_meso$centers) %>%
rownames_to_column("cluster") %>%
pivot_longer(!cluster) %>%
ggplot(aes(x = cluster, y = name, fill = value)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "orange") +
theme_minimal() +
labs(x = "classe", y = "", fill = "")
aou
et
strat
oxy_ml
ggplot(data.frame(acp_meso$ind$coord) %>% mutate(classe = km_meso$cluster),
aes(x = Dim.1, y = Dim.2)) +
geom_point(aes(color = factor(classe)), alpha = .5) +
geom_segment(data = data.frame(acp_meso$quanti.sup$coord) %>% rownames_to_column("var"),
aes(x = 0, y = 0, xend = Dim.1, yend = Dim.2),
arrow = arrow(length = unit(0.5, "cm")), alpha = .5) +
geom_text(data = data.frame(acp_meso$quanti.sup$coord) %>% rownames_to_column("var"),
aes(label = var), size = 5) +
theme_minimal() +
labs(x = "", y = "", color = "Classe")
Difficile de bien distinguer sur ce graphique
ggplot() +
geom_polygon(data = map_data("world"),
aes(x = long, y = lat, group = group),
fill = "lightgray") +
geom_point(data = data %>% filter(layer == "mesosup") %>% mutate(classe = km_meso$cluster),
aes(x = lon, y = lat, color = factor(classe)), size = 1) +
theme_minimal() +
labs(x = "", y = "", color = "Classe")
ggplot(data = data %>% filter(layer == "mesosup") %>% mutate(classe = km_meso$cluster),
aes(x = lon, y = lat)) +
geom_point(aes(color = factor(classe)), size = 1) +
geom_polygon(data = map_data("world"),
aes(x = long, y = lat, group = group),
fill = "lightgray")+
facet_wrap(~ classe, ncol = 2) +
theme_minimal() +
labs(x = "", y = "", color = "Classe")