Check that the following packages are available on your computer:
library(tidyverse)
library(igraph)
library(missSBM)
library(aricode)
library(corrplot)
theme_set(theme_bw())
We first load the data set from the missSBM package
data('frenchblog2007')
frenchblog2007
## IGRAPH 7b93b75 UN-- 196 1432 --
## + attr: name (v/c), party (v/c)
## + edges from 7b93b75 (vertex names):
## [1] jeunesverts.org/bordeaux-- bix.enix.org/
## [2] jeunesverts.org/bordeaux-- dominiquevoynet.net/blog
## [3] bix.enix.org/ -- www.arnaudcaron.net/
## [4] bix.enix.org/ -- dominiquevoynet.net/blog
## [5] bix.enix.org/ -- blogs.lesverts.fr/
## [6] bix.enix.org/ -- emilien.net/
## [7] bix.enix.org/ -- lipietz.net/blog.php3?id_breve=63
## [8] bix.enix.org/ -- democratiesansfrontiere.org/
## + ... omitted several edges
I start by ploting the degree distribution of the node, with good old R-graphics! Just to check that a few node are indeed highly connected (with a power law distribution). But it also seems that there is a large part of the node which have a decent number of neighbor (>=5), probably forming communities.
<- frenchblog2007 %>% degree_distribution %>% setNames(1:length(.))
prob_degrees <- as.numeric(names(prob_degrees))
degrees par(mfrow = c(1, 2))
barplot(prob_degrees, col = "cyan", las = 3)
## power law distribution for degrees higher than 10
<- setdiff(which(prob_degrees != 0), 1:9)
keep plot(log(degrees), log(prob_degrees), pch = 16)
lm(log(prob_degrees) ~ 1 + log(degrees), subset = keep) %>%
coefficients() %>% abline(col = "red", lwd = 2, lty = "dashed")
Indeed, when plotting the graph by adding labels associated with the party, one can exhibit interesting groups of node forming community largely composed by blogs from the same political orientation.
plot(frenchblog2007,
vertex.shape="none", vertex.label=V(frenchblog2007)$party,
vertex.label.color = "steel blue", vertex.label.font=1.5,
vertex.label.cex=.6, edge.color="gray70", edge.width = 1)
We expect that clustering algorithms based on similarities between node - when similarity corresponds to a high level of connectivity - to find a clustering carrying a part of the information carried by the political party. Indeed, the above plots showed that nodes from the same political party tend to be more connected together than with node from other parties. Of course, this is only a part of the complex structure of the blog network.
Use this function to extract a possible clustering of the nodes for the French blogosphere. Use the plot function for object with class communities
outputing from igraph::fastgreedy.community
. Compare this clustering with the political labels of the nodes (use for instance confusion tables with table
or adjusted Rand-Index with aricode::ARI
).
<- fastgreedy.community(frenchblog2007)
community_cl plot(community_cl, frenchblog2007, vertex.shape="none",
vertex.label=V(frenchblog2007)$party, vertex.label.color = "steel blue",
vertex.label.font=1.5, vertex.label.cex=.6)
plot_dendrogram(community_cl)
The performances are not incredible…
<- 1:10
nb_clusters <- map_dbl(nb_clusters, ~ARI(cut_at(community_cl,no = .), V(frenchblog2007)$party))
ARI <- map_dbl(nb_clusters, ~NID(cut_at(community_cl,no = .), V(frenchblog2007)$party))
NID
tibble(ARI = ARI, NID = NID, nb_clusters = nb_clusters) %>%
pivot_longer(-nb_clusters, values_to = "value", names_to = "measure") %>%
group_by(measure) %>%
ggplot() + aes(x = nb_clusters, y = value, color = measure) + geom_line() +
ggtitle("Hierarchical Clustering Performance with community measure of similarity")
Explore the results offered by other clustering methods for community detection (e.g. igraph::cluster_edge_betweenness
), or the ones obtained by a simple hierarchical clustering on dissimilarity measured on the adjacency matrix. Compare the clustering to the “ground-truth”. Comment.
<- cluster_edge_betweenness(frenchblog2007)
betweenness_cl
<- 1:25
nb_clusters <- map_dbl(nb_clusters, ~ARI(cut_at(betweenness_cl,no = .), V(frenchblog2007)$party))
ARI <- map_dbl(nb_clusters, ~NID(cut_at(betweenness_cl,no = .), V(frenchblog2007)$party))
NID
tibble(ARI = ARI, NID = NID, nb_clusters = nb_clusters) %>%
pivot_longer(-nb_clusters, values_to = "value", names_to = "measure") %>%
group_by(measure) %>%
ggplot() + aes(x = nb_clusters, y = value, color = measure) + geom_line() +
ggtitle("Hierarchical Clustering Performance with betweeness")
The gain is significative, which proves that there is more than simple community based structure in this network.
plot(betweenness_cl, frenchblog2007, vertex.shape="none",
vertex.label=V(frenchblog2007)$party, vertex.label.color = "steel blue",
vertex.label.font=1.5, vertex.label.cex=.6)
We first used the algorithm introduced by Ng, Jordan and Weiss (2002). Check the course/slides!
igraph::graph.laplacian
function.<- graph.laplacian(frenchblog2007, normalized = TRUE) L
<- eigen(L)
spectrum_L plot(spectrum_L$values, pch =".")
sum(spectrum_L$values <= 1e-16)
## [1] 3
As expected, there are 4 zero eigen values (the number of connected components). Indeed, we saw on the graph plots 3 isolated nodes plus one large group of connected nodes.
10 is too much. Consider only the first 5, and we keep only one vector associated to a zero eigen value:
<- ncol(spectrum_L$vectors)
n <- (n-5):n
cols <- spectrum_L$vectors[, cols]
U <- U / rowSums(U^2)
U data.frame(U %>% as_tibble(), party = V(frenchblog2007)$party) %>%
::ggscatmat(color = "party") GGally
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
<- function(igraph_obj, nb_clusters, normalized = TRUE, nstart = 50) {
my_spectral <- graph.laplacian(igraph_obj, normalized = normalized)
L <- ncol(L)
n <- eigen(L)
spectrum_L <- (n-nb_clusters):(n)
cols <- spectrum_L$vectors[, cols, drop = FALSE]
U <- U/rowSums(U^2)
U <- kmeans(U, centers = nb_clusters, nstart = nstart)$cl
clusters as.factor(clusters)
}
<- 1:20
nb_clusters <- map(nb_clusters, ~my_spectral(frenchblog2007, .))
spectral_res <- map_dbl(spectral_res, ~ARI(., as.factor(V(frenchblog2007)$party)))
ARI <- map_dbl(spectral_res, ~NID(., as.factor(V(frenchblog2007)$party)))
NID
tibble(ARI = ARI, NID = NID, nb_clusters = nb_clusters) %>%
pivot_longer(-nb_clusters, values_to = "value", names_to = "measure") %>%
group_by(measure) %>%
ggplot() + aes(x = nb_clusters, y = value, color = measure) + geom_line() +
ggtitle("Spectral Clustering Performance")
<- function(igraph_obj, nb_clusters, normalized = TRUE, nstart = 50) {
my_abs_spectral <- graph.laplacian(igraph_obj, normalized = normalized)
L <- ncol(L)
n <- eigen(diag(n) - L)
spectrum_L <- order(abs(spectrum_L$values), decreasing = TRUE)
abs_values <- abs_values[1:nb_clusters]
cols <- spectrum_L$vectors[, cols, drop = FALSE]
U <- kmeans(U, centers = nb_clusters, nstart = nstart)$cl
clusters as.factor(clusters)
}
<- 1:20
nb_clusters <- map(nb_clusters, ~my_abs_spectral(frenchblog2007, .))
spectral_res <- map_dbl(spectral_res, ~ARI(., as.factor(V(frenchblog2007)$party)))
ARI <- map_dbl(spectral_res, ~NID(., as.factor(V(frenchblog2007)$party)))
NID
tibble(ARI = ARI, NID = NID, nb_clusters = nb_clusters) %>%
pivot_longer(-nb_clusters, values_to = "value", names_to = "measure") %>%
group_by(measure) %>%
ggplot() + aes(x = nb_clusters, y = value, color = measure) + geom_line() +
ggtitle("Absolute Spectral Clustering Performance")
Last but not the least, a matrix plot after reordering of rows/columns according to the absolute spectral clsutering found
<- frenchblog2007 %>% as_adj(sparse = FALSE)
A <- spectral_res[[which.max(ARI)]]
cl corrplot(A[order(cl), order(cl)], method = "color", tl.pos = "n", cl.pos = "n")