The idea is to combine R packages ClustOfVar
and ggdendro
to give a visual summary of variable clustering.
When there are few columns in the data, the result is pretty good except that there are areas not covered(as circled in the chart below). Using mtcars
for example:
library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)
library(ClustOfVar)
library(ggdendro)
fit = hclustvar(X.quanti = mtcars)
labels = cutree(fit,k = 5)
labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))
p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)
df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))
p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
scale_y_continuous(expand=c(0,0))+
theme(axis.title=element_blank(),
axis.ticks=element_blank(),
axis.text=element_blank(),
legend.position="none")+coord_flip()+
geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))
When there are a large number of columns, another issue occurs. That is, the height of the color tiles part does not match the height the dendrogram.
library(ClustOfVar)
library(ggdendro)
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
fit = hclustvar(X.quanti = X)
labels = cutree(fit,k = 5)
labelx = data.frame(Names=names(labels),group = paste("Group",as.vector(labels)),num=as.vector(labels))
p1 = ggdendrogram(as.dendrogram(fit), rotate=TRUE)
df2<-data.frame(cluster=cutree(fit, k =5), states=factor(fit$labels,levels=fit$labels[fit$order]))
df3<-ddply(df2,.(cluster),summarise,pos=mean(as.numeric(states)))
p2 = ggplot(df2,aes(states,y=1,fill=factor(cluster)))+geom_tile()+
scale_y_continuous(expand=c(0,0))+
theme(axis.title=element_blank(),
axis.ticks=element_blank(),
axis.text=element_blank(),
legend.position="none")+coord_flip()+
geom_text(data=df3,aes(x=pos,label=cluster))
gp1<-ggplotGrob(p1)
gp2<-ggplotGrob(p2)
maxHeight = grid::unit.pmax(gp1$heights[2:5], gp2$heights[2:5])
gp1$heights[2:5] <- as.list(maxHeight)
gp2$heights[2:5] <- as.list(maxHeight)
grid.arrange(gp2, gp1, ncol=2,widths=c(1/6,5/6))
@Sandy Muspratt has actually provided an excellent solution to this IF we have the R upgraded to version 3.3.1. R: ggplot slight adjustment for clustering summary
But since I cannot change the version of the R deployed in the corporate server, I wonder if there is any other workaround that can align these two parts.
As far as I can tell, your code is not far wrong. The problem is that you are trying to match a continuous scale to a discrete scale when you merge the two plots. Also, it appears that ggdendrogram()
adds additional space to the y-axis.
library(plyr)
library(ggplot2)
library(gtable)
library(grid)
library(gridExtra)
library(ClustOfVar)
library(ggdendro)
# Data
X = data.frame(mtcars,mtcars,mtcars,mtcars,mtcars,mtcars)
# Cluster analysis
fit = hclustvar(X.quanti = X)
# Labels data frames
df2 <- data.frame(cluster = cutree(fit, k =5),
states = factor(fit$labels, levels = fit$labels[fit$order]))
df3 <- ddply(df2, .(cluster), summarise, pos = mean(as.numeric(states)))
# Dendrogram
# scale_x_continuous() for p1 should match scale_x_discrete() from p2
# scale_x_continuous strips off the labels. I grab them from df2
# scale _y_continuous() puts a little space between the labels and the dendrogram
p1 <- ggdendrogram(as.dendrogram(fit), rotate = TRUE) +
scale_x_continuous(expand = c(0, 0.5), labels = levels(df2$states), breaks = 1:length(df2$states)) +
scale_y_continuous(expand = c(0.02, 0))
# Tiles and labels
p2 <- ggplot(df2,aes(states, y = 1, fill = factor(cluster))) +
geom_tile() +
scale_y_continuous(expand = c(0, 0)) +
scale_x_discrete(expand = c(0, 0)) +
geom_text(data = df3, aes(x = pos, label = cluster)) +
coord_flip() +
theme(axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text = element_blank(),
legend.position = "none")
# Get the ggplot grobs
gp1 <- ggplotGrob(p1)
gp2 <- ggplotGrob(p2)
# Make sure the heights match
maxHeight <- unit.pmax(gp1$heights, gp2$heights)
gp1$heights <- as.list(maxHeight)
gp2$heights <- as.list(maxHeight)
# Combine the two plots
grid.arrange(gp2, gp1, ncol = 2,widths = c(1/6, 5/6))