Consider we have such a data frame for clustering.
# df
dput(df)
structure(c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L), dim = c(9L, 11L), dimnames = list(c("1",
"2", "3", "4", "5", "6", "7", "8", "9"), c("C", "D", "E", "F",
"G", "H", "K", "L", "M", "N", "P")))
dist_matrix <- dist(df, method = "manhattan")
clust <- hclust(dist_matrix, method = "complete")
plot(clust)
We can get the following clustering results, where the italic letters are additional markers.
Please allow me to use pseudocode to illustrate the function I want to implement.
FUNC GET_RIGHT_NODE(D)
RETUEN E,F,G
FUNC GET_LEFT_NODE(ROOT)
RETUEN A,B,C
FUNC GET_LEAF(D)
RETURN 5,3,1,2,4
Related topic:
R : help to analyse cluster content in hierarchical clustering
The information you need is encoded in clust$merge
(see ?hclust
).
clust$merge
#> [,1] [,2]
#> [1,] -2 -4
#> [2,] -8 -9
#> [3,] -7 2
#> [4,] -1 1
#> [5,] -3 4
#> [6,] -6 3
#> [7,] -5 5
#> [8,] 6 7
Here, node 1 (the first row of clust$merge
) is formed by the leaves 2
and 4
, node 3 (the third row) is formed by leaf 7
and node 2, etc.
Your labels would correspond to clust$merge
as follows:
cbind(as.data.frame(clust$merge),
Label = c("G", "C", "B", "F", "E", "A", "D", "Root"))
#> V1 V2 Label
#> 1 -2 -4 G
#> 2 -8 -9 C
#> 3 -7 2 B
#> 4 -1 1 F
#> 5 -3 4 E
#> 6 -6 3 A
#> 7 -5 5 D
#> 8 6 7 Root
Functions to do what you asked for:
First the nodes to the left (right) of a specified node:
get_node <- function(cl, n, left = TRUE) {
m <- cl$merge
if (left) {
if (m[n, 1] > 0) n <- m[n, 1] else return(integer(0))
} else {
if (m[n, 2] > 0) n <- m[n, 2] else return(integer(0))
}
e <- environment()
out <- integer(n)
out[1] <- n
i <- 1L
f <- function(n) {
if (m[n, 1] > 0) {
e$i <- e$i + 1L
e$out[e$i] <- m[n, 1]
Recall(e$out[d$i])
}
if (m[n, 2] > 0) {
e$i <- e$i + 1L
e$out[e$i] <- m[n, 2]
Recall(e$out[e$i])
}
}
f(n)
out[1:i]
}
The leaves under a specified node:
get_leaf <- function(cl, n) {
m <- cl$merge
e <- environment()
i <- 0L
out <- integer(n + 1)
f <- function(n) {
if (m[n, 1] > 0) {
Recall(m[n, 1])
} else {
e$i <- e$i + 1L
e$out[e$i] <- -m[n, 1]
}
if (m[n, 2] > 0) {
Recall(m[n, 2])
} else {
e$i <- e$i + 1L
e$out[e$i] <- -m[n, 2]
}
}
f(n)
out[1:i]
}
Demonstrating:
get_node(clust, 7, FALSE) # get all nodes to the right of "D"
#> [1] 5 4 1
get_node(clust, 8) # get all nodes to the left of "Root"
#> [1] 6 3 2
get_leaf(clust, 7) # get all leaves under "D"
#> [1] 5 3 1 2 4
get_leaf(clust, 6) # get all leaves under "A"
#> [1] 6 7 8 9