rtreehclust

How to traverse hclust internal node in R


Consider we have such a data frame for clustering.

# df
dput(df)
structure(c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 
0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 
0L, 0L, 0L, 0L, 1L, 1L), dim = c(9L, 11L), dimnames = list(c("1", 
"2", "3", "4", "5", "6", "7", "8", "9"), c("C", "D", "E", "F", 
"G", "H", "K", "L", "M", "N", "P")))
dist_matrix <- dist(df, method = "manhattan")
clust <- hclust(dist_matrix, method = "complete")
plot(clust)

We can get the following clustering results, where the italic letters are additional markers.

enter image description here

Please allow me to use pseudocode to illustrate the function I want to implement.

  1. get left node and right node of a specific node.
FUNC GET_RIGHT_NODE(D)
RETUEN E,F,G
FUNC GET_LEFT_NODE(ROOT)
RETUEN A,B,C
  1. get leaf elemnt of specific node
FUNC GET_LEAF(D)
RETURN 5,3,1,2,4

Related topic:

R : help to analyse cluster content in hierarchical clustering

How do you print the rows of a hclust object in R?

Hierarchical Clustering Nearest Neighbors Algorithm in R


Solution

  • The information you need is encoded in clust$merge (see ?hclust).

    clust$merge
    #>      [,1] [,2]
    #> [1,]   -2   -4
    #> [2,]   -8   -9
    #> [3,]   -7    2
    #> [4,]   -1    1
    #> [5,]   -3    4
    #> [6,]   -6    3
    #> [7,]   -5    5
    #> [8,]    6    7
    

    Here, node 1 (the first row of clust$merge) is formed by the leaves 2 and 4, node 3 (the third row) is formed by leaf 7 and node 2, etc.

    Your labels would correspond to clust$merge as follows:

    cbind(as.data.frame(clust$merge),
          Label = c("G", "C", "B", "F", "E", "A", "D", "Root"))
    #>   V1 V2 Label
    #> 1 -2 -4     G
    #> 2 -8 -9     C
    #> 3 -7  2     B
    #> 4 -1  1     F
    #> 5 -3  4     E
    #> 6 -6  3     A
    #> 7 -5  5     D
    #> 8  6  7  Root
    

    Functions to do what you asked for:

    First the nodes to the left (right) of a specified node:

    get_node <- function(cl, n, left = TRUE) {
      m <- cl$merge
      
      if (left) {
        if (m[n, 1] > 0) n <- m[n, 1] else return(integer(0))
      } else {
        if (m[n, 2] > 0) n <- m[n, 2] else return(integer(0))
      }
      
      e <- environment()
      out <- integer(n)
      out[1] <- n
      i <- 1L
      
      f <- function(n) {
        if (m[n, 1] > 0) {
          e$i <- e$i + 1L
          e$out[e$i] <- m[n, 1]
          Recall(e$out[d$i])
        }
        
        if (m[n, 2] > 0) {
          e$i <- e$i + 1L
          e$out[e$i] <- m[n, 2]
          Recall(e$out[e$i])
        }
      }
      
      f(n)
      out[1:i]
    }
    

    The leaves under a specified node:

    get_leaf <- function(cl, n) {
      m <- cl$merge
      e <- environment()
      i <- 0L
      out <- integer(n + 1)
      
      f <- function(n) {
        if (m[n, 1] > 0) {
          Recall(m[n, 1])
        } else {
          e$i <- e$i + 1L
          e$out[e$i] <- -m[n, 1]
        }
        
        if (m[n, 2] > 0) {
          Recall(m[n, 2])
        } else {
          e$i <- e$i + 1L
          e$out[e$i] <- -m[n, 2]
        }
      }
      
      f(n)
      out[1:i]
    }
    

    Demonstrating:

    get_node(clust, 7, FALSE) # get all nodes to the right of "D"
    #> [1] 5 4 1
    get_node(clust, 8)        # get all nodes to the left of "Root"
    #> [1] 6 3 2
    get_leaf(clust, 7)        # get all leaves under "D"
    #> [1] 5 3 1 2 4
    get_leaf(clust, 6)        # get all leaves under "A"
    #> [1] 6 7 8 9