Tags: r, hierarchical

How to turn a wide data frame of taxonomic data in R into a hierarchical data structure?


I have a dataframe like this:

# Example input: one row per family, giving its full lineage (kingdom down
# to family) and the number of individuals observed.
wide_df <- data.frame(
  kingdom = rep(c("Animalia", "Plantae"), each = 2),
  phylum = rep(c("Chordata", "Angiosperms"), each = 2),
  class = rep(c("Mammalia", "Dicotyledons"), each = 2),
  order = c(rep("Carnivora", 2), "Rosales", "Solanales"),
  family = c("Felidae", "Canidae", "Rosaceae", "Solanaceae"),
  count = c(2, 3, 1, 4)
)

> wide_df
   kingdom      phylum        class     order     family count
1 Animalia    Chordata     Mammalia Carnivora    Felidae    2
2 Animalia    Chordata     Mammalia Carnivora    Canidae    3
3  Plantae Angiosperms Dicotyledons   Rosales   Rosaceae    1
4  Plantae Angiosperms Dicotyledons Solanales Solanaceae    4

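I want to change the data structure so it looks like this, with one name/parent row per taxonomic rank for every individual (i.e. each row of wide_df repeated count times):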

# Desired output: one (name, parent) row per individual per taxonomic rank.
# The vectors are written in run-length form: each label is repeated by the
# matching entry of `times` (50 rows in total).
hierarchical_df <- data.frame(
  name = rep(
    c("Animalia", "Chordata", "Mammalia", "Carnivora", "Felidae", "Canidae",
      "Plantae", "Angiosperms", "Dicotyledons", "Rosales", "Solanales",
      "Rosaceae", "Solanaceae"),
    times = c(5, 5, 5, 5, 2, 3, 5, 5, 5, 1, 4, 1, 4)
  ),
  # Kingdoms are the roots of the hierarchy, so their parent is NA.
  parent = rep(
    c(NA, "Animalia", "Chordata", "Mammalia", "Carnivora",
      NA, "Plantae", "Angiosperms", "Dicotyledons", "Rosales", "Solanales"),
    times = c(5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 4)
  )
)


hierarchical_df
           name       parent
1      Animalia         <NA>
2      Animalia         <NA>
3      Animalia         <NA>
4      Animalia         <NA>
5      Animalia         <NA>
6      Chordata     Animalia
7      Chordata     Animalia
8      Chordata     Animalia
9      Chordata     Animalia
10     Chordata     Animalia
11     Mammalia     Chordata
12     Mammalia     Chordata
13     Mammalia     Chordata
14     Mammalia     Chordata
15     Mammalia     Chordata
16    Carnivora     Mammalia
17    Carnivora     Mammalia
18    Carnivora     Mammalia
19    Carnivora     Mammalia
20    Carnivora     Mammalia
21      Felidae    Carnivora
22      Felidae    Carnivora
23      Canidae    Carnivora
24      Canidae    Carnivora
25      Canidae    Carnivora
26      Plantae         <NA>
27      Plantae         <NA>
28      Plantae         <NA>
29      Plantae         <NA>
30      Plantae         <NA>
31  Angiosperms      Plantae
32  Angiosperms      Plantae
33  Angiosperms      Plantae
34  Angiosperms      Plantae
35  Angiosperms      Plantae
36 Dicotyledons  Angiosperms
37 Dicotyledons  Angiosperms
38 Dicotyledons  Angiosperms
39 Dicotyledons  Angiosperms
40 Dicotyledons  Angiosperms
41      Rosales Dicotyledons
42    Solanales Dicotyledons
43    Solanales Dicotyledons
44    Solanales Dicotyledons
45    Solanales Dicotyledons
46     Rosaceae      Rosales
47   Solanaceae    Solanales
48   Solanaceae    Solanales
49   Solanaceae    Solanales
50   Solanaceae    Solanales

Basically, I'm trying to get my data into a form where I can use it to make a Sankey diagram using this package (https://github.com/fbreitwieser/hiervis). I'm trying to make a visualization of the number of individual organisms of different taxonomic groups seen in a given area. There are 40,000+ observations in the dataset.


Solution

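  • Here is one way.
    First repeat each row of the wide data frame count times, then stack all of the rank columns into a single column (the name column). The parent column is that same column lagged by the number of repeated rows, so each value lines up with the value from the rank one level above it, and the top rank gets NA.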

    # Build a long name/parent table from `wide_df`: one row per individual
    # per taxonomic rank. Every column except `count` is treated as a rank,
    # ordered from broadest (leftmost) to narrowest (rightmost), so the code
    # keeps working when ranks are added or the columns are rearranged.
    rank_cols <- setdiff(names(wide_df), "count")
    # Repeat each row `count` times. Integer indices select the same rows as
    # indexing by row name but avoid character matching on large inputs.
    tmp <- wide_df[rep(seq_len(nrow(wide_df)), wide_df$count), rank_cols,
                   drop = FALSE]
    # stack() ignores non-vector columns such as factors, so make sure every
    # rank column is a plain character vector first.
    tmp[] <- lapply(tmp, as.character)
    n_obs <- nrow(tmp)
    # The ranks are now laid end to end: n_obs values of the first rank,
    # then n_obs values of the second rank, and so on.
    long_df <- stack(tmp)
    # Shifting the stacked values down by one whole rank (n_obs rows) pairs
    # each name with the next-broader rank of the same individual; the
    # broadest rank has no parent and gets NA. This is the base R equivalent
    # of dplyr::lag(long_df$values, n_obs).
    long_df$parent <- c(
      rep(NA_character_, n_obs),
      head(long_df$values, -n_obs)
    )
    rm(tmp, rank_cols, n_obs)
    names(long_df)[1L] <- "name"
    long_df <- long_df[-2L]
    

    The result is identical to the desired output posted in the question, only sorted differently:

    # Verify the result: the two data frames list their rows in a different
    # order, so sort each one by name and reset the row names before comparing.
    tmp1 <- hierarchical_df[order(hierarchical_df$name), ]
    rownames(tmp1) <- NULL
    tmp2 <- long_df[order(long_df$name), ]
    rownames(tmp2) <- NULL
    
    identical(tmp1, tmp2)
    #> [1] TRUE