rlinked-listtreepurrrlookup-tables

Cut branches of linked documents (document trees) with a lookup-list using a function


I just learned how to add branches to linked documents (document trees).

Now I am trying to do the opposite, i.e., cut branches of document trees according to a lookup list by using a function.

Reproducible example:

library(tidyverse)

# list of document trees

df1 <- tibble(id_from=c(NA_character_,"111","222","333","444","444","aaa","bbb","x","x"),
              id_to=c("111","222","333","444","aaa","bbb","x","ccc","x1","x1"),
              level=c(0,1,2,3,4,4,5,5,6,6))

df2 <- tibble(id_from=c(NA_character_,"thank"),
              id_to=c("thank","you"),
              level=c(0,1))

tree_list <- list(df1,df2)
tree_list
#> [[1]]
#> # A tibble: 10 × 3
#>    id_from id_to level
#>    <chr>   <chr> <dbl>
#>  1 <NA>    111       0
#>  2 111     222       1
#>  3 222     333       2
#>  4 333     444       3
#>  5 444     aaa       4
#>  6 444     bbb       4
#>  7 aaa     x         5
#>  8 bbb     ccc       5
#>  9 x       x1        6
#> 10 x       x1        6
#> 
#> [[2]]
#> # A tibble: 2 × 3
#>   id_from id_to level
#>   <chr>   <chr> <dbl>
#> 1 <NA>    thank     0
#> 2 thank   you       1


# lookup list, i.e. branches that I want to cut
cut1 <- tibble(id_from=c("444"),
              id_to=c("aaa"))

cut2 <- tibble(id_from=c("thank"),
              id_to=c("you"))

cut3 <- tibble(id_from=c("bbb"),
               id_to=c("ccc"))

cut4 <- tibble(id_from=c("x"),
               id_to=c("x1"))

cut_lookup <- list(cut1,cut2,cut3,cut4)
cut_lookup
#> [[1]]
#> # A tibble: 1 × 2
#>   id_from id_to
#>   <chr>   <chr>
#> 1 444     aaa  
#> 
#> [[2]]
#> # A tibble: 1 × 2
#>   id_from id_to
#>   <chr>   <chr>
#> 1 thank   you  
#> 
#> [[3]]
#> # A tibble: 1 × 2
#>   id_from to_id
#>   <chr>   <chr>
#> 1 bbb     ccc  
#> 
#> [[4]]
#> # A tibble: 1 × 2
#>   id_from id_to
#>   <chr>   <chr>
#> 1 x       x1

Created on 2023-04-02 with reprex v2.0.2

Desired output:

#> [[1]]
#> # A tibble: 5 × 3
#>   id_from id_to level
#>   <chr>   <chr> <dbl>
#> 1 <NA>    111       0
#> 2 111     222       1
#> 3 222     333       2
#> 4 333     444       3
#> 5 444     bbb       4
#> 
#> [[2]]
#> # A tibble: 1 × 3
#>   id_from id_to level
#>   <chr>   <chr> <dbl>
#> 1 <NA>    thank     0

I tried the following, but I get errors:

# function to cut branches from a single tree
cut_tree <- function(tree, cuts) {
  nodes_to_cut_table <- setNames(rep(TRUE, length(cuts$id_from)), cuts$id_from)
  nodes_to_cut <- unique(cuts$id_from)
  tree %>%
    filter(!id_from %in% nodes_to_cut) %>%
    filter(!id_to %in% nodes_to_cut) %>%
    filter(!id_from %in% nodes_to_cut_table) %>%
    filter(!id_to %in% nodes_to_cut_table)
}

# function to apply cuts to a list of trees
cut_trees <- function(tree_list, cut_lookup) {
  pmap(list(tree_list, cut_lookup), cut_tree)
}

# apply cuts to the example input
cut_trees <- cut_trees(tree_list, cut_lookup)
#> Error in `pmap()`:
#> ! Can't recycle `.l[[1]]` (size 2) to match `.l[[2]]` (size 4).
#> Backtrace:
#>     ▆
#>  1. ├─global cut_trees(tree_list, cut_lookup)
#>  2. │ └─purrr::pmap(list(tree_list, cut_lookup), cut_tree)
#>  3. │   └─purrr:::pmap_("list", .l, .f, ..., .progress = .progress)
#>  4. │     └─vctrs::vec_size_common(!!!.l, .arg = ".l", .call = .purrr_error_call)
#>  5. └─vctrs::stop_incompatible_size(...)
#>  6.   └─vctrs:::stop_incompatible(...)
#>  7.     └─vctrs:::stop_vctrs(...)
#>  8.       └─rlang::abort(message, class = c(class, "vctrs_error"), ..., call = call)
cut_trees
#> function(tree_list, cut_lookup) {
#>   pmap(list(tree_list, cut_lookup), cut_tree)
#> }

Created on 2023-04-02 with reprex v2.0.2

UPDATE As discussed here; the items can merge. The items are temporally ordered (newest first), items only reference older items, they never reference a more recent item.


Solution

  • This solution uses the same get_tree() function from my answer to your previous post. Now it’s iterated over by cut_branches() to find branches in the data corresponding to the nodes in cut_lookup(). The identified branches are then removed using dplyr::anti_join().

    library(dplyr)
    library(purrr)
    
    get_tree <- function(id, data) {
      branch <- filter(data, id_from == id)
      if (nrow(branch) == 0) return()
      bind_rows(
        branch, 
        map(branch$id_to, \(x) get_tree(x, data))
      )
    }
    
    cut_branches <- function(data, lookups) {
      nodes_to_cut <- bind_rows(lookups)
      branches_to_cut <- nodes_to_cut %>%
        pull(id_to) %>%
        map(\(id) get_tree(id, data)) %>%
        bind_rows(nodes_to_cut)
      anti_join(data, branches_to_cut, join_by(id_from, id_to))
    }
    
    map(tree_list, \(x) cut_branches(x, lookups = cut_lookup))
    

    Result:

    [[1]]
    # A tibble: 5 × 3
      id_from id_to level
      <chr>   <chr> <dbl>
    1 <NA>    111       0
    2 111     222       1
    3 222     333       2
    4 333     444       3
    5 444     bbb       4
    
    [[2]]
    # A tibble: 1 × 3
      id_from id_to level
      <chr>   <chr> <dbl>
    1 <NA>    thank     0