r tidygraph

Remove duplicated elements from a list of tidygraph objects in R?


I have a list of tidygraph objects. Within the node data I have two columns, i.e., name and frequency. What I'm trying to do is remove any of the list elements (i.e., tidygraph objects) that are repeated more than once. Hopefully my example can explain more:

To begin, I create some node/edge data, turn them into tidygraph objects and put them in a list:

library(tidygraph)
library(dplyr)
library(tidyr)
library(purrr)
library(stringr)


# Node tables for the example graphs. Each has a `name` column (missing for
# some nodes) and a numeric `val` column.
# nodes, nodes2 and nodes3 share the same `name` column and differ only in `val`.
nodes <- data.frame(
  name = c("x4", NA_character_, NA_character_),
  val  = c(1, 5, 2)
)
nodes2 <- data.frame(
  name = c("x4", NA_character_, NA_character_),
  val  = c(3, 2, 2)
)
nodes3 <- data.frame(
  name = c("x4", NA_character_, NA_character_),
  val  = c(5, 6, 7)
)
# A larger, seven-node table.
nodes4 <- data.frame(
  name = c("x4", "x2", NA_character_, NA_character_, "x1",
           NA_character_, NA_character_),
  val  = c(3, 2, 2, 1, 1, 2, 7)
)
# nodes5 and nodes6 share the same `name` column and differ only in `val`.
nodes5 <- data.frame(
  name = c("x1", "x2", NA_character_),
  val  = c(7, 4, 2)
)
nodes6 <- data.frame(
  name = c("x1", "x2", NA_character_),
  val  = c(2, 1, 3)
)

# Edge tables: `from` / `to` hold row positions in the matching node table.
# Two edges, both leaving node 1 (for the three-node tables).
edges <- data.frame(
  from = c(1, 1),
  to   = c(2, 3)
)
# Six edges (for the seven-node table).
edges1 <- data.frame(
  from = c(1, 2, 2, 1, 5, 5),
  to   = c(2, 3, 4, 5, 6, 7)
)

# Build one tbl_graph per node table.
# tg, tg_1 and tg_2 have identical node names (tg_1 and tg_2 are built from
# the same node table, nodes2).
tg   <- tbl_graph(edges = edges, nodes = nodes)
tg_1 <- tbl_graph(edges = edges, nodes = nodes2)
tg_2 <- tbl_graph(edges = edges, nodes = nodes2)

# The seven-node graph uses the larger edge table.
tg_3 <- tbl_graph(edges = edges1, nodes = nodes4)

# tg_4 and tg_5 have identical node names.
tg_4 <- tbl_graph(edges = edges, nodes = nodes5)
tg_5 <- tbl_graph(edges = edges, nodes = nodes6)


# Collect the graphs, in creation order, into an unnamed list.
myList <- list(
  tg,
  tg_1,
  tg_2,
  tg_3,
  tg_4,
  tg_5
)

Then, I have this little function that tells me the frequency of each list element, based on the name columns. That is, if the column name is repeated/identical in multiple list elements, then the frequency is increased. So, in my example above, the name column in tg appears 3 times (identically in tg, tg_1, and tg_2) over my list... so it gets a frequency of 3.

I am then adding a frequency column to each list element and altering my original myList object. For example:

# Build one string key per graph from its node `name` column (NA is written
# as ".." so that missing names still contribute to the key), then number the
# graphs that share a key in descending order. The first graph with a given
# key therefore carries the total count for that key, the next one count - 1,
# and so on down to 1.
#
# map_chr() returns a character vector directly (and character(0) for an empty
# list), so the grouping column always exists.
freqs <- map_chr(myList, function(x) {
  x %>%
    pull(name) %>%
    replace_na("..") %>%
    paste0(collapse = "")
}) %>%
  tibble(value = .) %>%
  group_by(value) %>%
  # rev(seq_len(n())) equals n():1 but stays empty for empty input
  mutate(val = rev(seq_len(n()))) %>%
  pull(val)
  
  

# Attach the matching entry of `freqs` to every node of each graph as a
# `frequency` column and keep only `name` and `frequency` in the node data.
# map2() pairs graphs and frequencies by position, so this also works when
# `myList` has names (imap() would pass the names instead of positions).
newList <- purrr::map2(myList, freqs, function(graph, freq) {
  graph %>%
    mutate(frequency = freq) %>%
    select(name, frequency)
})

Looking at newList now returns:

> newList
[[1]]
# A tbl_graph: 3 nodes and 2 edges
#
# A rooted tree
#
# Node Data: 3 × 2 (active)
  name  frequency
  <chr>     <int>
1 x4            3
2 NA            3
3 NA            3
#
# Edge Data: 2 × 2
   from    to
  <int> <int>
1     1     2
2     1     3

[[2]]
# A tbl_graph: 3 nodes and 2 edges
#
# A rooted tree
#
# Node Data: 3 × 2 (active)
  name  frequency
  <chr>     <int>
1 x4            2
2 NA            2
3 NA            2
#
# Edge Data: 2 × 2
   from    to
  <int> <int>
1     1     2
2     1     3

[[3]]
# A tbl_graph: 3 nodes and 2 edges
#
# A rooted tree
#
# Node Data: 3 × 2 (active)
  name  frequency
  <chr>     <int>
1 x4            1
2 NA            1
3 NA            1
#
# Edge Data: 2 × 2
   from    to
  <int> <int>
1     1     2
2     1     3

[[4]]
# A tbl_graph: 7 nodes and 6 edges
#
# A rooted tree
#
# Node Data: 7 × 2 (active)
  name  frequency
  <chr>     <int>
1 x4            1
2 x2            1
3 NA            1
4 NA            1
5 x1            1
6 NA            1
# … with 1 more row
#
# Edge Data: 6 × 2
   from    to
  <int> <int>
1     1     2
2     2     3
3     2     4
# … with 3 more rows

[[5]]
# A tbl_graph: 3 nodes and 2 edges
#
# A rooted tree
#
# Node Data: 3 × 2 (active)
  name  frequency
  <chr>     <int>
1 x1            2
2 x2            2
3 NA            2
#
# Edge Data: 2 × 2
   from    to
  <int> <int>
1     1     2
2     1     3

[[6]]
# A tbl_graph: 3 nodes and 2 edges
#
# A rooted tree
#
# Node Data: 3 × 2 (active)
  name  frequency
  <chr>     <int>
1 x1            1
2 x2            1
3 NA            1
#
# Edge Data: 2 × 2
   from    to
  <int> <int>
1     1     2
2     1     3

So we can see that the name column with x4, NA, NA appears 3 times, but instead of adding the same frequency to each of them, I seem to be counting down the frequency (not intentional). So x4, NA, NA says its frequency is 3, then 2, then 1.

I am trying to remove any of the duplicated list elements and keep just the element with the highest frequency. For example, my desired output would look like:

> newList
[[1]]
# A tbl_graph: 3 nodes and 2 edges
#
# A rooted tree
#
# Node Data: 3 × 2 (active)
  name  frequency
  <chr>     <int>
1 x4            3
2 NA            3
3 NA            3
#
# Edge Data: 2 × 2
   from    to
  <int> <int>
1     1     2
2     1     3

[[2]]
# A tbl_graph: 7 nodes and 6 edges
#
# A rooted tree
#
# Node Data: 7 × 2 (active)
  name  frequency
  <chr>     <int>
1 x4            1
2 x2            1
3 NA            1
4 NA            1
5 x1            1
6 NA            1
# … with 1 more row
#
# Edge Data: 6 × 2
   from    to
  <int> <int>
1     1     2
2     2     3
3     2     4
# … with 3 more rows

[[3]]
# A tbl_graph: 3 nodes and 2 edges
#
# A rooted tree
#
# Node Data: 3 × 2 (active)
  name  frequency
  <chr>     <int>
1 x1            2
2 x2            2
3 NA            2
#
# Edge Data: 2 × 2
   from    to
  <int> <int>
1     1     2
2     1     3

Here we can see that the duplicated elements have been removed, keeping only the one with the highest frequency. Any suggestions as to how I could do this?


Solution

  • A comment on the original answer would be sufficient motivation to change the answer. That said, you can slightly update the code by slice-ing the first row of each group of the grouped tibble, possibly like this:

    library(tidygraph) ; library(tidyverse)
    # Build one string key per graph from its node `name` column (NA is
    # written as ".."), remember each graph's position in `myList` as `ids`,
    # and number the graphs that share a key in descending order so the first
    # graph with a given key carries the total count for that key.
    freqs <- map_chr(myList, function(x) {
      x %>%
        pull(name) %>%
        replace_na("..") %>%
        paste0(collapse = "")
    }) %>%
      tibble(value = .) %>%
      mutate(ids = row_number()) %>%
      group_by(value) %>%
      # rev(seq_len(n())) equals n():1 but stays empty for empty input
      mutate(val = rev(seq_len(n())))

    # Position of the first graph for each distinct key (one row per group).
    ids <- freqs %>% slice(1) %>% pull(ids)
    # From here on `freqs` is the plain integer vector of per-graph counts.
    freqs <- freqs %>% pull(val)
    
    # Attach the matching entry of `freqs` to every node of each graph as a
    # `frequency` column and keep only `name` and `frequency` in the node data.
    # map2() pairs graphs and frequencies by position, so this also works when
    # `myList` has names (imap() would pass the names instead of positions).
    newList <- purrr::map2(myList, freqs, function(graph, freq) {
      graph %>%
        mutate(frequency = freq) %>%
        select(name, frequency)
    })

    # Keep only the first graph for each distinct key; sort the positions so
    # the kept graphs stay in their original list order.
    newList[sort(ids)]
    
    [[1]]
    # A tbl_graph: 3 nodes and 2 edges
    #
    # A rooted tree
    #
    # Node Data: 3 x 2 (active)
      name  frequency
      <chr>     <int>
    1 x4            3
    2 NA            3
    3 NA            3
    #
    # Edge Data: 2 x 2
       from    to
      <int> <int>
    1     1     2
    2     1     3
    
    [[2]]
    # A tbl_graph: 7 nodes and 6 edges
    #
    # A rooted tree
    #
    # Node Data: 7 x 2 (active)
      name  frequency
      <chr>     <int>
    1 x4            1
    2 x2            1
    3 NA            1
    4 NA            1
    5 x1            1
    6 NA            1
    # ... with 1 more row
    #
    # Edge Data: 6 x 2
       from    to
      <int> <int>
    1     1     2
    2     2     3
    3     2     4
    # ... with 3 more rows
    
    [[3]]
    # A tbl_graph: 3 nodes and 2 edges
    #
    # A rooted tree
    #
    # Node Data: 3 x 2 (active)
      name  frequency
      <chr>     <int>
    1 x1            2
    2 x2            2
    3 NA            2
    #
    # Edge Data: 2 x 2
       from    to
      <int> <int>
    1     1     2
    2     1     3