rigraphtidygraph

How to write a loop function for network analysis after igraph fail


I am having trouble with a network analysis. I have a dataset with thousands of detections of hundreds of individuals at different locations. I am trying to get key network statistics for each individual including number of nodes and edges for each individual and network diameter for each individual (defined as greatest distance between any two nodes visited by that individual).

I tried igraph but my limited R skills don't allow me to convert the online examples I have found to fit my data.

Here is a simplified example of my data (dist is in km):

df <- data.frame(id = c("3811","3811","3832","3832","3832","3832"),
             Program = c("P1","P1","P1","P1","P1","P1"),
             from = c("hill","town","hill","wood","wood","lake"),
             from_lon = c(130.2,130.5,130.2,131.3,131.3,129.6),
             from_lat = c(-30.2,-30.5,-30.2,-31.3,-31.3,-29.6),
             to = c("town","lake","wood","wood","lake","town"),
             to_lon = c(130.5,129.6,131.3,131.3,129.6,130.5),
             to_lat = c(-30.5,-29.6,-31.3,-31.3,-29.6,-30.5),
             dist = c(44.111,132.506,161.456,0,249.847,132.506))

This gives the following dataframe:

id Program from from_lon from_lat   to to_lon to_lat    dist
3811      P1 hill    130.2    -30.2 town  130.5  -30.5  44.111
3811      P1 town    130.5    -30.5 lake  129.6  -29.6 132.506
3832      P1 hill    130.2    -30.2 wood  131.3  -31.3 161.456
3832      P1 wood    131.3    -31.3 wood  131.3  -31.3   0.000
3832      P1 wood    131.3    -31.3 lake  129.6  -29.6 249.847
3832      P1 lake    129.6    -29.6 town  130.5  -30.5 132.506

Due to my igraph failure, I have come up with this overly complex code (which does the same thing I think):

indiv_nodes <- df %>% 
  filter(id == "3811"& dist > 0) %>% #exclude repeat detections originating at same site
  summarise(
    id = dplyr::first(id),
    prog = first(Program),
    nodes = n_distinct(to)+1, #+1 to include start location
    netdiam = max(dist))
indiv_edges <- df %>% 
  filter(id == "3811" & dist > 0) %>% #Include only edges between nodes, exclude repeat detections at same site
  group_by(from, to) %>%
  summarise( 
    from = dplyr::first(from),
    to = dplyr::first(to),
    weight = n())
net <- transform(indiv_nodes, edges = sum(indiv_edges$weight))
#-  
indiv_nodes_n <- df %>% 
  filter(id == "3832" & dist > 0) %>% 
  summarise(
    id = dplyr::first(id),
    prog = first(Program),
    nodes = n_distinct(to)+1, 
    netdiam = max(dist))
indiv_edges_n <- df %>% 
  filter(id == "3832" & dist > 0) %>% 
  group_by(from, to) %>%
  summarise( 
    from = dplyr::first(from),
    to = dplyr::first(to),
    weight = n())
indiv_net <- transform(indiv_nodes_n, edges = sum(indiv_edges_n$weight))
net <- rbind(net, indiv_net)
#-  
net

The result is this:

id   prog  nodes   netdiam edges
3811   P1     3    132.506    2
3832   P1     4    249.847    3

My problem is that I have to repeat this for the hundreds of individuals in the dataset, not just two, and rbind them all back together.

I tried creating a loop function but failed dismally.

If anybody can help either with an igraph solution or with a loop function for my above code to run through all ids in my dataset that would be amazing!


Solution

  • tidygraph can simplify some (i)graph operations, for example here we could morph that graph into a list of subgraphs, split by id. And then get graph properties like order, size and weighted diameter along with a Program edge attribute from subgraphs.

    library(igraph, warn.conflicts = FALSE)
    library(tidygraph, warn.conflicts = FALSE)
    library(dplyr, warn.conflicts = FALSE)
    
    g_tbl <- 
      df |> 
      # igraph::diameter() will use "weights" edge attribute for weights
      rename(weight = dist) |> 
      # use df for edge list, "to" & "from" columns encode nodes
      tbl_graph(edges = _) |> 
      # remove 0-weight (dist) edges
      activate(edges) |> 
      filter(weight > 0) |> 
      # morph graph into a temporary list of subgraphs by "id" attribute
      morph(to_split, id, split_by = "edges") |> 
      # temp list of graphs to a nested tibble with tbl_graph objects
      crystallise() |> 
      # extract graph measures & attributes from subgraphs
      rowwise() |> 
      # outside of tbl_graph context it's bit more convenient to use igraph methods 
      mutate(across(graph, list(nodes = vcount, edges = ecount, netdiam = diameter))) |> 
      mutate(prog = edge_attr(graph, "Program")[1]) |> 
      ungroup()
    

    Resulting nested tibble and subgraphs:

    g_tbl
    #> # A tibble: 2 × 6
    #>   name     graph      graph_nodes graph_edges graph_netdiam prog 
    #>   <chr>    <list>           <dbl>       <dbl>         <dbl> <chr>
    #> 1 id: 3811 <tbl_grph>           3           2          177. P1   
    #> 2 id: 3832 <tbl_grph>           4           3          544. P1
    
    g_tbl$graph
    #> [[1]]
    #> # A tbl_graph: 3 nodes and 2 edges
    #> #
    #> # A rooted tree
    #> #
    #> # Edge Data: 2 × 10 (active)
    #>    from    to id    Program from_lon from_lat to_lon to_lat weight
    #>   <int> <int> <chr> <chr>      <dbl>    <dbl>  <dbl>  <dbl>  <dbl>
    #> 1     1     2 3811  P1          130.    -30.2   130.  -30.5   44.1
    #> 2     2     3 3811  P1          130.    -30.5   130.  -29.6  133. 
    #> # ℹ 1 more variable: .tidygraph_edge_index <int>
    #> #
    #> # Node Data: 3 × 2
    #>   name  .tidygraph_node_index
    #>   <chr>                 <int>
    #> 1 hill                      1
    #> 2 town                      2
    #> 3 lake                      3
    #> 
    #> [[2]]
    #> # A tbl_graph: 4 nodes and 3 edges
    #> #
    #> # A rooted tree
    #> #
    #> # Edge Data: 3 × 10 (active)
    #>    from    to id    Program from_lon from_lat to_lon to_lat weight
    #>   <int> <int> <chr> <chr>      <dbl>    <dbl>  <dbl>  <dbl>  <dbl>
    #> 1     1     4 3832  P1          130.    -30.2   131.  -31.3   161.
    #> 2     4     3 3832  P1          131.    -31.3   130.  -29.6   250.
    #> 3     3     2 3832  P1          130.    -29.6   130.  -30.5   133.
    #> # ℹ 1 more variable: .tidygraph_edge_index <int>
    #> #
    #> # Node Data: 4 × 2
    #>   name  .tidygraph_node_index
    #>   <chr>                 <int>
    #> 1 hill                      1
    #> 2 town                      2
    #> 3 lake                      3
    #> # ℹ 1 more row
    

    Plot subgraphs:

    par(mfcol = c(1,2))
    purrr::pwalk(g_tbl, \(name, graph, ...) plot(graph, vertex.size=50, edge.label = edge_attr(graph, "weight"), edge.label.dist = 0.5,main = name))
    


    Alternatively you could first split your dataset by id and use lapply() or purrr::map() on a resulting list to generate graphs & extract details. Should scale better for larger datasets as there are smaller graphs to deal with, from there it's also easy to switch to parallel execution with e.g. furrr::future_map().

    # helper to build a graph and extract details, return single-row tibble
    graph_measures <- function(edge_df){
      g <- 
        edge_df |> 
        tbl_graph(edges = _) |> 
        activate(edges) |> 
        filter(weight > 0)
    
      tibble(
        prog    = edge_attr(g, "Program")[1],
        nodes   = vcount(g), 
        edges   = ecount(g), 
        netdiam = diameter(g)
      )
    }
    
    # split by id, apply graph_measures() on each resulting list item,
    # bind rows and use list names for id column
    df |> 
      rename(weight = dist) |> 
      split(~id) |> 
      purrr::map(graph_measures) |> 
      purrr::list_rbind(names_to = "id")
    #> # A tibble: 2 × 5
    #>   id    prog  nodes edges netdiam
    #>   <chr> <chr> <dbl> <dbl>   <dbl>
    #> 1 3811  P1        3     2    177.
    #> 2 3832  P1        4     3    544.
    

    Example data:

    df <- data.frame(id = c("3811","3811","3832","3832","3832","3832"),
                     Program = c("P1","P1","P1","P1","P1","P1"),
                     from = c("hill","town","hill","wood","wood","lake"),
                     from_lon = c(130.2,130.5,130.2,131.3,131.3,129.6),
                     from_lat = c(-30.2,-30.5,-30.2,-31.3,-31.3,-29.6),
                     to = c("town","lake","wood","wood","lake","town"),
                     to_lon = c(130.5,129.6,131.3,131.3,129.6,130.5),
                     to_lat = c(-30.5,-29.6,-31.3,-31.3,-29.6,-30.5),
                     dist = c(44.111,132.506,161.456,0,249.847,132.506))