rdplyr

Conditional counting based on multiple conditions


I have a tidy ecological dataset in which every row is a single specimen/individual, with multiple columns for multiple variables.

#fake dataset
loc <- c(1,1,2,2,2,3,3,3,3,3,3,3,3)
date <- c(2021, 2022, 2021, 2021, 2022, 2021, 2021, 2022, 2023, 2023, 2023, 2023, 2023)
hab <- c("w", "l", "w", "w", "w", "l", "l", "w", "w", "w", "w", "w", "w")
spec <- c("frog", "frog", "frog", "frog", "frog", "beaver", "beaver", "beaver", "kingfisher", "kingfisher", "kingfisher", "kingfisher", "kingfisher")
n <- c(1,1,1,1,1,1,1,1,1,1,1,1,1)

df <- tibble(loc, date, hab, spec, n)

I want to condense different individuals, collected in the same location/date/habitat, into the same row, but only for some species (beaver and kingfisher, not frog), up to a maximum of 3 individuals per sample.

The resulting dataset should look like this:

#wanted output
loc1 <- c(1,1,2,2,2,3,3,3,3)
date1 <- c(2021, 2022, 2021, 2021, 2022, 2021, 2022, 2023, 2023)
hab1 <- c("w", "l", "w", "w", "w", "l", "w", "w", "w")
spec1 <- c("frog", "frog", "frog", "frog", "frog", "beaver", "beaver", "kingfisher", "kingfisher")
n1 <- c(1,1,1,1,1,2,1,3,2)

df1 <- tibble(loc1, date1, hab1, spec1, n1)

   loc1 date1 hab1  spec1         n1
  <dbl> <dbl> <chr> <chr>      <dbl>
1     1  2021 w     frog           1
2     1  2022 l     frog           1
3     2  2021 w     frog           1
4     2  2021 w     frog           1
5     2  2022 w     frog           1
6     3  2021 l     beaver         2
7     3  2022 w     beaver         1
8     3  2023 w     kingfisher     3
9     3  2023 w     kingfisher     2

Is it possible to do it with dplyr? Or should I use a for loop?


Solution

  • I've found group_modify() / group_map() to be handy for conditional aggregations.

    library(dplyr)
    df |> 
      group_by(loc, date, hab, spec) |> 
      group_modify(\(x, key){
        # key: grouping values
        # x: group subset, here just a frame with a single column, `n`
        # check if `spec` for current group is `beaver` or `kingfisher`
        if (key$spec %in% c("beaver", "kingfisher")){
          group_by(x, tri = rep(1:n(), length.out = n(), each = 3)) |> 
            summarise(n1 = sum(n)) |> 
            select(-tri)
        } else {
          rename(x, n1 = n)
        }
      }) |> 
      ungroup()
    #> # A tibble: 9 × 5
    #>     loc  date hab   spec          n1
    #>   <dbl> <dbl> <chr> <chr>      <dbl>
    #> 1     1  2021 w     frog           1
    #> 2     1  2022 l     frog           1
    #> 3     2  2021 w     frog           1
    #> 4     2  2021 w     frog           1
    #> 5     2  2022 w     frog           1
    #> 6     3  2021 l     beaver         2
    #> 7     3  2022 w     beaver         1
    #> 8     3  2023 w     kingfisher     3
    #> 9     3  2023 w     kingfisher     2
    

    With group_modify(..., .f = \(x, key){...} ), x refers to group subset and key arg is a single-row frame with grouping details, e.g.

    #> # A tibble: 1 × 4
    #>     loc  date hab   spec      
    #>   <dbl> <dbl> <chr> <chr>     
    #> 1     3  2023 w     kingfisher