rdataframefunctionif-statementsumifs

Is it possible to make R do this complex sum function involving a parent-child hierarchy?


I have now been trying for days to do this and I am incapable of getting anywhere. I hope I can make this understandable to whoever reads this. My goal is, in R, to turn this dataframe:

taxonomicStatus taxonRank name parent
accepted order Apusomonadida Obazoa
accepted family Apusomonadidae Apusomonadida
accepted genus Amastigomonas Apusomonadidae
accepted species Amastigomonas marisrubri Amastigomonas
accepted subfamily Apusomonadinae Apusomonadidae
accepted genus Apusomonas Apusomonadinae
accepted species Apusomonas proboscidea Apusomonas
synonym genus Rostromonas Apusomonadinae
synonym species Rostromonas proboscidea Rostromonas

Into this one:

taxonomicStatus taxonRank name parent speciesCount
accepted order Apusomonadida Obazoa 2
accepted family Apusomonadidae Apusomonadida 2
accepted genus Amastigomonas Apusomonadidae 1
accepted species Amastigomonas marisrubri Amastigomonas 1
accepted subfamily Apusomonadinae Apusomonadidae 1
accepted genus Apusomonas Apusomonadinae 1
accepted species Apusomonas proboscidea Apusomonas 1
synonym genus Rostromonas Apusomonadinae NA
synonym species Rostromonas proboscidea Rostromonas NA

In essence, I want an R script that:

I have achieved this in Google Sheets with the following formula in the speciesColumn cell of each row (row 2 in this example):

=IF(A2="synonym";"NA";IF(B2="species";1;SUMIF(I:I;D2;C:C)))

where A2 is the taxonomicStatus value, B2 is the taxonRank value, D2 is the name value, I:I is the entire parent column, and C:C is the entire speciesCount column.

However, attempting this in R for hours has given me two different bad outcomes:

taxonomicStatus taxonRank name parent speciesCount
accepted order Apusomonadida Obazoa 0
accepted family Apusomonadidae Apusomonadida 0
accepted genus Amastigomonas Apusomonadidae 1
accepted species Amastigomonas marisrubri Amastigomonas 1
accepted subfamily Apusomonadinae Apusomonadidae 0
accepted genus Apusomonas Apusomonadinae 1
accepted species Apusomonas proboscidea Apusomonas 1
synonym genus Rostromonas Apusomonadinae NA
synonym species Rostromonas proboscidea Rostromonas NA

Please help. I have even tried to get a usable script from ChatGPT, all in vain. Thank you in advance.

This is what remains of my countless hours of trial and errors. Keep in mind that I have no idea how to even go about this formula at the 3rd step, because I have not found a coherent answer anywhere:

data <- data.frame(
  taxonomicStatus = c("accepted", "accepted", "accepted", "accepted", "accepted", "accepted", "accepted", "synonym", "synonym"),
  taxonRank = c("order", "family", "genus", "species", "subfamily", "genus", "species", "genus", "species"),
  name = c("Apusomonadida", "Apusomonadidae", "Amastigomonas", "Amastigomonas marisrubri", "Apusomonadinae", "Apusomonas", "Apusomonas proboscidea", "Rostromonas", "Rostromonas proboscidea"),
  parent = c("Obazoa", "Apusomonadida", "Apusomonadidae", "Amastigomonas", "Apusomonadidae", "Apusomonadinae", "Apusomonas", "Apusomonadinae", "Rostromonas")
)

data$speciesCount <- NA

species_count <- function(speciesCount) {
  if(data$taxonomicStatus == "synonym" ) {
    data$speciesCount=NA
  }
  else if (data$taxonRank == "species") {
    data$speciesCount=1
  }
  else {data$speciesCount=aggregate(data$name ~ data$parent, data, sum)}
}

species_count(data)

This is the "solution" that chatGPT suggested to me and then failed either by error message or by incomplete result:

data <- data.frame(
  taxonomicStatus = c("accepted", "accepted", "accepted", "accepted", "accepted", "accepted", "accepted", "synonym", "synonym"),
  taxonRank = c("order", "family", "genus", "species", "subfamily", "genus", "species", "genus", "species"),
  name = c("Apusomonadida", "Apusomonadidae", "Amastigomonas", "Amastigomonas marisrubri", "Apusomonadinae", "Apusomonas", "Apusomonas proboscidea", "Rostromonas", "Rostromonas proboscidea"),
  parent = c("Obazoa", "Apusomonadida", "Apusomonadidae", "Amastigomonas", "Apusomonadidae", "Apusomonadinae", "Apusomonas", "Apusomonadinae", "Rostromonas")
)

data$speciesCount <- NA

data <- data %>%
  mutate(speciesCount = ifelse(taxonRank == "species" & taxonomicStatus == "accepted", 1, NA))

# Function to fill in speciesCount values iteratively
fill_species_counts <- function(data) {
  # Create a list to track which nodes need to be updated
  needs_update <- which(is.na(data$speciesCount))
  
  # Keep track of the number of rows for iteration
  n <- nrow(data)

  repeat {
    # Flag to check if any updates were made
    updated <- FALSE
    
    # Iterate over all parents needing updates
    for (i in needs_update) {
      parent_name <- data$name[i]

      # Calculate total speciesCount for current parent
      children_counts <- data %>%
        filter(parent == parent_name) %>%
        pull(speciesCount)
      
      # Calculate total sum excluding NAs
      sum_count <- sum(children_counts, na.rm = TRUE)
      
      # Update speciesCount if it's NA or different from the calculated sum
      if (is.na(data$speciesCount[i]) || data$speciesCount[i] != sum_count) {
        data$speciesCount[i] <- sum_count
        updated <- TRUE
      }
    }
    
    # Check if we need to continue updating
    needs_update <- which(is.na(data$speciesCount))
    if (!updated || length(needs_update) == 0) {
      break
    }
  }
  
  return(data)
}

# Fill speciesCount values
final_df <- fill_species_counts(df)

Solution

  • Recursive implementation because why not? The graph solution is probably more efficient I might guess.

    Seems to work at least for this super clean subset of the data provided.

    # data with id's
    data$index <- seq_len(nrow(data))
    
    # Function to get species count of one row
      # Where x = index
      # df is always the full dataset
    get_species_count <- function(x, df) {
    
      # If synonym, speciesCount <- NA
      if (df[x, "taxonomicStatus"] %in% "synonym") return(NA)
    
      # If species, speciesCount <- 1
      if (df[x, "taxonRank"] %in% "species") return(1)
    
      # else, gather every child
      no.synonyms <- df[!(df$taxonomicStatus %in% "synonym"), ]
      all.children <- no.synonyms[no.synonyms$parent %in% df[x, "name"], ]
    
      # if no children, speciesCount <- 0
      if (nrow(all.children) == 0) return(0)
    
      # else, recurse through each child
      lapply(
        X = as.numeric(all.children$index),
        FUN = get_species_count,
        df = df
      ) |> unlist() |> sum(na.rm = TRUE)
    
    }
    
    # Apply get_species_count() to every row
    data$speciesCount <- lapply(
      X = seq_len(nrow(data)),
      FUN = get_species_count,
      df = data
    ) |>
      unlist()
    
    # Remove index
    data$index <- NULL
    
      taxonomicStatus taxonRank                     name         parent speciesCount
    1        accepted     order            Apusomonadida         Obazoa            2
    2        accepted    family           Apusomonadidae  Apusomonadida            2
    3        accepted     genus            Amastigomonas Apusomonadidae            1
    4        accepted   species Amastigomonas marisrubri  Amastigomonas            1
    5        accepted subfamily           Apusomonadinae Apusomonadidae            1
    6        accepted     genus               Apusomonas Apusomonadinae            1
    7        accepted   species   Apusomonas proboscidea     Apusomonas            1
    8         synonym     genus              Rostromonas Apusomonadinae           NA
    9         synonym   species  Rostromonas proboscidea    Rostromonas           NA