r dplyr tidyr

Moving values from a given row in one column so they line up with a different row in R


I have novice experience with programming and the R language. I have the following sample of my dataset:

library(tidyr)
library(dplyr)

# Sample input: each gene has value rows (column "1", "2", ...) and matching
# "<n>_description" rows that carry the text for COG_description.
# Named `subset_df` so it matches the name the processing code reads from.
subset_df <- data.frame(
  Condition = c(
    "Oxidative", "Oxidative", "Oxidative", "Oxidative",
    "Oxidative", "Oxidative", "Oxidative", "Oxidative"
  ),
  gene_name = c(
    "Atu0472", "Atu0472", "Atu0477", "Atu0477",
    "Atu0479", "Atu0479", "Atu0479", "Atu0479"
  ),
  column = c(
    "1", "1_description", "1", "1_description",
    "1", "2", "1_description", "2_description"
  ),
  COG = c(
    "M", "Cell wall/membrane/envelope biogenesis", NA, NA,
    "E", "T", "Amino acid metabolism and transport", " Signal transduction"
  ),
  COG_description = c(
    NA, "Cell wall/membrane/envelope biogenesis", NA, NA,
    NA, NA, "Amino acid metabolism and transport", "Signal transduction"
  )
)

I would like to move the non-NA values in the COG_description column to be in the same row as a corresponding "1" or "2" in the "column" column, instead of the "1_description" or "2_description" rows, but maintaining the same matching values of "Condition" and "gene_name". The output I would like should look like this:

# Desired result: one row per (Condition, gene_name, column number), with the
# description moved onto the value row.
ideal_df <- data.frame(
  Condition = c("Oxidative", "Oxidative", "Oxidative", "Oxidative"),
  gene_name = c("Atu0472", "Atu0477", "Atu0479", "Atu0479"),
  column = c("1", "1", "1", "2"),
  # Atu0477 has no COG in the input, so this is a real missing value (NA),
  # not the literal string "NA".
  COG = c("M", NA, "E", "T"),
  COG_description = c(
    "Cell wall/membrane/envelope biogenesis", NA,
    "Amino acid metabolism and transport", "Signal transduction"
  )
)

With the help of our friendly robot chatgpt, we came up with an iterative structure for this but it ultimately fails with an empty output, and I am not skilled enough to figure out where the failure is occurring or why or how to ask better questions.

# Build `transformed_df`: for every "<n>_description" row, find the "<n>" row
# with the same Condition and gene_name and attach the description to it.

# Empty, typed template so the result keeps its columns even when nothing
# matches.
transformed_df <- data.frame(
  Condition = character(),
  gene_name = character(),
  column = character(),
  COG = character(),
  COG_description = character(),
  stringsAsFactors = FALSE
)

# Collect one piece per description row and bind once at the end; calling
# rbind() inside the loop re-copies the growing data frame every iteration.
matched_rows <- vector("list", nrow(subset_df))

# seq_len() yields an empty sequence for a zero-row input, whereas
# 1:nrow(...) would iterate over c(1, 0).
for (i in seq_len(nrow(subset_df))) {
  # Only description rows drive the transformation.
  if (!grepl("_description$", subset_df$column[i])) {
    next
  }

  # "2_description" -> "2": the value row this description belongs to.
  # Matching on this exact key (instead of any digit-only row) prevents a
  # gene with several numbered rows from receiving every description on
  # every row.
  target_column <- sub("_description$", "", subset_df$column[i])
  description_value <- subset_df$COG_description[i]

  # A description with no matching value row contributes a zero-row piece.
  matched_rows[[i]] <- subset_df %>%
    filter(
      Condition == subset_df$Condition[i],
      gene_name == subset_df$gene_name[i],
      column == target_column
    ) %>%
    mutate(COG_description = description_value)
}

# bind_rows() skips the NULL slots left by non-description rows.
transformed_df <- bind_rows(transformed_df, matched_rows)

# View the transformed dataframe
print(transformed_df)

Thank you in advance for the help!


Solution

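  • If you add a grouping variable derived from the number in `column`, `fill()` copies each description onto the row you want to keep; the "_description" rows can then be filtered out.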

    library(dplyr)
    library(tidyr)
    
    # `df` is the question's input data frame.
    # Pair each "<n>" row with its "<n>_description" row, copy the description
    # across the pair, then keep only the "<n>" rows.
    df %>%
      group_by(
        Condition,
        gene_name,
        # Anchor on the leading number. A greedy ".*(\\d+).*" keeps only the
        # last digit, so "12" and "12_description" would be grouped with "2".
        grp = sub("^(\\d+).*$", "\\1", column)
      ) %>%
      # The description may sit above or below its value row, so fill both
      # ways within each pair.
      fill(COG_description, .direction = "updown") %>%
      # Drop the "_description" rows; only purely numeric keys remain.
      filter(grepl("^\\d+$", column)) %>%
      ungroup() %>%
      select(-grp)
    

    output

    # A tibble: 4 × 5
      Condition gene_name column COG   COG_description                       
      <chr>     <chr>     <chr>  <chr> <chr>                                 
    1 Oxidative Atu0472   1      M     Cell wall/membrane/envelope biogenesis
    2 Oxidative Atu0477   1      NA    NA                                    
    3 Oxidative Atu0479   1      E     Amino acid metabolism and transport   
    4 Oxidative Atu0479   2      T     Signal transduction