I have novice experience with programming and the R language. I have the following sample of my dataset:
library(tidyr)
library(dplyr)
subsest_df <- data.frame(
Condition = c("Oxidative", "Oxidative", "Oxidative", "Oxidative", "Oxidative",
"Oxidative", "Oxidative", "Oxidative"),
gene_name = c("Atu0472", "Atu0472", "Atu0477", "Atu0477", "Atu0479", "Atu0479", "Atu0479", "Atu0479"),
column = c("1", "1_description", "1", "1_description", "1", "2", "1_description", "2_description"),
COG = c("M", "Cell wall/membrane/envelope biogenesis", NA, NA, "E", "T", "Amino acid metabolism and transport", " Signal transduction"),
COG_description = c(NA, "Cell wall/membrane/envelope biogenesis", NA, NA, NA, NA, "Amino acid metabolism and transport", "Signal transduction"))
I would like to move the non-NA values in the COG_description column to be in the same row as a corresponding "1" or "2" in the "column" column, instead of the "1_description" or "2_description" rows, but maintaining the same matching values of "Condition" and "gene_name". The output I would like should look like this:
ideal_df <- data.frame(
Condition = c("Oxidative", "Oxidative", "Oxidative", "Oxidative"),
gene_name = c("Atu0472", "Atu0477", "Atu0479", "Atu0479"),
column = c("1", "1", "1", "2"),
COG = c("M", "NA", "E", "T"),
COG_description = c("Cell wall/membrane/envelope biogenesis", NA, "Amino acid metabolism and transport", "Signal transduction"))
With the help of our friendly robot chatgpt, we came up with an iterative structure for this but it ultimately fails with an empty output, and I am not skilled enough to figure out where the failure is occurring or why or how to ask better questions.
# Initialize an empty dataframe to store the transformed data
transformed_df <- data.frame(
Condition = character(),
gene_name = character(),
column = character(),
COG = character(),
COG_description = character(),
stringsAsFactors = FALSE
)
# Iterate over each row
for (i in 1:nrow(subset_df)) {
# Check if the current row is a description row
if (grepl("_description", subset_df$column[i])) {
# Extract the corresponding "_description" value
description_value <- subset_df$COG_description[i]
# Find the corresponding row with "1", "2", or "3" in the "column" column
corresponding_row <- subset_df %>%
filter(Condition == subset_df$Condition[i],
gene_name == subset_df$gene_name[i],
grepl("^\\d$", column)) # Matches only "1", "2", or "3" without "_description"
# Check if a corresponding row is found
if (nrow(corresponding_row) > 0) {
# Add the corresponding row to the transformed dataframe with COG_description
transformed_df <- rbind(transformed_df, corresponding_row %>% mutate(COG_description = description_value))
}
}
}
# View the transformed dataframe
print(transformed_df)
Thank you in advance for the help!
When adding one group based on column fill
works for the wanted rows.
library(dplyr)
library(tidyr)
df %>%
group_by(Condition, gene_name, grp = sub(".*(\\d+).*", "\\1", column)) %>%
fill(COG_description, .direction="updown") %>%
filter(grepl("^\\d+$", column)) %>%
ungroup() %>%
select(-grp)
output
# A tibble: 4 × 5
Condition gene_name column COG COG_description
<chr> <chr> <chr> <chr> <chr>
1 Oxidative Atu0472 1 M Cell wall/membrane/envelope biogenesis
2 Oxidative Atu0477 1 NA NA
3 Oxidative Atu0479 1 E Amino acid metabolism and transport
4 Oxidative Atu0479 2 T Signal transduction