rdataframepartial-matches

Create new variable based on partial string matching in column R


I have a large df with Columns: "Gene, variable, value" to which I would like to add a new variable "light_exposure". To assign the value to this new column I would like to see if there is a partial string match of "i13" within the variable column, and assign values of "yes" or "no" accordingly in the "light_exposure" column.

The column "variable" contains subject IDs and a typical one looks like "sem_outdir_NAi13t3n27_S4"

This is the code I thought would work. However it assigns yes in the new column even when there is not a match present.

df_long <- df_long %>% 
  group_by(Gene) %>% 
  mutate(Light_exposure = case_when(sum(grep("i13", paste(variable))) > 0 ~ "yes",
                                    TRUE ~ "no")) %>% ungroup()
df <- structure(list(rsem_outdir_NAi13t3n2_S1 = c(756.624, 0, 357.557, 
1.1, 0, 0), rsem_outdir_NAi13t3n14_S2 = c(906.464, 0, 352.441, 
1.022, 0, 0), rsem_outdir_NAi13t3n26_S3 = c(307.957, 0, 211.422, 
1.04, 0, 0), rsem_outdir_NAi13t3n27_S4 = c(739.13, 0, 268.329, 
1.243, 0, 0), rsem_outdir_NAi13t3n31_S5 = c(206.39, 0, 205.709, 
1.052, 0, 0), rsem_outdir_NAi6t3n1_S6 = c(154.113, 0, 247.21, 
1.458, 0, 0), rsem_outdir_NAi6t3n10_S7 = c(220.125, 0, 346.917, 
0.378, 0, 0), rsem_outdir_NAi6t3n16_S8 = c(554.147, 0, 252.998, 
0.503, 0, 0), rsem_outdir_NAi6t3n24_S9 = c(317.122, 0, 325.901, 
1.32, 0, 0), rsem_outdir_NAi6t3n29_S10 = c(183.428, 0, 216.055, 
1.47, 0, 0), rsem_outdir_NAi13t12n7_S11 = c(125.687, 0, 157.471, 
0.41, 0, 0), rsem_outdir_NAi13t12n10_S12 = c(403.675, 0, 224.673, 
1.034, 0, 0), rsem_outdir_NAi13t12n11_S13 = c(1387.289, 0, 429.794, 
1.657, 0, 0), rsem_outdir_NAi13t12n16_S14 = c(470.699, 0, 145.129, 
0.949, 0, 0), rsem_outdir_NAi13t12n19_S15 = c(401.953, 0, 164.364, 
1.143, 0, 0), rsem_outdir_NAi6t12n3_S16 = c(534.054, 0, 240.231, 
0.814, 0, 0), rsem_outdir_NAi6t12n4_S17 = c(420.981, 0, 204.186, 
0.866, 0, 0), rsem_outdir_NAi6t12n18_S18 = c(509.926, 0, 263.276, 
0.716, 0, 0), rsem_outdir_NAi6t12n23_S19 = c(603.937, 0, 249.055, 
1.038, 0, 0), rsem_outdir_NAi6t12n27_S20 = c(374.14, 0, 293.112, 
1.163, 0, 0)), row.names = c("TRINITY_DN0_c0_g1", "TRINITY_DN0_c13_g1", 
"TRINITY_DN0_c1_g1", "TRINITY_DN0_c1_g2", "TRINITY_DN0_c2_g1", 
"TRINITY_DN0_c40_g1"), class = "data.frame")))


> df

Not sure if this is helpful but here is the sessionInfo

R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.6 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
 [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
[10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] reshape_0.8.9   forcats_0.5.1   stringr_1.5.0   dplyr_1.1.4     purrr_1.0.2    
 [6] readr_2.1.2     tidyr_1.3.0     tibble_3.2.1    ggplot2_3.5.0   tidyverse_1.3.1

loaded via a namespace (and not attached):
 [1] tidyselect_1.2.0 xfun_0.39        haven_2.4.3      colorspace_2.1-0
 [5] vctrs_0.6.5      generics_0.1.3   htmltools_0.5.5  yaml_2.3.7      
 [9] utf8_1.2.3       rlang_1.1.2      pillar_1.9.0     glue_1.6.2      
[13] withr_2.5.0      DBI_1.1.1        dbplyr_2.1.1     modelr_0.1.8    
[17] readxl_1.3.1     plyr_1.8.6       lifecycle_1.0.3  munsell_0.5.0   
[21] gtable_0.3.3     cellranger_1.1.0 rvest_1.0.2      evaluate_0.23   
[25] knitr_1.43       tzdb_0.2.0       fastmap_1.1.1    fansi_1.0.4     
[29] broom_1.0.1      Rcpp_1.0.10      scales_1.3.0     backports_1.3.0 
[33] jsonlite_1.8.8   fs_1.6.1         hms_1.1.1        digest_0.6.33   
[37] stringi_1.7.12   grid_4.1.2       cli_3.6.1        tools_4.1.2     
[41] magrittr_2.0.3   crayon_1.5.2     pkgconfig_2.0.3  ellipsis_0.3.2  
[45] xml2_1.3.3       reprex_2.0.1     lubridate_1.8.0  assertthat_0.2.1
[49] rmarkdown_2.21   httr_1.4.5       rstudioapi_0.13  R6_2.5.1        
[53] compiler_4.1.2  

Solution

  • I suspect you are looking for

    df |> 
      tibble::rownames_to_column("Gene") |>
      tidyr::pivot_longer(cols = -Gene, 
                          names_to = "Variable", values_to = "Value") |>
      dplyr::mutate(Light_exposure = grepl("i13", Variable))
    

    giving

    # A tibble: 120 × 4
       Gene              Variable  Value Light_exposure
       <chr>             <chr>     <dbl> <lgl>         
     1 TRINITY_DN0_c0_g1 rsem_out…  757. TRUE          
     2 TRINITY_DN0_c0_g1 rsem_out…  906. TRUE          
     3 TRINITY_DN0_c0_g1 rsem_out…  308. TRUE          
     4 TRINITY_DN0_c0_g1 rsem_out…  739. TRUE          
     5 TRINITY_DN0_c0_g1 rsem_out…  206. TRUE          
     6 TRINITY_DN0_c0_g1 rsem_out…  154. FALSE         
     7 TRINITY_DN0_c0_g1 rsem_out…  220. FALSE         
     8 TRINITY_DN0_c0_g1 rsem_out…  554. FALSE         
     9 TRINITY_DN0_c0_g1 rsem_out…  317. FALSE         
    10 TRINITY_DN0_c0_g1 rsem_out…  183. FALSE         
    # ℹ 110 more rows
    # ℹ Use `print(n = ...)` to see more rows
    

    We first move the row names to a new variable Gene to keep track of them. We then pivot the data to long format, and finally identify which strings (Variable) contain i13.

    Notice that, it is easier and more common to work with logical TRUE/FALSE than with character "no"/"yes".


    Input

    df <- structure(list(rsem_outdir_NAi13t3n2_S1 = c(756.624, 0, 357.557, 
    1.1, 0, 0), rsem_outdir_NAi13t3n14_S2 = c(906.464, 0, 352.441, 
    1.022, 0, 0), rsem_outdir_NAi13t3n26_S3 = c(307.957, 0, 211.422, 
    1.04, 0, 0), rsem_outdir_NAi13t3n27_S4 = c(739.13, 0, 268.329, 
    1.243, 0, 0), rsem_outdir_NAi13t3n31_S5 = c(206.39, 0, 205.709, 
    1.052, 0, 0), rsem_outdir_NAi6t3n1_S6 = c(154.113, 0, 247.21, 
    1.458, 0, 0), rsem_outdir_NAi6t3n10_S7 = c(220.125, 0, 346.917, 
    0.378, 0, 0), rsem_outdir_NAi6t3n16_S8 = c(554.147, 0, 252.998, 
    0.503, 0, 0), rsem_outdir_NAi6t3n24_S9 = c(317.122, 0, 325.901, 
    1.32, 0, 0), rsem_outdir_NAi6t3n29_S10 = c(183.428, 0, 216.055, 
    1.47, 0, 0), rsem_outdir_NAi13t12n7_S11 = c(125.687, 0, 157.471, 
    0.41, 0, 0), rsem_outdir_NAi13t12n10_S12 = c(403.675, 0, 224.673, 
    1.034, 0, 0), rsem_outdir_NAi13t12n11_S13 = c(1387.289, 0, 429.794, 
    1.657, 0, 0), rsem_outdir_NAi13t12n16_S14 = c(470.699, 0, 145.129, 
    0.949, 0, 0), rsem_outdir_NAi13t12n19_S15 = c(401.953, 0, 164.364, 
    1.143, 0, 0), rsem_outdir_NAi6t12n3_S16 = c(534.054, 0, 240.231, 
    0.814, 0, 0), rsem_outdir_NAi6t12n4_S17 = c(420.981, 0, 204.186, 
    0.866, 0, 0), rsem_outdir_NAi6t12n18_S18 = c(509.926, 0, 263.276, 
    0.716, 0, 0), rsem_outdir_NAi6t12n23_S19 = c(603.937, 0, 249.055, 
    1.038, 0, 0), rsem_outdir_NAi6t12n27_S20 = c(374.14, 0, 293.112, 
    1.163, 0, 0)), row.names = c("TRINITY_DN0_c0_g1", "TRINITY_DN0_c13_g1", 
    "TRINITY_DN0_c1_g1", "TRINITY_DN0_c1_g2", "TRINITY_DN0_c2_g1", 
    "TRINITY_DN0_c40_g1"), class = "data.frame")))