r string gsub grepl

Adding leading zeros in the character and numeric mixed column names in R


I have a data frame with some column names like: "Sample_ID", "Time00", "X7236Nr1", "Y844Nr1856", "X9834Nr21", "S844Nr567"

I want to add leading zeros to the digits after "Nr", so that they all become 4-digit numbers: "Sample_ID", "Time00", "X7236Nr0001", "Y844Nr1856", "X9834Nr0021", "S844Nr0567"

I tried to use rename_at to select the columns and apply the appropriate function such as sprintf,

df %>% rename_at(vars(starts_with("[A-B][0-9]")), ~ FUNCTION)

but I could not build the correct function. Can you please advise a way to deal with this kind of mixed string?

Thanks in advance


Solution

  • In one row

    # your data.frame
    df <- data.frame(Sample_ID = 1, Time00 = 1, X7236Nr1 = 1, Y844Nr1856 = 1, X9834Nr21 = 1, S844Nr567 = 1)

    # Zero-pad the digits that follow the first "Nr" in each name (base R only).
    #
    # nms:   character vector of names.
    # width: minimum number of digits after "Nr" (default 4).
    # Returns a character vector of the same length as `nms`. Names without
    # "Nr<digits>" are returned unchanged, and any text before or after the
    # "Nr<digits>" part is preserved.
    pad_nr_digits <- function(nms, width = 4L) {
      hits <- regexpr("Nr\\d+", nms)
      # regexpr() reports -1 for "no match"; nothing to rewrite in that case.
      if (!any(hits > 0L, na.rm = TRUE)) {
        return(nms)
      }
      found <- regmatches(nms, hits)
      digits <- substring(found, 3L) # drop the leading "Nr"
      # Pad as text rather than via as.numeric()/sprintf("%04d"), so long digit
      # runs cannot turn into NA and existing leading zeros are kept.
      zeros <- strrep("0", pmax(0L, width - nchar(digits)))
      regmatches(nms, hits) <- paste0("Nr", zeros, digits)
      nms
    }

    # Rename in place; the columns themselves are left untouched.
    names(df) <- pad_nr_digits(names(df))
    

    Accepted Answer

    I have a dataframe with some column names like; "Sample_ID", "Time00", "X7236Nr1", "Y844Nr1856", "X9834Nr21", "S844Nr567"

    You can do it by using str_replace_all with a replacement function: str_match extracts the number that follows "Nr", and str_pad() pads that number to 4 digits with zeros.

    library(dplyr)
    library(stringr)

    # your data.frame
    df <- data.frame(Sample_ID = 1, Time00 = 1, X7236Nr1 = 1, Y844Nr1856 = 1, X9834Nr21 = 1, S844Nr567 = 1)

    # Rename every column: wherever a name contains "Nr" followed by digits,
    # left-pad those digits with zeros to at least 4 characters.
    df <- df %>%
      rename_with(~ str_replace_all(., "Nr(\\d+)", function(hit) {
        # The callback only receives substrings that matched the pattern, so
        # names without "Nr<digits>" never reach it and stay unchanged.
        # `hit` may hold several matches from one name, so take the whole
        # capture-group column ([, 2]) instead of a single matrix cell.
        digits <- str_match(hit, "Nr(\\d+)")[, 2]
        paste0("Nr", str_pad(digits, 4, pad = "0"))
      }))
    
    ### Result
    > colnames(df)
    "Sample_ID"   "Time00"      "X7236Nr0001" "Y844Nr1856"  "X9834Nr0021" "S844Nr0567" 
    
    # Explanations
    > str_match("Y844Nr0856", "Nr(\\d+)")
         [,1]     [,2]  
    [1,] "Nr0856" "0856"
     
    > str_match("Time00", "Nr(\\d+)") # has NA as match[,2], therefore we will not replace anything
         [,1] [,2]
    [1,] NA   NA  
     
    > str_pad("856", 4, pad = "0") # could also use sprintf()
    [1] "0856"