Tags: r, regex, replace, gsub

How to use regex in R to shorten strings by one space


I have some old data files with a large number of writing errors. An example is attached below, where some of the numbers in the .dat file are 9 digits or characters long rather than the expected 8.

I want to use regex to find these cases and chop off the 9th digit or character for elements in the 2nd and 7th lines. I don't see what's wrong with my code, but nothing happens when I run it.

library(strex)

# Sample records from the .dat file. Several strings span two source lines,
# so they contain an embedded newline followed by indentation. Elements 2 and
# 7 hold the 9-character fields ("09+082.72", "12+0.0000", "03+0500..",
# "06-0.1043") that should be only 8 characters long.
test_lines <- c(
  "01+0123.  02+0337.  03+0200.  04-4.493  05+089.3  06-0.064  07+2.872
   08+2.755",
  "09+082.72  10+16.34  11+0.000  12+0.0000",
  "01+0123.  02+0337.  03+0300.  
   04-5.095  05+091.3  06-0.075  07+2.981  08+2.853",
  "09+092.5  10+16.79  11+0.000  
   12+0.000",
  "01+0123.  02+0337.  03+0400.  04-4.983  05+092.0  06-0.091  07+3.600  
   08+3.241",
  "09+112.4  10+25.58  11+0.000  12+0.000",
  "01+0123.  02+0337.  03+0500..  
   04-4.941  05+091.8  06-0.1043  07+2.758  08+2.201",
  "09+111.8  10+36.39  11+0.000  
   12+0.000",
  "01+0123.  02+0337.  03+0600.  04-5.026  05+092.3  06-0.120  07+4.678  
   08+4.463",
  "09+114.5  10+17.36  11+0.000  12+0.000"
)

# Indices of the elements that contain a run of 9 non-whitespace characters.
bv_9_lines <- grep("\\S{9}", test_lines)

if (length(bv_9_lines)) {
  for (p in bv_9_lines) {
    line_to_fix <- test_lines[p]
    # Every 9-character run found in this element, as a character vector.
    occs <- unlist(str_extract_all(line_to_fix, "\\S{9}"))
    
    for (o in 1:length(occs)) {
      # NOTE(review): str_replace() interprets its pattern argument as a
      # regular expression by default, so the `+` and `.` inside values such
      # as "09+082.72" act as metacharacters rather than literal text. That
      # would explain why no replacement happens -- confirm against the
      # stringr documentation (see fixed()).
      line_to_fix <- str_replace(
        line_to_fix,
        occs[o],
        substr(occs[o], 1, 8)
      )
    }
    
    test_lines[p] <- line_to_fix
  }
}

I have also tried using gsub, as shown below, but that fails as well.

# NOTE(review): gsub() likewise treats its first argument as a regular
# expression unless fixed = TRUE is passed -- confirm in ?gsub.
line_to_fix <- gsub(occs[o], substr(occs[o], 1, 8), line_to_fix)

Solution

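  • The original loop changes nothing because str_replace (and gsub) treat occs[o] as a regular expression, not as literal text: in a value such as 09+082.72 the + is a quantifier and the . matches any character, so the pattern never matches the string it was extracted from. Instead of escaping each match, we can use str_replace_all from stringr or gsubfn from gsubfn to apply substr directly to every run of 9 non-whitespace characters.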

    Assuming the exact whitespace is not important, we normalize it with str_squish before checking that each result matches the expected output. See the Note at the end, which reproduces the inputs from the question.

    ## 1
    library(stringr)
    # Replace every run of 9 non-whitespace characters with its first 8.
    res1 <- str_replace_all(test_lines, "\\S{9}", \(x) substr(x, 1, 8))
    
    # Verify this section's own result (res1); whitespace is normalized first
    # because the exact spacing and line breaks are not significant.
    identical(str_squish(res1), str_squish(expected))
    ## [1] TRUE
    
    ## 2
    library(gsubfn)
    # Same replacement with gsubfn; the formula's free variable x is the match.
    res2 <- gsubfn("\\S{9}", ~ substr(x, 1, 8), test_lines)
    
    # Verify this section's own result (res2). stringr is namespace-qualified
    # so that this section runs without library(stringr) being attached.
    identical(stringr::str_squish(res2), stringr::str_squish(expected))
    ## [1] TRUE
    

    Note

    # Input copied from the question. Several strings contain an embedded
    # newline plus indentation, and elements 2 and 7 hold 9-character fields.
    test_lines <- c(
      "01+0123.  02+0337.  03+0200.  04-4.493  05+089.3  06-0.064  07+2.872
       08+2.755",
      "09+082.72  10+16.34  11+0.000  12+0.0000",
      "01+0123.  02+0337.  03+0300.  
       04-5.095  05+091.3  06-0.075  07+2.981  08+2.853",
      "09+092.5  10+16.79  11+0.000  
       12+0.000",
      "01+0123.  02+0337.  03+0400.  04-4.983  05+092.0  06-0.091  07+3.600  
       08+3.241",
      "09+112.4  10+25.58  11+0.000  12+0.000",
      "01+0123.  02+0337.  03+0500..  
       04-4.941  05+091.8  06-0.1043  07+2.758  08+2.201",
      "09+111.8  10+36.39  11+0.000  
       12+0.000",
      "01+0123.  02+0337.  03+0600.  04-5.026  05+092.3  06-0.120  07+4.678  
       08+4.463",
      "09+114.5  10+17.36  11+0.000  12+0.000"
    )
    
    # Expected output: the same records with each 9-character field cut to its
    # first 8 characters. Line breaks and spacing differ from test_lines in
    # places, so compare only after normalizing whitespace (str_squish).
    expected <-  c(
      "01+0123.  02+0337.  03+0200.  04-4.493  05+089.3  06-0.064  07+2.872
       08+2.755",
      "09+082.7  10+16.34  11+0.000  12+0.000",
      "01+0123.  02+0337.  03+0300.  04-5.095  05+091.3  06-0.075  07+2.981
       08+2.853",
      "09+092.5  10+16.79  11+0.000  12+0.000",
      "01+0123.  02+0337.  03+0400.  04-4.983  05+092.0  06-0.091  07+3.600  
       08+3.241",
      "09+112.4  10+25.58  11+0.000  12+0.000",
      "01+0123.  02+0337.  03+0500.  04-4.941  05+091.8  06-0.104  07+2.758 
       08+2.201",
      "09+111.8  10+36.39  11+0.000  12+0.000",
      "01+0123.  02+0337.  03+0600.  04-5.026  05+092.3  06-0.120  07+4.678  
       08+4.463",
      "09+114.5  10+17.36  11+0.000  12+0.000"
    )