I have some old data files with a large number of writing errors. An example is attached below, where some of the numbers in the .dat file are 9 digits or characters long rather than the expected 8.
I want to use regex to find these cases and chop off the 9th digit or character for elements in the 2nd and 7th lines. I don't see what's wrong with my code, but nothing happens when I run it.
library(strex)
# Sample records from the .dat file, one character string per element.
# Some string literals span two source lines, so those elements contain an
# embedded newline. Elements 2 and 7 hold the malformed 9-character tokens
# ("09+082.72", "12+0.0000", "03+0500..", "06-0.1043").
test_lines <- c(
"01+0123. 02+0337. 03+0200. 04-4.493 05+089.3 06-0.064 07+2.872
08+2.755",
"09+082.72 10+16.34 11+0.000 12+0.0000",
"01+0123. 02+0337. 03+0300.
04-5.095 05+091.3 06-0.075 07+2.981 08+2.853",
"09+092.5 10+16.79 11+0.000
12+0.000",
"01+0123. 02+0337. 03+0400. 04-4.983 05+092.0 06-0.091 07+3.600
08+3.241",
"09+112.4 10+25.58 11+0.000 12+0.000",
"01+0123. 02+0337. 03+0500..
04-4.941 05+091.8 06-0.1043 07+2.758 08+2.201",
"09+111.8 10+36.39 11+0.000
12+0.000",
"01+0123. 02+0337. 03+0600. 04-5.026 05+092.3 06-0.120 07+4.678
08+4.463",
"09+114.5 10+17.36 11+0.000 12+0.000"
)
# Attempt to truncate every 9-character token in test_lines to 8 characters.
# Indices of the elements that contain a run of 9 non-whitespace characters.
bv_9_lines <- grep("\\S{9}", test_lines)
if (length(bv_9_lines)) {
for (p in bv_9_lines) {
line_to_fix <- test_lines[p]
# Every 9-character run found in this element.
occs <- unlist(str_extract_all(line_to_fix, "\\S{9}"))
for (o in 1:length(occs)) {
# NOTE(review): occs[o] is passed as the pattern, so it is parsed as a
# regular expression rather than literal text. In a token such as
# "09+082.72" the `+` acts as a quantifier and the `.` as a wildcard, so the
# pattern does not match the token it was extracted from and nothing is
# replaced. Wrapping it as fixed(occs[o]) would match it literally.
line_to_fix <- str_replace(
line_to_fix,
occs[o],
substr(occs[o], 1, 8)
)
}
# Write the (possibly unchanged) element back.
test_lines[p] <- line_to_fix
}
}
I have also tried using gsub, as shown below, but that fails as well.
# NOTE(review): gsub() also interprets its first argument as a regular
# expression by default, so `+` and `.` in occs[o] are metacharacters here
# too; passing fixed = TRUE would match the token literally.
line_to_fix <- gsub(occs[o], substr(occs[o], 1, 8), line_to_fix)
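The loop in the question fails because each extracted token is passed back as a regular expression, where `+` and `.` are metacharacters, so a token such as "09+082.72" does not match itself. Instead, we can use str_replace_all from stringr or gsubfn from the gsubfn package to apply substr directly to every run of 9 non-whitespace characters, keeping only its first 8.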
Assuming the exact whitespace is not important, we normalize it with str_squish before checking that each result is the same as the expected output. See the Note at the end, where the inputs are copied from the question.
## 1
library(stringr)
# Replace every run of 9 non-whitespace characters with its first 8
# characters; the function receives each match and returns the replacement.
res1 <- str_replace_all(test_lines, "\\S{9}", \(x) substr(x, 1, 8))
# Check this solution's own result (res1). Whitespace is squished on both
# sides because the embedded line breaks in test_lines and expected differ.
identical(str_squish(res1), str_squish(expected))
## [1] TRUE
## 2
library(gsubfn)
# Same replacement with gsubfn: the formula is applied to each match of the
# pattern and its value is substituted for that match.
res2 <- gsubfn("\\S{9}", ~ substr(x, 1, 8), test_lines)
# Check this solution's own result (res2). Whitespace is squished on both
# sides because the embedded line breaks in test_lines and expected differ.
identical(stringr::str_squish(res2), stringr::str_squish(expected))
## [1] TRUE
# Note: input copied from the question. Elements 2 and 7 contain the
# 9-character tokens that need truncating; some elements contain an embedded
# newline because their string literal spans two source lines.
test_lines <- c(
"01+0123. 02+0337. 03+0200. 04-4.493 05+089.3 06-0.064 07+2.872
08+2.755",
"09+082.72 10+16.34 11+0.000 12+0.0000",
"01+0123. 02+0337. 03+0300.
04-5.095 05+091.3 06-0.075 07+2.981 08+2.853",
"09+092.5 10+16.79 11+0.000
12+0.000",
"01+0123. 02+0337. 03+0400. 04-4.983 05+092.0 06-0.091 07+3.600
08+3.241",
"09+112.4 10+25.58 11+0.000 12+0.000",
"01+0123. 02+0337. 03+0500..
04-4.941 05+091.8 06-0.1043 07+2.758 08+2.201",
"09+111.8 10+36.39 11+0.000
12+0.000",
"01+0123. 02+0337. 03+0600. 04-5.026 05+092.3 06-0.120 07+4.678
08+4.463",
"09+114.5 10+17.36 11+0.000 12+0.000"
)
# Expected output: the same records with each 9-character token cut to its
# first 8 characters. The embedded line breaks fall in different places than
# in test_lines, so compare only after normalizing whitespace.
expected <- c(
"01+0123. 02+0337. 03+0200. 04-4.493 05+089.3 06-0.064 07+2.872
08+2.755",
"09+082.7 10+16.34 11+0.000 12+0.000",
"01+0123. 02+0337. 03+0300. 04-5.095 05+091.3 06-0.075 07+2.981
08+2.853",
"09+092.5 10+16.79 11+0.000 12+0.000",
"01+0123. 02+0337. 03+0400. 04-4.983 05+092.0 06-0.091 07+3.600
08+3.241",
"09+112.4 10+25.58 11+0.000 12+0.000",
"01+0123. 02+0337. 03+0500. 04-4.941 05+091.8 06-0.104 07+2.758
08+2.201",
"09+111.8 10+36.39 11+0.000 12+0.000",
"01+0123. 02+0337. 03+0600. 04-5.026 05+092.3 06-0.120 07+4.678
08+4.463",
"09+114.5 10+17.36 11+0.000 12+0.000"
)