r regex

Recreating Regex Functions by Hand


As a learning exercise, I am trying to re-create a regular expression match in R by hand, without using the regex functions.

For example, suppose I have this character vector:

# Sample inputs: two with a separated match, one with no match at all, and
# one whose match has no separator.
var1 <- c(
  "111 222 a1C 5b2",
  "B2G-6l3 atttr",
  "nothing here",
  "something P2b5p2 something"
)

I want to check whether each element contains this consecutive pattern: letter, digit, letter, an optional separator (a space, a hyphen, or nothing at all), digit, letter, digit.

I tried to manually define conditions for this problem:

# cond_1: every ASCII letter, lowercase a-z followed by uppercase A-Z.
cond_1 <- c(letters, LETTERS)

# cond_2: the ten decimal digits, each as a one-character string.
cond_2 <- as.character(0:9)

# cond_3: allowed separators -- the empty string, a space, or a hyphen.
cond_3 <- c("", " ", "-")

Then, I tried to write a loop to check if each element in var1 satisfies these conditions:

# Result columns, grown by one entry per element of var1.
original_value <- c()
pattern_found <- c()
value <- c()

for (i in var1) {
    # Break the current string into a vector of single characters.
    chars <- strsplit(i, "")[[1]]
    
    found <- FALSE
    
 
    # Slide a 7-character window over the string; j is the window start.
    # NOTE(review): for a string shorter than 7 characters the upper bound is
    # below 1, and `1:n` then counts downwards instead of being empty.
    for (j in 1:(length(chars) - 6)) {
        # Check if the pattern is found: letter, digit, letter, separator,
        # digit, letter, digit at positions j .. j+6.
        # NOTE(review): every element of chars is exactly one character, so
        # chars[j+3] can equal " " or "-" but never "" -- a pattern with no
        # separator is only 6 characters long and cannot pass this
        # 7-position test.
        if (chars[j] %in% cond_1 && chars[j+1] %in% cond_2 && chars[j+2] %in% cond_1 &&
            chars[j+3] %in% cond_3 && chars[j+4] %in% cond_2 && chars[j+5] %in% cond_1 &&
            chars[j+6] %in% cond_2) {
            found <- TRUE
            break
        }
    }
    
  
    # Record the input, a yes/no flag and, on success, the matched window.
    # After a `break`, j still holds the start index of the matching window.
    original_value <- c(original_value, i)
    pattern_found <- c(pattern_found, ifelse(found, "yes", "no"))
    value <- c(value, ifelse(found, paste(chars[j:(j+6)], collapse = ""), NA))
}


# Combine the three result vectors into one data frame, one row per input.
df <- data.frame(original_value, pattern_found, value)

The code seems to have partly worked, but it misses the fourth element, where the match (`P2b5p2`) has no separator:

              original_value pattern_found   value
1            111 222 a1C 5b2           yes a1C 5b2
2              B2G-6l3 atttr           yes B2G-6l3
3               nothing here            no    <NA>
4 something P2b5p2 something            no    <NA>

How can I fix this?

PS: Here is the classic regex approach:

# Letter, digit, letter, any run of separators (hyphen, space, comma or
# underscore -- possibly none), then digit, letter, digit.
pattern <- "[a-zA-Z]\\d[a-zA-Z][- ,_]*\\d[a-zA-Z]\\d"

# regexpr() is vectorised over var1: a single call locates the first match in
# every element (-1 where there is none), so neither a loop nor vectors grown
# with c() are needed, and the regex runs once per element instead of twice.
match_pos <- regexpr(pattern, var1)
has_match <- !is.na(match_pos) & match_pos != -1

# unname() keeps the data frame's row names as plain 1..n even if var1 is named.
original_value <- unname(var1)
pattern_found <- ifelse(has_match, "yes", "no")

# regmatches() returns only the substrings that actually matched, in input
# order, so they are written into the matching slots; the rest stay NA.
value <- rep(NA_character_, length(var1))
value[has_match] <- regmatches(var1, match_pos)

df <- data.frame(original_value, pattern_found, value)

Solution

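  • You could build the regex pattern by hand, turning each of your source vectors into a character class. Note that the empty string is dropped from cond_3: the "no separator" case is expressed by the `?` quantifier placed after that class instead, e.g.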

    # Same character sets as in the question, minus the empty string in
    # cond_3: "no separator" is expressed with the `?` quantifier below.
    cond_1 <- c(letters, LETTERS)
    cond_2 <- as.character(0:9)
    cond_3 <- c(" ", "-")

    # Collapse a vector of single characters into a bracketed character
    # class, e.g. c("a", "b") becomes "[ab]".
    as_char_class <- function(chars) {
      paste0("[", paste(chars, collapse = ""), "]")
    }

    r1 <- as_char_class(cond_1)
    r2 <- as_char_class(cond_2)
    r3 <- as_char_class(cond_3)

    # letter, digit, letter, optional separator, digit, letter, digit
    regex <- paste0(r1, r2, r1, r3, "?", r2, r1, r2)
    regex
    
    [1] "[abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][0123456789][abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][ -]?[0123456789][abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][0123456789]"