cpcre2

How do I split a string with pcre2.h in C?


I would like to know how I can split a string in C with pcre2.h. I want to use the GPT-2 tokenizer regex. My code:

#define PCRE2_CODE_UNIT_WIDTH 8
#define PCRE2_STATIC

#include "pcre2.h"  
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


int main()
{
    pcre2_code* re;
    PCRE2_SIZE erroffset;
    int errcode;
    PCRE2_UCHAR8 buffer[128];

    int rc;
    PCRE2_SIZE* ovector;

    const char* pattern = "'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
    size_t pattern_size = strlen(pattern);

    const char* subject = "hello, i (am) a taco.";
    size_t subject_size = strlen(subject);
    uint32_t options = 0;

    pcre2_match_data* match_data;
    uint32_t ovecsize = 128;

    re = pcre2_compile(pattern, pattern_size, options, &errcode, &erroffset, NULL);
    if (re == NULL)
    {
        pcre2_get_error_message(errcode, buffer, 120);
        fprintf(stderr, "%d\t%s\n", errcode, buffer);
        return 1;
    }

    match_data = pcre2_match_data_create(ovecsize, NULL);
    rc = pcre2_match(re, subject, subject_size, 0, options, match_data, NULL);
    if (rc == 0) {
        fprintf(stderr, "offset vector too small: %d", rc);
    }
    else if (rc > 0)
    {
        ovector = pcre2_get_ovector_pointer(match_data);
        PCRE2_SIZE i;
        for (i = 0; i < rc; i++)
        {
            PCRE2_SPTR start = subject + ovector[2 * i];
            PCRE2_SIZE slen = ovector[2 * i + 1] - ovector[2 * i];
            printf("%2d: %.*s\n", i, (int)slen, (char*)start);
        }
    }
    else if (rc < 0)
    {
        printf("No match\n");
    }

    pcre2_match_data_free(match_data);
    pcre2_code_free(re);

    return 0;
}

The output in the terminal is: 0: hello, i (am) a taco.

But I want every char/string to be separate like:

hello , i ( am ) a taco .


Solution

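  • You’re only getting one match because pcre2_match finds a single match per call. To get all matches (like re.findall() in Python), you need to loop and keep calling pcre2_match, updating the start offset each time. That single match also spans the whole string because \p and \s are not C escape sequences: inside a C string literal every backslash has to be doubled (\\p{L}, \\s). Otherwise the compiler typically drops it, PCRE2 never sees the backslash, and [^\s\p{L}\p{N}] degrades into a plain negated class of the characters s, p, L, N and the braces, which matches every character of your subject, spaces included.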

    Here’s how you can do it:

    PCRE2_SIZE offset = 0;
    while (offset < subject_size) {
        rc = pcre2_match(re, subject, subject_size, offset, 0, match_data, NULL);
        if (rc < 0) break;
    
        ovector = pcre2_get_ovector_pointer(match_data);
        PCRE2_SPTR start = subject + ovector[0];
        PCRE2_SIZE len = ovector[1] - ovector[0];
    
        printf("%.*s\n", (int)len, start);
    
        offset = (len == 0) ? offset + 1 : ovector[1]; // avoid infinite loop
    }
    

    Also, make sure you pass PCRE2_UTF | PCRE2_UCP as the options argument of pcre2_compile (they are PCRE2 options, not compiler flags) if you’re using \p{L} or other Unicode classes in your pattern.