I would like to know how to split a string in C using pcre2.h. I want to use the GPT-2 tokenizer regex. Here is my code:
#define PCRE2_CODE_UNIT_WIDTH 8
#define PCRE2_STATIC
#include "pcre2.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Compiles the tokenizer pattern, runs ONE pcre2_match() over `subject`, and
 * prints every offset pair that this single match produced.
 * Returns 1 if the pattern fails to compile, 0 otherwise.
 */
int main()
{
pcre2_code* re;
PCRE2_SIZE erroffset;
int errcode;
/* Receives the text of a compile error (see pcre2_get_error_message below). */
PCRE2_UCHAR8 buffer[128];
int rc;
PCRE2_SIZE* ovector;
/*
 * NOTE(review): the backslashes in this literal are single, and \p and \s are
 * not C escape sequences. Compilers typically warn and drop the backslash, so
 * PCRE2 presumably receives "p{L}", "s", ... instead of \p{L}, \s; the negated
 * class then excludes only a few literal characters and can match the whole
 * subject. The intended spelling is "\\p{L}", "\\s", "\\S" -- confirm against
 * the compiler warnings.
 */
const char* pattern = "'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+";
size_t pattern_size = strlen(pattern);
const char* subject = "hello, i (am) a taco.";
size_t subject_size = strlen(subject);
/* 0 means no option bits are set; the same value is passed to both pcre2_compile and pcre2_match. */
uint32_t options = 0;
pcre2_match_data* match_data;
/* Number of offset pairs requested from pcre2_match_data_create. */
uint32_t ovecsize = 128;
re = pcre2_compile(pattern, pattern_size, options, &errcode, &erroffset, NULL);
if (re == NULL)
{
/* Compilation failed: fetch the message for errcode and report it. */
/* The length passed here is 120 although buffer holds 128 code units. */
pcre2_get_error_message(errcode, buffer, 120);
fprintf(stderr, "%d\t%s\n", errcode, buffer);
return 1;
}
/* NOTE(review): the result is used without a NULL check. */
match_data = pcre2_match_data_create(ovecsize, NULL);
/* Single match attempt starting at offset 0; nothing below calls pcre2_match again. */
rc = pcre2_match(re, subject, subject_size, 0, options, match_data, NULL);
if (rc == 0) {
fprintf(stderr, "offset vector too small: %d", rc);
}
else if (rc > 0)
{
ovector = pcre2_get_ovector_pointer(match_data);
PCRE2_SIZE i;
/*
 * Walks the rc offset pairs of this one match. Per the PCRE2 documentation,
 * pair 0 is the whole match and later pairs are capture groups; the pattern
 * above contains no capturing group, so presumably only pair 0 is printed.
 */
for (i = 0; i < rc; i++)
{
PCRE2_SPTR start = subject + ovector[2 * i];
PCRE2_SIZE slen = ovector[2 * i + 1] - ovector[2 * i];
/* NOTE(review): %2d expects an int but i is a PCRE2_SIZE -- format/argument mismatch. */
printf("%2d: %.*s\n", i, (int)slen, (char*)start);
}
}
else if (rc < 0)
{
/* Every negative return value is reported as "No match"; TODO confirm whether real errors should be told apart. */
printf("No match\n");
}
pcre2_match_data_free(match_data);
pcre2_code_free(re);
return 0;
}
The output in the terminal is: 0: hello, i (am) a taco.
But I want every char/string to be separate like:
hello
,
i
(
am
)
a
taco
.
You're only getting one match because pcre2_match finds a single match per call; the offset pairs it fills in (the ovector) describe that one match and its capture groups, not successive matches. To get all matches (like re.findall() in Python), you need to loop and keep calling pcre2_match, updating the start offset each time.
Here’s how you can do it:
/*
 * Print every token: call pcre2_match repeatedly, resuming each search at the
 * end of the previous match (the equivalent of Python's re.findall()).
 * Relies on re, subject, subject_size, match_data, rc and ovector from main().
 */
PCRE2_SIZE offset = 0;
uint32_t match_options = 0;
while (offset < subject_size) {
    rc = pcre2_match(re, (PCRE2_SPTR)subject, subject_size, offset,
                     match_options, match_data, NULL);
    if (rc < 0) {
        /* PCRE2_ERROR_NOMATCH simply ends the scan; anything else is a real failure. */
        if (rc != PCRE2_ERROR_NOMATCH) {
            fprintf(stderr, "pcre2_match failed: %d\n", rc);
        }
        break;
    }
    ovector = pcre2_get_ovector_pointer(match_data);
    if (ovector[0] > ovector[1]) {
        /* Can happen when \K is used inside a lookaround; the length would underflow. */
        fprintf(stderr, "match start lies after match end\n");
        break;
    }
    const char *start = subject + ovector[0];
    PCRE2_SIZE len = ovector[1] - ovector[0];
    printf("%.*s\n", (int)len, start);
    /* Always resume at the END of this match, never at the old search offset. */
    offset = ovector[1];
    /*
     * After an empty match, forbid another empty match at the same position.
     * This guarantees progress without stepping by a raw byte, which in UTF
     * mode could land in the middle of a multibyte character.
     */
    match_options = (len == 0) ? PCRE2_NOTEMPTY_ATSTART : 0;
}
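Also, make sure you compile the pattern with PCRE2_UTF | PCRE2_UCP if you use \p{L} or other Unicode classes, and double every backslash in the C string literal: write "\\p{L}", "\\p{N}", "\\s" and "\\S" rather than "\p{L}", "\p{N}", "\s" and "\S". A single backslash is consumed by the C compiler (most compilers warn about an unknown escape sequence and drop it), so PCRE2 receives p{L} and s as literal characters; the class [^\s\p{L}\p{N}] then excludes only a handful of literal characters and matches the entire subject, which is why the whole string is printed as one match even once the loop is in place.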