I want to write a function int** text_match(const char* input, const char* pattern);
that returns all match offsets(start and end respectively)
So far I have implemented this, by iterating over the input with an input offset, finding out if there is a match within the input + its offset, obtaining the offset and setting the input offset to the end offset of the match.
This approach has some very significant disadvantages
I experimented with pcre2_get_ovector_count()
a bit, however so far whenever I provide it with the pcre2_match_data
after having run pcre2_match()
it only returns 1 and not the actual amount of matches
How could I find out all match offsets with pcre2 directly? Or at least avoid the issues I have described above?
my implementation so far (will be licensed under the MIT)
int** text_match(const char* input, const char* regex){
if(!input || !regex) return (int**)calloc(1, sizeof(int*));
int errorcode;
PCRE2_SIZE erroroffset;
pcre2_code *compiled_regex = pcre2_compile((PCRE2_SPTR)regex, PCRE2_ZERO_TERMINATED, PCRE2_EXTENDED|PCRE2_UTF, &errorcode, &erroroffset, NULL);
if(!compiled_regex) return (int**)calloc(1, sizeof(int*));
pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(compiled_regex, NULL);
if(!match_data){
pcre2_code_free(compiled_regex);
return (int**)calloc(1, sizeof(int*));
}
int** output = (int**)calloc(1, sizeof(int*));
if(!output){
pcre2_code_free(compiled_regex);
pcre2_match_data_free(match_data);
return (int**)calloc(1, sizeof(int*));
}
PCRE2_SIZE start_offset = 0;
int matches_amount;
size_t length_input = strlen(input);
for(matches_amount = 0; ; matches_amount++){
if(start_offset >= length_input) break;
if(pcre2_match(compiled_regex, (PCRE2_SPTR)input, PCRE2_ZERO_TERMINATED, start_offset, 0, match_data, NULL) < 0) break;
PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(match_data);
if(offsets == NULL) break;
output[matches_amount] = (int*)malloc(2 * sizeof(int));
if(output[matches_amount] == NULL) break;
output[matches_amount][0] = offsets[0];
output[matches_amount][1] = offsets[1];
int** new_output = (int**)realloc(output, sizeof(int*) * (matches_amount + 2));
if(new_output == NULL) break;
output = new_output;
if(start_offset == offsets[1]){
start_offset++;
}else{
start_offset = offsets[1];
}
}
output[matches_amount] = NULL;
pcre2_code_free(compiled_regex);
pcre2_match_data_free(match_data);
return output;
}
As pointed out by @ikegami the approach was right, however I did too little edge-case detection. This edit just serves one purpose. To provide the corrected function so that if someone finds this post at a later time and needs a function for this here you have it(will be licensed under the MIT btw so do with it whatever you want)
int** text_match(const char* input, const char* regex){
if(!input || !regex) return (int**)calloc(1, sizeof(int*));
int errorcode;
PCRE2_SIZE erroroffset;
pcre2_code *compiled_regex = pcre2_compile((PCRE2_SPTR)regex, PCRE2_ZERO_TERMINATED, PCRE2_EXTENDED|PCRE2_UTF, &errorcode, &erroroffset, NULL);
if(!compiled_regex) return (int**)calloc(1, sizeof(int*));
pcre2_match_data *match_data = pcre2_match_data_create_from_pattern(compiled_regex, NULL);
if(!match_data){
pcre2_code_free(compiled_regex);
return NULL;
}
int** output = (int**)calloc(1, sizeof(int*));
if(!output){
pcre2_code_free(compiled_regex);
pcre2_match_data_free(match_data);
return NULL;
}
PCRE2_SIZE start_offset = 0;
int matches_amount;
size_t length_input = strlen(input);
uint32_t options = 0;
for(matches_amount = 0; ; matches_amount++){
if(pcre2_match(compiled_regex, (PCRE2_SPTR)input, PCRE2_ZERO_TERMINATED, start_offset, options, match_data, NULL) < 0) break;
PCRE2_SIZE *offsets = pcre2_get_ovector_pointer(match_data);
if(offsets == NULL) break;
output[matches_amount] = (int*)malloc(2 * sizeof(int));
if(!output[matches_amount]){
for(int i = 0; output[i]; i++) free(output[i]);
free(output);
pcre2_code_free_8(compiled_regex);
pcre2_match_data_free_8(match_data);
return NULL;
}
output[matches_amount][0] = offsets[0];
output[matches_amount][1] = offsets[1];
int** new_output = (int**)realloc(output, sizeof(int*) * (matches_amount + 2));
if(!new_output){
for(int i = 0; output[i]; i++) free(output[i]);
free(output);
pcre2_code_free_8(compiled_regex);
pcre2_match_data_free_8(match_data);
return NULL;
}
output = new_output;
if(offsets[0] == offsets[1]){
if(offsets[0] == length_input) break;
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}else{
size_t startchar = pcre2_get_startchar(match_data);
if(start_offset > startchar) continue;
start_offset = startchar + 1;
for(; start_offset < length_input; start_offset++){
if((input[start_offset] & 0xc0) != 0x80) break;
}
}
}
output[matches_amount] = NULL;
pcre2_code_free(compiled_regex);
pcre2_match_data_free(match_data);
return output;
}
The pcre2demo
man page provides a demo. It includes the following:
/* Loop for second and subsequent matches */
for (;;)
{
uint32_t options = 0; /* Normally no options */
PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
/* If the previous match was for an empty string, we are finished if we are
at the end of the subject. Otherwise, arrange to run another match at the
same point to see if a non-empty match can be found. */
if (ovector[0] == ovector[1])
{
if (ovector[0] == subject_length) break;
options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
}
/* If the previous match was not an empty string, there is one tricky case to
consider. If a pattern contains \K within a lookbehind assertion at the
start, the end of the matched string can be at the offset where the match
started. Without special action, this leads to a loop that keeps on matching
the same substring. We must detect this case and arrange to move the start on
by one character. The pcre2_get_startchar() function returns the starting
offset that was passed to pcre2_match(). */
else
{
PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
if (start_offset <= startchar)
{
if (startchar >= subject_length) break; /* Reached end of subject. */
start_offset = startchar + 1; /* Advance by one character. */
if (utf8) /* If UTF-8, it may be more */
{ /* than one code unit. */
for (; start_offset < subject_length; start_offset++)
if ((subject[start_offset] & 0xc0) != 0x80) break;
}
}
}
...
}
So you are indeed expected to manipulate the start position (and the options as well), although there are some issues with your current implementation.