cregexstrncpy

Regex and strncpy not working as I understood them


I'm running into multiple issues with the following code:

  1. When I'm running this code on a file containing multiple URLs per line, the regex while loop runs indefinitely.
  2. On a file containing only one HTTP URL on each line, valgrind complains that matches[count] = malloc(sizeof(char) * match_length); and matches[count][match_length] = 0; are Invalid size 1 write. I think my allocation code sucks (sizeof(matches) / sizeof(matches[0]) after my while loops compute to 1).
  3. I thought strncpy would add a null byte at the end of my string automatically but it appears it doesn't 🤷
  4. Might be related to 3. : After those while loops, if I print each element of the array, they contains a newline, which shouldn't be the case, only way to get rid of those is to run matches[count][match_length] = 0;

I think I misunderstood lots of part about regex, strncpy and malloc. I haven't done C programming since college and wanted to get back at it.

    char **matches = (char **)malloc(sizeof(char *));
    char *line = NULL;
    size_t l = 0, count = 0, size = 1;
    ssize_t read = 0;
    
    regex_t rgx;
    if (regcomp(&rgx, "((gopher|https?)://[[:alnum:][:punct:]]+)", REG_ICASE | REG_EXTENDED)) {
        return 0;
    }
    
    while ((read = getline(&line, &l, file)) != -1) {
        regmatch_t match = { 0, 0 };
        int match_length, error;
        if ((error = regexec(&rgx, line + match.rm_eo, 1, &match, 0)) != 0)
            continue;
    
        do {
            printf("Error: %d, Line: %s\n", error, line + match.rm_eo);
            match_length = match.rm_eo - match.rm_so;
            matches[count] = malloc(sizeof(char) * match_length);
            strncpy(matches[count], line + match.rm_so, match_length);
            matches[count][match_length] = 0;
    
            if (++count == size) {
                size *= 2;
                matches = realloc(matches, sizeof(char *) * size);
                if (!matches) {
                    regfree(&rgx);
                    free(line);
                    return 0;
                }
            }
        } while ((error = regexec(&rgx, line + match.rm_eo + 1, 1, &match, REG_NOTBOL)) == 0);
    }

Solution

  • There are multiple problems:

    Here is a modified version:

    char **filter_urls(FILE *file, size_t *countp) {
        size_t count = 0, size = 1;
        char **matches = (char **)malloc(sizeof(char *) * size);
        char *line = NULL;
        size_t line_size = 0;
        int fail = 0;
        ssize_t len;
        regex_t rgx;
    
        if (matches == NULL)
            return NULL;
    
        if (regcomp(&rgx, "((gopher|https?)://[[:alnum:][:punct:]]+)", REG_ICASE | REG_EXTENDED)) {
            free(matches);
            return NULL;
        }
        
        while ((len = getline(&line, &line_size, file)) != -1) {
            // strip the trailing newline if any (optional)
            //if (len > 0 && line[len - 1] == '\n') {
            //    line[--len] == '\0'
            //}
            regmatch_t match = { 0, 0 };
            int flags = 0;
            char *ptr = line;
            while (regexec(&rgx, ptr, 1, &match, flags) != 0) {
                size_t match_length = match.rm_eo - match.rm_so;
                matches[count] = strndup(ptr + match.rm_so, match_length);
                if (matches[count] == NULL) {
                    fail = 1;
                    break;
                }
                if (++count == size) {
                    size_t new_size = size * 2;
                    char **new_matches = realloc(matches, sizeof(char *) * new_size);
                    if (new_matches == NULL) {
                        fail = 1;
                        break;
                    }
                    size = new_size;
                    matches = new_matches;
                }
                ptr += match.rm_eo;
                flags = REG_NOTBOL;
            }
        }
        regfree(&rgx);
        free(line);
        if (fail) {
            // allocation failure: free all
            while (count > 0) {
                free(matches[--count]);
            }
            free(matches);
            return NULL;
        }
        matches[count] = NULL;
        *countp = count;
        return matches;
    }