csortingexternal-sorting

Corrupted data when using a multidimensional char array


I'm currently implementing a function to use the "external sort" method because I have to sort a big file (+200K lines) on a device with low RAM, right now just trying to make it run on a windows pc. I'm working on the function to split the file in tiny sorted files.

The problem I'm facing is that among the tiny sorted files the function creates, the data on certain lines are truncated.

I'm quite sure I've done a mistake somewhere but was not able to find it, yet. Could you help me to discover the problem please ?

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define MAX_LINE_LEN 50
#define MAX_LINES_SORTED 130

void createSortedFiles(FILE*);

int main()
{
    FILE *fp = fopen("C:\\C\\Tests\\1.txt", "r+");

    if(fp == NULL){
        printf("Error opening fp");
        return 1;
    }

    createSortedFiles(fp);

    fclose(fp);

    return 0;
}

int cmp(const void *p1, const void *p2) {
    return strcmp(p1, p2);
}

void createSortedFiles(FILE* fp) {
    FILE* sfp;
    //FILE* sfp2 = fopen("C:\\C\\Tests\\test.txt", "w+");
    char lines[MAX_LINES_SORTED][MAX_LINE_LEN + 1] = {0}, buffer[MAX_LINE_LEN + 1] = { 0 }, fnum[6];
    char fname[20] = "C:\\C\\Tests\\";
    char *p;
    int i = 0, j = 0 /*file names*/, int max_lines = MAX_LINES_SORTED - 1;
    size_t N;

    while (1){
        p = fgets(buffer, MAX_LINE_LEN, fp);
        // fwrite(buffer, strlen(buffer), 1, sfp2);

        if(strlen(buffer) > 0 || i > 0){
            if(p != NULL)
                memcpy(lines[i], buffer, strlen(buffer));


            //If reached the max number of lines accepted in the array
            //Or reached EOF
            //=> Sort and write the array "lines"
            if (i >= max_lines || p == NULL) {
                N = sizeof(lines) / sizeof(lines[0]);
                qsort(lines, N, sizeof(*lines), cmp);

                //sets the name of the current file
                memset(&fname[11], 0, 9);
                itoa(j, fnum, 10);
                strcat(fname, fnum);

                if ((sfp = fopen(fname, "w+")) == NULL) {
                    printf("Error opening sfp");
                    return;
                }

                for (i = 0; i < N; i++) {
                    fwrite(lines[i], strlen(lines[i]), 1, sfp);
                }

                fclose(sfp);

                memset(lines, 0, sizeof(lines[0][0]) * MAX_LINES_SORTED * MAX_LINE_LEN);

                j++; i = -1; //because incremented right after
            }
        }

        if(p == NULL){
            break;
        }

        i++;
    }

    //fclose(sfp2);

    return;
}

Here's an example of the fp file (each lines ending with \r\n):

8023796280724;00060-014.W47
8023796280731;00060-014.W48
;0009070305/08007
;0009470337/08007
;0009490338/13001
;0010480311/08007
;0010830308/08007
;0011S
8033280129293;002004GRS4XL
;002015RSM
5708628117005;00207-630-06T42
5708628117012;00207-630-06T44
5708628117036;00207-630-06T46
4051428088756;647530241000045
4051428088763;647530241000046
4051428088770;647530241000047
;647BLPMF
4051428092586;648510256000040
4051428092593;648510256000041
4051428092609;648510256000042
4051428092616;648510256000043
4051428092623;648510256000044
4051428092630;648510256000045
4051428092647;648510256000046


Solution

  • Your "truncated lines" are not really truncated lines, they are stray data left in the buffer from previous files.

    This array:

    #define MAX_LINE_LEN 50
    #define MAX_LINES_SORTED 130
    
    char lines[MAX_LINES_SORTED][MAX_LINE_LEN + 1];
    

    has 6630 bytes, but here:

    memset(lines, 0, sizeof(lines[0][0]) * MAX_LINES_SORTED * MAX_LINE_LEN);
    

    you zero out only 6500 bytes and leave the last two lines as they are.

    You can fix this by using (MAX_LINE_LEN +1) in the size calculation, but the array can be zeroes out more tersely (and more reliably) with just:

    memset(lines, 0, sizeof(lines));