cfilebinaryfilesfread

Error in reading parts of a binary file C


I am trying to read a binary file in three parts, one part in the first 20% of the file, the second part is the last 20% of the file, and the last part is the remaining 60%

I have run into a problem, that starts in the reading of the second part and most probably in the third part as well.

Is there a chance I have used fread() or fseek() incorrectly, since it is the only change? If so, how can I use fread() to read only a certain part of the data?

int quickScan(FILE* fileToCheck, FILE* virus, int virusLen, int fileLen)
{
    char* buffer = NULL, * virusSig = NULL;
    char result = ' ';
    int firstFile20 = fileLen * 0.2, firstVirus20 = virusLen * 0.2;

    /**
    //first 20% check :
    buffer = (char*)calloc(firstFile20, sizeof(char));
    virusSig = (char*)calloc(firstVirus20, sizeof(char));

    if (buffer == NULL || virusSig == NULL)
    {
        //this means the file is empty, therfor there is no chance it appears in the file
        return 0;
    }

    fread(buffer, 1, firstFile20, fileToCheck);
    fread(virusSig, 1, firstVirus20, virus);

    result = strstrForBinary(buffer, firstFile20, virusSig, firstVirus20);

    if (result != NULL)
    {
        return 1; //found in the first 20%
    }*/
    
    //last 20% check :
    free(buffer);
    free(virusSig);

    int lastFile20 = fileLen - firstFile20, lastVirus20 = virusLen - firstVirus20;

    buffer = (char*)calloc(firstFile20, sizeof(char));
    virusSig = (char*)calloc(firstVirus20, sizeof(char));

    //move the file pointer
    fseek(fileToCheck, lastFile20, SEEK_SET);
    fseek(virus, lastFile20, SEEK_SET);
    
    fread(buffer, 1, lastFile20, fileToCheck);
    fread(virusSig, 1, lastVirus20, virus);
    
    result = strstrForBinary(buffer, lastFile20, virusSig, lastVirus20);
    
    if (result != NULL)
    {
        free(buffer);
        free(virusSig);
        return 2; //found in the last 20%
    }
        //midddle part

    //move the file pointer
    fseek(fileToCheck, fileLen, SEEK_SET);
    fseek(virus, virusLen, SEEK_SET);

    fread(buffer, 1, firstFile20, fileToCheck);
    fread(virusSig, 1, firstVirus20, virus);

    result = strstrForBinary(buffer, (fileLen - firstFile20 - lastFile20), virusSig, (virusLen - firstVirus20 - lastVirus20));

    if (result != NULL)
    {
        free(buffer);
        free(virusSig);
        return 3; //found in the last 60%
    }

    free(buffer);
    free(virusSig);
    return 0; // never been found
}

Using the VS debugger I was able to isolate the problem to the function strstrForBinary():

char* (strchrForBinary)(const char* string, int charToSearchFor, int len) 
{
    const char ch = charToSearchFor;

    for (; *string != ch; string++, len--)
        if (len == 0) //didn't find ch anywhere in the string
            return (NULL);
    //got here if *string == ch, so return the string from that part foward
    return ((char*)string);
}

char* (strstrForBinary)(const char* haystack, int lenOfFirst, const char* needle, int lenOfInstance) 
{
    const char* subHaystack = haystack;
    const char* subNeedle = needle;

    //change from if (*b == 0) because null isn't the end in binary files
    // if the length of the file ot check is 0, there will never an occourence
    if (lenOfFirst == 0)
        return (NULL);
    //if the length of the file used for checking is 0, it means all of the other file is an instance of the first file
    if (lenOfInstance == 0)
        return ((char*)haystack);

    // match prefix
    for (; (haystack = strchrForBinary(haystack, *needle, subHaystack - haystack + lenOfFirst)) != NULL && subHaystack - haystack + lenOfFirst != 0; ++haystack) {

        // match rest of prefix
        const char* sc1 = NULL, * sc2 = NULL;
        for (sc1 = haystack, sc2 = needle; ;)
        {
            if (++sc2 >= subNeedle + lenOfInstance)
            {
                return ((char*)haystack);
            }
            else if (*++sc1 != *sc2) //HERE IT IS UNABLE TO READ *++sc1
            {
                break;
            }
        }
    }
    return (NULL);
}

It is unable to read *++sc1 and crashes there.

I've used this function in other cases and I think the problem isn't in this function, but perhaps in the way I've read the part of the file into char* because it worked perfectly fine in every other case. I suspect I'll have a similar problem in the 60% part (although the VS debbuger doesn't specify).


Solution

  • int lastFile20 = fileLen - firstFile20, lastVirus20 = virusLen - firstVirus20;
    

    Last time I checked, if you subtract 20% from a whole, you get 80%. So lastFile20 is 80% of your file length, and likewise lastVirus20.

    fseek(fileToCheck, lastFile20, SEEK_SET);
    fseek(virus, lastFile20, SEEK_SET);
    
    fread(buffer, 1, lastFile20, fileToCheck);
    fread(virusSig, 1, lastVirus20, virus);
    

    So you skip 80% of each file, and then try reading another 80%. This would place you right at the 160% mark, but that's not a problem, because fread will just stop reading at the end. You happily ignore that though, because you are not checking the return value of fread, which is a cardinal sin in C programming. Your buffer is only 20% of the file size, but that's about what fread is going to read, give or take a rounding error. So if you have a buffer overrun, it's only by a little (insert ironic smiley face here).

    Now in the middle part things get more interesting.

    fseek(fileToCheck, fileLen, SEEK_SET);
    fseek(virus, virusLen, SEEK_SET);
    
    fread(buffer, 1, firstFile20, fileToCheck);
    fread(virusSig, 1, firstVirus20, virus);
    
    result = strstrForBinary(buffer, (fileLen - firstFile20 - lastFile20), virusSig, (virusLen - firstVirus20 - lastVirus20));
    

    You put the file pointer right at the end of the file and then try to read something. That's not a big problem, fread will just fail and you will miss your virus match (another ironic smiley). The problem is that your allocated buffers are 20% of the file, but you give strstrForBinary 80%. That's a buffer overrun, and by quite a bit this time.

    I recommend writing a couple of functions (not tested):

    const char* readFileFragment (FILE* file, 
                                  size_t offset, 
                                  size_t size)
    {
       char* buf = malloc(needleSize); 
       if (buf == NULL) 
          { /* print an error and exit the program */ 
       if (fseek(file, offset, SEEK_SET) < 0)
          { /* print an error and exit */ }
       if (fread(buf, 1, size, file) != size)
          { /* print an error and exit */ }
       return buf;       
    }
    
    int findInFileFragments (FILE* haystackFile, 
                             size_t haystackOffset, 
                             size_t haystackSize, 
                             FILE* needleFile,
                             size_t needleOffset,
                             size_t needleSize)
    {
    
        /* Trivial checks */
        if (needleSize > haystackSize) return 0;
        if (needleSize == 0) return 1;
    
        char* haystack = readFileFragment(haystackFile, haystackOffset, haystackSize);
        char* needle = readFileFragment(needleFile, needleOffset, needleSize);          
       
        int match = strstrForBinary(haystack, haystackSize, needle, needleSize) != NULL;
    
        free(haystack);
        free(needle);
    
        return match;
        
    }
    

    There is is too much code in strstrForBinary and it's all too easy to get confused. There may or may not be another buffer overrun there in the inner loop, it's hard to tell. Here's how I would implement the function:

      const char* strstrForBinary (const char* haystack, 
                                   size_t haystackLen, 
                                   const char* needle, 
                                   size_t needleLen)
      {
          while (haystackLen >= needleLen)
          {
              if (memcmp(haystack, needle, needleLen) == 0) 
                  return haystack;
              haystack++; haystackLen--;
          }      
          return NULL;
      }
    

    (of course something like KMP algorithm is preferred, but one little step at a time).