csegmentation-faulthuffman-code

Huffman code segfaults on random input but not on text input


(I asked multiple questions to the implementation of the huffman code here, if I ask too many questions please tell me and I'll stop. Any tips on how to learn to find those mistakes myself are appreciated, I spent hours and couldn't find the problem.)
I try to implement huffman code in C. Right now I can successfully read a file, generate the tree and write the compressed data to the output file. However if my input file is random generated (using something like dd if=/dev/random of=foo bs=100 count=1) the program aborts with a segfault. The weird thing is, this only happens on some random files, some others work. I'm confident that the null character isn't the problem because I don't use any str*() functions (I know printTree() does rely on \0 but I don't use the function), also one random file that I generated contained an empty byte and it worked. One more time I am profoundly confused as to why my code doesn't work, more specific why it doesn't treat all input equally (I would have expected that my code either fails all the time or never as long as the input has the same length).

huffman.c:

#include "huffman.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct node {
    node* left;
    node* right;
    double weight;
    char* bytes;
    size_t nmemb;
};

void printTree(struct node* root, int depth)
{
    if (root == NULL) return;

    for (int i = 0; i < depth; i++) {
        printf(" │ ");
    }
    printf("(%s: %.2f)\n", root->bytes, root->weight);

    printTree(root->left, depth + 1);
    printTree(root->right, depth + 1);
}

// Order nodes in array, smallest weight last
static int compareFrequency(const void* a, const void* b)
{
    if ((*(node**)a)->weight < (*(node**)b)->weight) return 1;
    if ((*(node**)a)->weight > (*(node**)b)->weight) return -1;
    if ((*(node**)a)->nmemb < (*(node**)b)->nmemb) return 1;
    if ((*(node**)a)->nmemb > (*(node**)b)->nmemb) return -1;
    return 1;
}

// Encode input to output
void encode(FILE* input, FILE* output)
{
    unsigned int possibleBytes[256] = { 0 }, differentBytes = 0, i;
    unsigned long long totalBytes = 0;
    char currChar;

    // Count number of occurence of each individual byte
    while ((currChar = fgetc(input)) != EOF) {
        possibleBytes[(int)currChar]++;
        totalBytes++;
    }

    // Count number of unique bytes
    for (i = 0; i < 256; i++) {
        if (possibleBytes[i]) differentBytes++;
    }

    // Make array filled with nodes (at the moment only the used bytes)
    node* nodes = calloc(differentBytes * 2 - 1, sizeof(node));
    node* nodesP = nodes;

    for (i = 0; i < 256; i++) {
        if (possibleBytes[i]) {
            nodesP->bytes = calloc(1, sizeof(char));
            nodesP->bytes[0] = i;
            nodesP->weight = (double)possibleBytes[i] / totalBytes;
            nodesP->nmemb = 1;
            nodesP++;
        }
    }

    // Fill trees array with nodes
    node** trees = calloc(differentBytes, sizeof(node*));
    node** treesP = trees;
    nodesP = nodes;
    unsigned int treesLeft = differentBytes;

    for (i = 0; i < treesLeft; i++) {
        *treesP = nodesP;
        treesP++;
        nodesP++;
    }

    // Build tree
    node* lastTree;
    node* secondLastTree;

    while (treesLeft > 1) {
        qsort(trees, treesLeft, sizeof(node*), &compareFrequency);

        lastTree = *(trees + treesLeft - 1);
        secondLastTree = *(trees + treesLeft - 2);

        nodesP->left = lastTree;
        nodesP->right = secondLastTree;
        nodesP->weight = lastTree->weight + secondLastTree->weight;
        nodesP->nmemb = lastTree->nmemb + secondLastTree->nmemb;
        nodesP->bytes = calloc(lastTree->nmemb + secondLastTree->nmemb, sizeof(char));
        // str* functions use \0 terminated string -> Might not work if array contains \0 (not at
        // the end)
        // strcat(nodesP->bytes, lastTree->bytes);
        // strcat(nodesP->bytes, secondLastTree->bytes);
        memcpy(nodesP->bytes, lastTree->bytes, lastTree->nmemb);
        memcpy(nodesP->bytes + lastTree->nmemb, secondLastTree->bytes, secondLastTree->nmemb);
        trees[treesLeft - 2] = nodesP;

        treesLeft--;
        nodesP++;
    }

    // write tree to file
    // https://stackoverflow.com/a/759766/15833045
    //...

    // write data to file
    // left is 0, right is 1
    rewind(input);
    unsigned char bitPosition = 1;
    unsigned char buffer[BUFSIZ];
    node* currNode = *trees;
    currChar = fgetc(input);

    for (i = 0; currChar != EOF; i++) {
        // Fill byte of buffer
        while (bitPosition <= 8) {
            // Set 0 if going left, set 1 if going right
            if (memchr(currNode->left->bytes, currChar, currNode->nmemb)) {
                buffer[i] &= ~(1 << (8 - bitPosition));
                currNode = currNode->left;
            } else {
                buffer[i] |= (1 << (8 - bitPosition));
                currNode = currNode->right;
            }

            // If leaf, reset currNode and get next byte
            if (!currNode->left) {
                currNode = *trees;
                currChar = fgetc(input);
                if (currChar == EOF) {
                    break;
                }
            }
            bitPosition++;
        }
        bitPosition = 1;

        if (i == BUFSIZ - 1) {
            i = 0;
            fwrite(buffer, sizeof(char), BUFSIZ, output);
        }
    }

    fwrite(buffer, sizeof(char), i, output);
}

main.c:

#include "huffman.h"
#include <stdio.h>

int main(int argc, char* argv[])
{
    FILE* inputP = fopen(argv[1], "rb");
    FILE* outputP = fopen(argv[2], "w");

    if (!(inputP && outputP)) return 1;

    encode(inputP, outputP);

    fclose(inputP);
    fclose(outputP);
    return 0;
}

Makefile:

CC = gcc
CFLAGS = -g
OBJ = main.o huffman.o

huffman: $(OBJ)
    $(CC) $(OBJ) -o $@

%.o: %.c
    $(CC) $(CFLAGS) -c $<


clean:
    rm *.o

gdb output (is slightly different when using different random files and hence a different core dumps):

Reading symbols from huffman...
[New LWP 9843]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
Core was generated by `huffman a b'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  __memchr_avx2 () at ../sysdeps/x86_64/multiarch/memchr-avx2.S:82
82              VPCMPEQ (%rdi), %ymm0, %ymm1

huffman.h

#ifndef HUFFMAN_H
#define HUFFMAN_H

#include <stdio.h>

typedef struct node node;

// static int compareFrequency(const void* a, const void* b);
void encode(FILE* input, FILE* output);

#endif

Solution

  • currChar needs to be int. That puts the characters read in 0..255 instead of -128..127. Then negative values, where at least one would certainly* be found in 100 random bytes, will result in an out-of-bounds access of possibleBytes[].

    Your memchr(currNode->left->bytes, currChar, currNode->nmemb) is using the count of bytes at currNode, which is more than the bytes at currNode->left. The last argument needs to be currNode->left->nmemb.

    Some comments:

    1. You need a typedef struct node node; at the start for it to compile at all.
    2. All of the casts in compareFrequency() need to be node * const * to correspond to the const void * arguments.
    3. You need to return 0; at the end of compareFrequency(), for it to do what it claims.
    4. You should use integers, in particular long long's, for the weights. Not double's. You don't need probabilities, just frequencies. Don't divide by the total.
    5. Writing the tree to the file is easy. Traverse the tree recursively, and for every branch write a 0 bit, and for every leaf write 1 bit followed by the eight bits of the character at that leaf.
    6. Instead of keeping lists of all of the bytes at each branch for searching when encoding, you should simply make a table of Huffman codes by traversing the tree once. (Which can be the same time you traverse the tree to write it out.) Then just look up the character in the table for encoding.
    7. Your i = 0; at the end of the buffer is followed by the i++ of the for loop, so you start the next round at i equal to one, instead of the intended zero.
    8. You need to think about what to do if differentBytes is one. What is the length of the code in that case? You will also need to think about how to handle the last few bits in the last byte, and keep them from decoding into an extraneous character. There exists singular solutions that solve both of those problems.

    *: with probability 1 - 2-100.