(I asked multiple questions to the implementation of the huffman code here, if I ask too many questions please tell me and I'll stop. Any tips on how to learn to find those mistakes myself are appreciated, I spent hours and couldn't find the problem.)
I try to implement huffman code in C. Right now I can successfully read a file, generate the tree and write the compressed data to the output file.
However if my input file is random generated (using something like dd if=/dev/random of=foo bs=100 count=1
) the program aborts with a segfault. The weird thing is, this only happens on some random files, some others work.
I'm confident that the null character isn't the problem because I don't use any str*() functions (I know printTree()
does rely on \0
but I don't use the function), also one random file that I generated contained an empty byte and it worked. One more time I am profoundly confused as to why my code doesn't work, more specific why it doesn't treat all input equally (I would have expected that my code either fails all the time or never as long as the input has the same length).
huffman.c:
#include "huffman.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
struct node {
node* left;
node* right;
double weight;
char* bytes;
size_t nmemb;
};
void printTree(struct node* root, int depth)
{
if (root == NULL) return;
for (int i = 0; i < depth; i++) {
printf(" │ ");
}
printf("(%s: %.2f)\n", root->bytes, root->weight);
printTree(root->left, depth + 1);
printTree(root->right, depth + 1);
}
// Order nodes in array, smallest weight last
static int compareFrequency(const void* a, const void* b)
{
if ((*(node**)a)->weight < (*(node**)b)->weight) return 1;
if ((*(node**)a)->weight > (*(node**)b)->weight) return -1;
if ((*(node**)a)->nmemb < (*(node**)b)->nmemb) return 1;
if ((*(node**)a)->nmemb > (*(node**)b)->nmemb) return -1;
return 1;
}
// Encode input to output
void encode(FILE* input, FILE* output)
{
unsigned int possibleBytes[256] = { 0 }, differentBytes = 0, i;
unsigned long long totalBytes = 0;
char currChar;
// Count number of occurence of each individual byte
while ((currChar = fgetc(input)) != EOF) {
possibleBytes[(int)currChar]++;
totalBytes++;
}
// Count number of unique bytes
for (i = 0; i < 256; i++) {
if (possibleBytes[i]) differentBytes++;
}
// Make array filled with nodes (at the moment only the used bytes)
node* nodes = calloc(differentBytes * 2 - 1, sizeof(node));
node* nodesP = nodes;
for (i = 0; i < 256; i++) {
if (possibleBytes[i]) {
nodesP->bytes = calloc(1, sizeof(char));
nodesP->bytes[0] = i;
nodesP->weight = (double)possibleBytes[i] / totalBytes;
nodesP->nmemb = 1;
nodesP++;
}
}
// Fill trees array with nodes
node** trees = calloc(differentBytes, sizeof(node*));
node** treesP = trees;
nodesP = nodes;
unsigned int treesLeft = differentBytes;
for (i = 0; i < treesLeft; i++) {
*treesP = nodesP;
treesP++;
nodesP++;
}
// Build tree
node* lastTree;
node* secondLastTree;
while (treesLeft > 1) {
qsort(trees, treesLeft, sizeof(node*), &compareFrequency);
lastTree = *(trees + treesLeft - 1);
secondLastTree = *(trees + treesLeft - 2);
nodesP->left = lastTree;
nodesP->right = secondLastTree;
nodesP->weight = lastTree->weight + secondLastTree->weight;
nodesP->nmemb = lastTree->nmemb + secondLastTree->nmemb;
nodesP->bytes = calloc(lastTree->nmemb + secondLastTree->nmemb, sizeof(char));
// str* functions use \0 terminated string -> Might not work if array contains \0 (not at
// the end)
// strcat(nodesP->bytes, lastTree->bytes);
// strcat(nodesP->bytes, secondLastTree->bytes);
memcpy(nodesP->bytes, lastTree->bytes, lastTree->nmemb);
memcpy(nodesP->bytes + lastTree->nmemb, secondLastTree->bytes, secondLastTree->nmemb);
trees[treesLeft - 2] = nodesP;
treesLeft--;
nodesP++;
}
// write tree to file
// https://stackoverflow.com/a/759766/15833045
//...
// write data to file
// left is 0, right is 1
rewind(input);
unsigned char bitPosition = 1;
unsigned char buffer[BUFSIZ];
node* currNode = *trees;
currChar = fgetc(input);
for (i = 0; currChar != EOF; i++) {
// Fill byte of buffer
while (bitPosition <= 8) {
// Set 0 if going left, set 1 if going right
if (memchr(currNode->left->bytes, currChar, currNode->nmemb)) {
buffer[i] &= ~(1 << (8 - bitPosition));
currNode = currNode->left;
} else {
buffer[i] |= (1 << (8 - bitPosition));
currNode = currNode->right;
}
// If leaf, reset currNode and get next byte
if (!currNode->left) {
currNode = *trees;
currChar = fgetc(input);
if (currChar == EOF) {
break;
}
}
bitPosition++;
}
bitPosition = 1;
if (i == BUFSIZ - 1) {
i = 0;
fwrite(buffer, sizeof(char), BUFSIZ, output);
}
}
fwrite(buffer, sizeof(char), i, output);
}
main.c:
#include "huffman.h"
#include <stdio.h>
int main(int argc, char* argv[])
{
FILE* inputP = fopen(argv[1], "rb");
FILE* outputP = fopen(argv[2], "w");
if (!(inputP && outputP)) return 1;
encode(inputP, outputP);
fclose(inputP);
fclose(outputP);
return 0;
}
Makefile:
CC = gcc
CFLAGS = -g
OBJ = main.o huffman.o
huffman: $(OBJ)
$(CC) $(OBJ) -o $@
%.o: %.c
$(CC) $(CFLAGS) -c $<
clean:
rm *.o
gdb output (is slightly different when using different random files and hence a different core dumps):
Reading symbols from huffman...
[New LWP 9843]
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/usr/lib/libthread_db.so.1".
Core was generated by `huffman a b'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0 __memchr_avx2 () at ../sysdeps/x86_64/multiarch/memchr-avx2.S:82
82 VPCMPEQ (%rdi), %ymm0, %ymm1
huffman.h
#ifndef HUFFMAN_H
#define HUFFMAN_H
#include <stdio.h>
typedef struct node node;
// static int compareFrequency(const void* a, const void* b);
void encode(FILE* input, FILE* output);
#endif
currChar
needs to be int
. That puts the characters read in 0..255 instead of -128..127. Then negative values, where at least one would certainly* be found in 100 random bytes, will result in an out-of-bounds access of possibleBytes[]
.
Your memchr(currNode->left->bytes, currChar, currNode->nmemb)
is using the count of bytes at currNode
, which is more than the bytes at currNode->left
. The last argument needs to be currNode->left->nmemb
.
Some comments:
typedef struct node node;
at the start for it to compile at all.compareFrequency()
need to be node * const *
to correspond to the const void *
arguments.return 0;
at the end of compareFrequency()
, for it to do what it claims.long long
's, for the weights. Not double
's. You don't need probabilities, just frequencies. Don't divide by the total.0
bit, and for every leaf write 1
bit followed by the eight bits of the character at that leaf.i = 0;
at the end of the buffer is followed by the i++
of the for
loop, so you start the next round at i
equal to one, instead of the intended zero.differentBytes
is one. What is the length of the code in that case? You will also need to think about how to handle the last few bits in the last byte, and keep them from decoding into an extraneous character. There exists singular solutions that solve both of those problems.*: with probability 1 - 2-100.