Tags: c, compression, text-files, txt, huffman-code

Huffman's algorithm produces a compressed file that is larger than the original one


I have written the function below. It does some magic with the input file (the one to compress), a .txt (text) file; however, the output (compressed) file is larger than the original one. For example, I fed in a 1169MB file and the program returned a 1687MB file.

What can be done to solve this issue?

Maybe there are different approaches or variants of Huffman compression that I should be using instead?

struct HuffNode{
    unsigned data;
    struct HuffNode *left;
    struct HuffNode *right;
    struct HuffNode *parent;
    int is_leaf;
};
typedef struct HuffNode HuffNode;

void count_frequency(FILE *fp, unsigned *freq) {
  long original_pos = ftell(fp); /* ftell returns long */
  int ch;
  while ((ch = fgetc(fp)) != EOF) {
    if (ch >= 0 && ch < 256)
      freq[ch]++;
  }
  fseek(fp, original_pos, SEEK_SET);
}

void construct_huffman(unsigned *freq_in, HuffNode *tree) {
  int count = 256; 
  unsigned freq[256] = {0}; 
  HuffNode *node[256]; 

  for (int i = 0; i < 256; i++) { 
    freq[i] = freq_in[i]; 
    tree[i].data = i; 
    tree[i].left = NULL;
    tree[i].right = NULL;
    tree[i].parent = NULL;
    tree[i].is_leaf = 1; 
    node[i] = &tree[i]; 
  }
  for (int i = 0; i < 256; i++) {  
    for (int j = 0; j < 256 - i - 1; j++) {
      if (j + 1 < 256 && (freq[j] < freq[j + 1] || (freq[j] == freq[j+1] && j < j + 1))) { 
        unsigned t = freq[j];
        freq[j] = freq[j + 1];
        freq[j + 1] = t;
        HuffNode *p = node[j]; 
        node[j] = node[j + 1];
        node[j + 1] = p;
      }
    }
  }

  while (count > 1) {
    int pos = 512 - count;
    HuffNode *parent = &tree[pos];
    int i = count - 2, j = count - 1;
    parent->left = node[j]; 
    parent->right = node[i]; 
    node[j]->parent = parent;
    node[i]->parent = parent;
    node[i]->is_leaf = 0;
    node[j]->is_leaf = 0;
    node[i] = parent;
    freq[i] += freq[j]; 
    for (; i > 0 && freq[i] > freq[i - 1]; i--) { 
      unsigned t = freq[i];
      freq[i] = freq[i - 1];
      freq[i - 1] = t;
      HuffNode *p = node[i];
      node[i] = node[i - 1];
      node[i - 1] = p;
    }
    count--;
  }
  node[0]->parent = NULL;
}

void encode_stream(FILE *fin, FILE *fout, HuffNode *tree, unsigned *padding) {
  int n;
  unsigned char ch;
  unsigned char buff = 0, nbuf = 0;
  HuffNode *p;
  unsigned char code[256] = {0};
  while ((n = fgetc(fin)) != EOF) {
    if(n < 0 || n >= 256){
      printf("invalid characters in the file");
      return;
    }
    ch = n;
    p = &tree[ch];
    int code_length = 0;
    while (p->parent) {
      if (p == p->parent->right) {
        code[code_length] = 1;
      }
      p = p->parent;
      code_length++;
    }
    for (int i = code_length - 1; i >= 0; i--) {
      buff |= code[i] << nbuf;
      nbuf++;
      if (nbuf == 8) {
        fputc(buff, fout);
        nbuf = 0;
        buff = 0;
      }
    }
  }
  if (nbuf > 0) {
    fputc(buff, fout);
  }
  *padding = 8 - nbuf;
}

void compress_file(FILE *fin, FILE *fout) {

  unsigned freq[256] = {0}, padding = 0; // no idea
  int flag = 0;
  size_t padding_pos;
  HuffNode tree[512]; 
  
  if (flag == 0) {
    count_frequency(fin, freq);
    construct_huffman(freq, tree);
    rewind(fin);
    for (int i = 0; i < 256; i++) {
      fwrite(&freq[i], sizeof(unsigned), 1, fout);
    }
    padding_pos = ftell(fout);
    fwrite(&padding, sizeof(unsigned), 1, fout);
    encode_stream(fin, fout, tree, &padding);
    fseek(fout, padding_pos, SEEK_SET);
    fwrite(&padding, sizeof(unsigned), 1, fout);
    
  }
}

I don't know why it doesn't properly compress the file.


Solution

  • Before wondering about the size of the output, you first need to make sure that the output is actually a Huffman encoding. It currently is not: the vast majority of the output bytes are 0xff. You should at least look at what's in your output before concluding that it's working. (A likely cause is sketched at the end of this answer.)

    What you really need to do is write the companion decompressor, and get both working through extensive testing to verify that compression followed by decompression is lossless (a minimal round-trip harness is sketched below). Then you can check whether the output is smaller than the input for something that you expect to be compressible, e.g. English text.

    You can greatly reduce the size of the output for small inputs by encoding the Huffman tree itself, as opposed to sending all 256 frequencies at four bytes each, 1K total. Traverse the tree and write a 1 bit for each branch and a 0 bit followed by the eight bits of the character for a leaf, as sketched below. This tree representation is self-terminating. Follow that with the Huffman-encoded data. For a code with n distinct byte values, the tree takes 10n-1 bits: at most ~320 bytes if all 256 possible byte values appear, and most likely not all do. (There are even more compact encodings of the code, such as Canonical Huffman Codes, but this one is easy to get started with.)
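
    A likely cause of those 0xff bytes, going by the posted encode_stream: code[] is only ever set to 1 for right-child steps, is never set to 0 for left-child steps, and is never cleared between symbols, so 1 bits left over from earlier symbols leak into later codes. Below is a minimal sketch of a per-symbol code extraction that records a bit for every step, assuming the HuffNode layout from the question.

    /* Sketch only: walk from the leaf up to the root, recording 0 for a left
     * child and 1 for a right child at every step. The caller then emits
     * code[len - 1] .. code[0] (root to leaf), as the existing output loop does. */
    static int build_code(const HuffNode *tree, unsigned char ch, unsigned char *code) {
      int len = 0;
      for (const HuffNode *p = &tree[ch]; p->parent != NULL; p = p->parent)
        code[len++] = (p == p->parent->right) ? 1 : 0;  /* record every branch taken */
      return len;
    }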
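
    For the round-trip test, here is a minimal sketch of a harness. decompress_file is hypothetical (it is the companion decompressor that still has to be written), the file names are placeholders, and error handling is omitted for brevity.

    #include <stdio.h>

    void compress_file(FILE *fin, FILE *fout);    /* the function from the question */
    void decompress_file(FILE *fin, FILE *fout);  /* hypothetical companion decompressor */

    /* Returns 1 if both streams contain exactly the same bytes. */
    int files_identical(FILE *a, FILE *b) {
      int ca, cb;
      do {
        ca = fgetc(a);
        cb = fgetc(b);
        if (ca != cb)
          return 0;      /* first mismatch: the round trip is not lossless */
      } while (ca != EOF);
      return 1;          /* both streams ended together with identical bytes */
    }

    int main(void) {
      FILE *in  = fopen("input.txt", "rb");   /* placeholder file names */
      FILE *cmp = fopen("input.huf", "wb");
      compress_file(in, cmp);
      fclose(cmp);

      cmp = fopen("input.huf", "rb");
      FILE *out = fopen("input.out", "wb");
      decompress_file(cmp, out);
      fclose(cmp);
      fclose(out);

      rewind(in);
      out = fopen("input.out", "rb");
      puts(files_identical(in, out) ? "lossless" : "MISMATCH");
      fclose(in);
      fclose(out);
      return 0;
    }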
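
    For the tree encoding itself, here is a minimal sketch, assuming the HuffNode layout from the question. Leaves are detected by the absence of children, and bits are packed LSB-first the same way encode_stream packs its output, so the serialized tree and the encoded data can share one bit buffer.

    /* Pre-order serialization of the tree as described above:
     * a 1 bit for each branch, a 0 bit plus eight data bits for each leaf. */
    static void put_bit(FILE *fout, unsigned char *buff, unsigned char *nbuf, int bit) {
      *buff |= (unsigned char)(bit << *nbuf);   /* pack LSB-first, like encode_stream */
      if (++*nbuf == 8) {                       /* flush a complete byte */
        fputc(*buff, fout);
        *buff = 0;
        *nbuf = 0;
      }
    }

    static void write_tree(FILE *fout, const HuffNode *node,
                           unsigned char *buff, unsigned char *nbuf) {
      if (node->left == NULL && node->right == NULL) {
        put_bit(fout, buff, nbuf, 0);           /* 0 marks a leaf...             */
        for (int i = 7; i >= 0; i--)            /* ...followed by its eight bits */
          put_bit(fout, buff, nbuf, (int)((node->data >> i) & 1));
      } else {
        put_bit(fout, buff, nbuf, 1);           /* 1 marks a branch, then both subtrees */
        write_tree(fout, node->left, buff, nbuf);
        write_tree(fout, node->right, buff, nbuf);
      }
    }

    The caller would start at the root (the one node whose parent is still NULL after construct_huffman returns), continue writing the encoded data into the same buff/nbuf pair, and flush any leftover bits at the end, recording the padding just as compress_file already does.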