pythontreebinaryhuffman-code

huffman tree encoding in python


i have been trying to teach myself about huffman trees, and figured i should make something to test my understanding. it was all going well until my last version, which is now bugging out, and i am not sure why

i've been back and forth, and keep introducing new errors, or getting the same one in new ways

this is the python script i am running:

import heapq
from collections import Counter

def read_file(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text

def write_file(filename, encoded_text, codes):
    with open(filename, 'w') as file:
        file.write(''.join(encoded_text))
        for char, code in codes.items():
            file.write(f'{char}: {code}\n') 

def build_huffman_tree(freq_dict):
    heap = [[wt, [sym, '']] for sym, wt in freq_dict.items()]
    heapq.heapify(heap)
    
    while len(heap) > 1:
        lo = heapq.heappop(heap)
        hi = heapq.heappop(heap)
        for pair in lo[1:]:
            pair[1] = '0' + pair[1]  
        for pair in hi[1:]:
            pair[1] = '1' + pair[1]
        heapq.heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
    
    return sorted(heapq.heappop(heap)[1:], key=lambda p: (len(p[-1]), p))

def huffman_encode(input_filename, output_filename):
    text = read_file(input_filename)
    if not text:
        print("Input file is empty.")
        return
    freq_dict = Counter(text)
    if '"' not in freq_dict:
        freq_dict['"'] = 0 
    codes = {sym: code for code, sym in build_huffman_tree(freq_dict)}

    print("Characters in input text:", set(text))
    print("Characters in Huffman codes dictionary:", set(codes.keys()))

    encoded_text = [codes[char] for char in text]
    write_file(output_filename, encoded_text, codes)

input_filename = '../lorem.txt'
output_filename = 'encoded.txt'
huffman_encode(input_filename, output_filename)

terminal output when running the above is:

Characters in input text: {'a', 'v', 'e', 'g', 'i', 'm', ' ', 'b', '?', 'h', 'l', 's', 'Q', 'x', 'n', 't', 'd', ',', 'S', 'p', 'r', 'c', '\n', 'N', 'u', 'o', 'q', 'f', 'U', '.', '"'} Characters in Huffman codes dictionary: {'0011000', '00111', '0110111100', '1110', '10000', '011010111', '000', '0110111101', '01101101', '0010', '110100', '11011', '101', '1111', '011011100', '0110100', '10001', '0011001', '1100', '001101', '0100', '0101', '01101010', '011011101', '0111', '1001', '011011111', '011010110', '110101', '01101100', '01100'} Traceback (most recent call last): File "/Users/username/Documents/fizzbuzzers-and-more/huffman_tree/python/python_huffman.py", line 51, in huffman_encode(input_filename, output_filename) File "/Users/username/Documents/fizzbuzzers-and-more/huffman_tree/python/python_huffman.py", line 46, in huffman_encode encoded_text = [codes[char] for char in text] ~~~~~^^^^^^ KeyError: '"'

i assume i am making a stupid mistake, but so far i have been unable to find it. i have tried a few variations on the same idea and i get the same error or worse. i also tried logging the characters, the dictionary, etc. at one point i tried to log characters not found in 'codes', and got all of them in order, which made me think i was not creating the dict. when i print-log 'codes', i get:

{'101': ' ', '000': 'e', '1110': 'a', '1111': 'i', '0100': 'm', '0111': 'o', '0010': 'r', '0101': 's', '1001': 't', '1100': 'u', '00111': 'd', '10001': 'l', '11011': 'n', '01100': 'p', '10000': 'q', '001101': ',', '110101': 'c', '110100': 'v', '0110100': '\n', '0011001': 'b', '0011000': 'g', '01101010': '.', '01101100': 'h', '01101101': 'x', '011010111': '"', '011011100': '?', '011011101': 'N', '011010110': 'U', '011011111': 'f', '0110111100': 'Q', '0110111101': 'S'}

my original code, before i started trying to fix things:

import heapq
from collections import Counter

def read_file(filename):
    with open(filename, 'r') as file:
        text = file.read()
    return text

def write_file(filename, encoded_text, codes):
    with open(filename, 'w') as file:
        file.write(''.join(encoded_text))
        file.write('\n')
        for char, code in codes.items():
            file.write(f'{char}: {code}\n')

def build_huffman_tree(freq_dict):
    heap = [[wt, [sym, '']] for sym, wt in freq_dict.items()]
    heapq.heapify(heap)
    while len(heap) > 1:
        lo = heapq.heappop(heap)
        hi = heapq.heappop(heap)
        for pair in lo[1:]:
            pair[1] = '0' + pair[1]
        for pair in hi[1:]:
            pair[1] = '1' + pair[1]
        heapq.heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])
    return sorted(heapq.heappop(heap)[1:], key=lambda p: (len(p[-1]), p))

def huffman_encode(input_filename, output_filename):
    text = read_file(input_filename)
    freq_dict = Counter(text)
    codes = {sym: code for code, sym in build_huffman_tree(freq_dict)}
    encoded_text = [codes[char] for char in text]
    write_file(output_filename, encoded_text, codes)

input_filename = '../lorem.txt'
output_filename = 'encoded.txt'
huffman_encode(input_filename, output_filename)

that code gave me much same error:

Traceback (most recent call last): File "/Users/username/Documents/fizzbuzzers-and-more/huffman_tree/python/python_huffman.py", line 38, in huffman_encode(input_filename, output_filename) File "/Users/username/Documents/fizzbuzzers-and-more/huffman_tree/python/python_huffman.py", line 33, in huffman_encode encoded_text = [codes[char] for char in text] ~~~~~^^^^^^ KeyError: '"'

text being converted (lorem.txt) contains:

"Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt. Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem. Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?"


Solution

  • this code seems to work:

    import heapq
    from collections import Counter
    
    def read_file(filename):
        with open(filename, 'r') as file:
            text = file.read()
        return text
    
    def write_file(filename, encoded_text, codes):
        with open(filename, 'w') as file:
            file.write(''.join(encoded_text))
            file.write('\n')
            for char, code in codes.items():
                file.write(f'{char}: {code}\n')
    
    def build_huffman_tree(freq_dict):
        heap = [[wt, [sym, '']] for sym, wt in freq_dict.items()]
        heapq.heapify(heap)
        while len(heap) > 1:
            lo = heapq.heappop(heap)
            hi = heapq.heappop(heap)
            for pair in lo[1:]:
                pair[1] = '0' + pair[1]
            for pair in hi[1:]:
                pair[1] = '1' + pair[1]
            heapq.heappush(heap, [lo[0] + hi[0], *lo[1:], *hi[1:]])
        return sorted(heapq.heappop(heap)[1:], key=lambda p: (len(p[-1]), p))
    
    def huffman_encode(input_filename, output_filename):
        text = read_file(input_filename)
        freq_dict = Counter(text)
        codes = {sym: code for sym, code in build_huffman_tree(freq_dict)}
        # print(codes)
        encoded_text = [codes[char] for char in text]
        write_file(output_filename, encoded_text, codes)
    
    input_filename = '../lorem.txt'
    output_filename = 'python_huffman_encoded.txt'
    huffman_encode(input_filename, output_filename)
    

    lines of note are:

    heapq.heappush(heap, [lo[0] + hi[0], *lo[1:], *hi[1:]])
    

    and the aforementioned:

    codes = {sym: code for sym, code in build_huffman_tree(freq_dict)}
    

    thank you to anyone who had a look, i think this may now be solved

    i shall leave it open in case anyone has spotted any other mistakes, silly or otherwise

    edit: will now close when able