c++lzw

Last symbol is duplicated in LZW


I tried implementing LZW encoding/decoding and ended up with the following code

#include <cstdint>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <unordered_map>

using Index = std::int16_t;

void encode(std::istream &input, std::ostream &output) {
  Index index{0};
  std::unordered_map<std::string, Index> dictionary{};

  for (int i = 0; i < 256; ++i) {
    dictionary[{static_cast<char>(i & 0xFF)}] = index++;
  }

  char k;
  input.read(&k, sizeof(char));
  std::string buffer{""};
  while (input) {
    const auto tmp = buffer + k;
    if (dictionary.contains(tmp))
      buffer = tmp;
    else {
      dictionary[tmp] = index++;
      output.write(reinterpret_cast<const char *>(&dictionary[buffer]),
                   sizeof(Index));
      buffer = {k};
    }
    input.read(&k, sizeof(char));
  }
  output.write(reinterpret_cast<const char *>(&dictionary[buffer]),
               sizeof(Index));
}

void decode(std::istream &input, std::ostream &output) {
  Index index{0};
  std::unordered_map<Index, std::string> dictionary{};

  for (int i = 0; i < 256; ++i) {
    dictionary[index++] = {static_cast<char>(i & 0xFF)};
  }

  Index k;
  input.read(reinterpret_cast<char *>(&k), sizeof(Index));
  output << dictionary[k];

  Index old{k};
  std::string buffer;
  while (input) {
    input.read(reinterpret_cast<char *>(&k), sizeof(Index));
    buffer = dictionary[old];

    std::string tmp;
    if (dictionary.contains(k)) {
      const auto &entry = dictionary[k];
      tmp = buffer + entry.front();
      output << entry;
    } else {
      tmp = buffer + buffer.front();
      output << tmp;
    }
    dictionary[index++] = tmp;
    old = k;
  }
}

It kinda works but the last character after decoding is duplicated:

int main() {
  std::string input{"hello world!"};
  std::istringstream iss{input};
  std::stringstream ss{};
  encode(iss, ss);
  decode(ss, std::cout);
}

Outputs hello world!! instead of hello world!. I can't find my mistake, maybe someone else can?


Solution

  • As @NathanOliver pointed out in the comments changing the decode loop to

    while (input.read(reinterpret_cast<char *>(&k), sizeof(Index))) { 
    

    fixes it.