Tags: c++, unicode, encoding, utf-8, filestream

Multi-byte characters and C++ Streams


I have code that finds some words in a .txt file (in the range of 9-25), and then finds those words in another .txt file, and saves them into a .txt file.

The problem is with multi-byte characters. The code works with symbols like あきる, but doesn't work with accented letters like é. In a .txt file, it's just the letter e.

Also, I don't know if it helps, but when I try to print all the words into the console, it only prints words with English characters.

namespace fs = std::filesystem;
/**
 * Scans the UTF-8 text file `filePath` for words that occur in `dict` and, if
 * enough of them are found, writes a report file into the `resultPath` directory.
 *
 * The input is split into lower-cased words made of alphabetic characters; a
 * word ends at the next whitespace character, and any other character
 * (digits, punctuation) is skipped without ending the word. Combining
 * diacritical marks that follow a letter are kept as part of the word, so a
 * decomposed "é" (e + U+0301) is not truncated to a bare "e".
 *
 * The report is named `randomName() + L"_Line_Matches" + L".txt"` and holds the
 * source path, then one line per dictionary word in file order, with a single
 * empty line standing in for every run of non-dictionary words.
 *
 * @param filePath   UTF-8 encoded text file to scan.
 * @param resultPath Directory in which the report file is created.
 * @param dict       Words to look for; compared against the lower-cased words.
 */
void processFile(const fs::path& filePath, const fs::path& resultPath, const std::unordered_set<std::wstring>& dict) {
    constexpr int kGap = -1;         // marker in `indexes` for a run of unmatched words
    constexpr int kMinMatches = 9;
    constexpr int kMaxMatches = 25;

    std::wifstream fileIn(filePath, std::ios::binary);
    if (!fileIn) {
        return;  // unreadable input yields no words and therefore no report
    }

    // The locale takes ownership of the facet, so the raw `new` is not a leak.
    const std::locale utf8_locale(fileIn.getloc(), new std::codecvt_utf8<wchar_t>);
    fileIn.imbue(utf8_locale);

    // Unicode combining-mark blocks. These code points are not alphabetic on
    // their own, but they belong to the letter that precedes them.
    const auto isCombiningMark = [](wchar_t c) {
        return (c >= 0x0300 && c <= 0x036F)    // Combining Diacritical Marks
            || (c >= 0x1AB0 && c <= 0x1AFF)    // ... Extended
            || (c >= 0x1DC0 && c <= 0x1DFF)    // ... Supplement
            || (c >= 0x20D0 && c <= 0x20FF)    // ... for Symbols
            || (c >= 0xFE20 && c <= 0xFE2F);   // Combining Half Marks
    };

    std::vector<std::wstring> words;
    std::wstring word;
    wchar_t ch;
    while (fileIn.get(ch)) {
        ch = static_cast<wchar_t>(towlower(ch));

        if (iswalpha(ch) || (!word.empty() && isCombiningMark(ch))) {
            word += ch;
        }
        else if (iswspace(ch) && !word.empty()) {
            words.push_back(std::move(word));
            word.clear();  // bring the moved-from string back to a known state
        }
    }
    if (!word.empty()) {
        words.push_back(std::move(word));
    }
    fileIn.close();

    // Debug dump of every word. NOTE(review): non-ASCII words presumably only
    // show up if std::wcout has been set up for the console encoding elsewhere.
    for (const auto& w : words) {
        std::wcout << w << L'\n';
    }

    std::vector<int> indexes;
    bool lastWasGap = false;
    bool enoughMatches = false;
    int matchCount = 0;

    for (std::size_t i = 0; i < words.size(); ++i) {
        if (dict.find(words[i]) == dict.end()) {
            // Collapse consecutive misses into a single gap marker.
            if (!lastWasGap) {
                indexes.push_back(kGap);
                lastWasGap = true;
            }
            continue;
        }

        indexes.push_back(static_cast<int>(i));
        lastWasGap = false;
        ++matchCount;
        // NOTE(review): matchCount is cumulative and never reset on a miss, so
        // this becomes true once the file has kMinMatches matches in total and
        // the upper bound has no effect - confirm that this is intended.
        if (matchCount >= kMinMatches && matchCount <= kMaxMatches) {
            enoughMatches = true;
        }
    }

    if (!enoughMatches) {
        return;
    }

    std::wofstream file(resultPath / (randomName() + L"_Line_Matches" + L".txt"), std::ios::binary);
    // Encode the report as UTF-8 as well.
    file.imbue(utf8_locale);

    std::wstring content;
    content += filePath.wstring();
    content += L'\n';
    for (const int i : indexes) {
        if (i != kGap) {
            content += words[i];
        }
        content += L'\n';
    }
    content += L'\n';

    file << content;
}

/**
 * Loads a whitespace-separated UTF-8 word list into a set for fast lookups.
 *
 * A leading UTF-8 byte-order mark is skipped; otherwise it would be decoded
 * as U+FEFF and glued onto the first word, which could then never match.
 * Every entry is lower-cased, because processFile lower-cases each word
 * before looking it up and mixed-case entries would be unreachable.
 *
 * @param path UTF-8 encoded dictionary file.
 * @return The set of lower-cased words; empty if the file cannot be read.
 */
const std::unordered_set<std::wstring> loadDict(const std::filesystem::path& path) {
    std::wifstream fileIn(path, std::ios::binary);

    // The locale takes ownership of the facet, so the raw `new` is not a leak.
    const std::locale utf8_locale(
        fileIn.getloc(),
        new std::codecvt_utf8<wchar_t, 0x10ffff, std::consume_header>);
    fileIn.imbue(utf8_locale);

    std::unordered_set<std::wstring> words;
    std::wstring word;

    while (fileIn >> word) {
        for (wchar_t& c : word) {
            c = static_cast<wchar_t>(towlower(c));
        }
        // `word` is reassigned by the next extraction, so moving it is safe.
        words.insert(std::move(word));
    }

    return words;
}

I tried to get the letter by just copying the .txt file into another .txt file, and that works, so the problem is with my indexes array, which holds the positions of the characters that should be written to the other .txt file.


Solution

  • Someone in the comments gave the answer: basically, I was putting characters into the std::vector<std::wstring> words; array only one at a time, and some characters may require more than one wchar_t. It's better to rewrite the while(fileIn.get(ch)) loop like this:

        // Read the file one whitespace-delimited token at a time.
        while (fileIn >> word) {
            // Lower-case the token in place, one wide character at a time
            for (auto& c : word) {
                c = towlower(c);
            }

            const wchar_t first = word[0];
            if (!iswalpha(first)) {
                // Token starts with a non-alphabetic character: keep only that character
                if (!word.empty()) {
                    words.push_back(std::wstring(1, first));
                }
                continue;
            }

            // Token starts with a letter: keep the whole token
            words.push_back(std::move(word));
        }