I have code that finds some words in a .txt file (in the range of 9-25), then finds those words in another .txt file, and saves them into a .txt file.
The problem is with multi-byte characters. The code works with symbols like あきる or 會, but it doesn't work with letters like é. In a .txt file, it's just the letter e.
Also, I don't know if it helps, but when I try to print all the words into the console, it only prints words with English characters.
namespace fs = std::filesystem;
// Returns true for code points in the Unicode "Combining Diacritical Marks"
// block (U+0300..U+036F), e.g. the accent of a decomposed "é" (e + U+0301).
static bool isCombiningDiacritic(wchar_t ch) {
    return ch >= 0x0300 && ch <= 0x036F;
}

/**
 * Tokenizes a UTF-8 text file into lowercase words, looks each word up in
 * `dict`, and, when enough words match, writes a report file into `resultPath`.
 *
 * Tokenizing: letters (per iswalpha) are appended to the current word, and a
 * whitespace character ends it. Any other character is skipped without ending
 * the word, except that a combining diacritical mark following a letter is
 * kept, so a decomposed "é" is no longer reduced to a plain "e".
 *
 * Report layout: the input path on the first line, then one line per matched
 * word, with a single empty line standing in for each run of unmatched words,
 * followed by a trailing empty line.
 *
 * @param filePath   UTF-8 encoded text file to scan.
 * @param resultPath Directory in which the report file is created; its name is
 *                   randomName() + L"_Line_Matches.txt".
 * @param dict       Words to look for. Compared against lowercased tokens.
 */
void processFile(const fs::path filePath, const fs::path resultPath, const std::unordered_set<std::wstring>& dict) {
    // Match-count window that triggers writing the report.
    constexpr int kMinMatches = 9;
    constexpr int kMaxMatches = 25;

    std::wifstream fileIn(filePath, std::ios::binary);
    // Decode the stream as UTF-8; the locale takes ownership of the facet.
    std::locale utf8_locale(fileIn.getloc(), new std::codecvt_utf8<wchar_t>);
    fileIn.imbue(utf8_locale);

    std::vector<std::wstring> words;
    std::wstring word;
    wchar_t ch;
    while (fileIn.get(ch)) {
        ch = towlower(ch);
        if (iswalpha(ch) || (!word.empty() && isCombiningDiacritic(ch))) {
            // A combining mark is only kept when it has a base letter to attach to.
            word += ch;
        }
        else if (iswspace(ch) && !word.empty()) {
            words.push_back(std::move(word));
            word.clear();  // put the moved-from string back into a known state
        }
    }
    if (!word.empty()) {
        words.push_back(std::move(word));
    }
    fileIn.close();

    // Debug dump of every token. NOTE(review): std::wcout is not imbued here,
    // which is presumably why non-ASCII words do not show up in the console.
    for (const auto& token : words) {
        std::wcout << token << L'\n';
    }

    // indexes holds the position in `words` of every match, with a single -1
    // marking each run of consecutive misses.
    std::vector<int> indexes;
    bool lastWasMiss = false;
    bool enoughMatches = false;
    int matchCount = 0;
    for (std::size_t i = 0; i < words.size(); ++i) {
        if (dict.find(words[i]) == dict.end()) {
            if (!lastWasMiss) {
                indexes.push_back(-1);
                lastWasMiss = true;
            }
            continue;
        }
        indexes.push_back(static_cast<int>(i));
        lastWasMiss = false;
        ++matchCount;
        // NOTE(review): matchCount is never reset on a miss, so this tests the
        // running total of matches, not matches in a row - confirm the intent.
        if (matchCount >= kMinMatches && matchCount <= kMaxMatches) {
            enoughMatches = true;
        }
    }

    if (!enoughMatches) {
        return;
    }

    std::wofstream file(resultPath / (randomName() + L"_Line_Matches" + L".txt"), std::ios::binary);
    // Encode the report as UTF-8 as well.
    file.imbue(utf8_locale);

    std::wstring content = filePath.wstring();
    content += L'\n';
    for (const int index : indexes) {
        if (index != -1) {
            content += words[static_cast<std::size_t>(index)];
        }
        content += L'\n';
    }
    content += L'\n';
    file << content;
    file.close();
}
/**
 * Reads the file at `path` as UTF-8 and returns the set of distinct
 * whitespace-separated tokens it contains. Tokens are stored exactly as read
 * (no case folding). A file that cannot be opened yields an empty set.
 */
const std::unordered_set<std::wstring> loadDict(const fs::path path) {
    std::wifstream dictStream(path, std::ios::binary);
    // Layer a UTF-8 decoding facet on top of the stream's current locale;
    // the locale takes ownership of the facet.
    dictStream.imbue(std::locale(dictStream.getloc(), new std::codecvt_utf8<wchar_t>));

    std::unordered_set<std::wstring> entries;
    for (std::wstring token; dictStream >> token;) {
        entries.insert(token);
    }
    // The stream is closed by its destructor when the function returns.
    return entries;
}
I tried to get the é letter by just copying the .txt file and creating another .txt file, and it works, so the problem is with my indexes array, which holds the positions of the words that should be written to the other .txt file.
A commenter gave the answer: basically, I was putting characters into the std::vector<std::wstring> words; array only one at a time, and some characters may require more than one wchar_t, so it's better to rewrite the while(fileIn.get(ch)) loop like this:
// Read whitespace-delimited tokens instead of single characters, so that a
// letter made of several wchar_t units stays together inside its token.
while (fileIn >> word) {
    // Lowercase the token in place.
    for (wchar_t& unit : word) {
        unit = towlower(unit);
    }

    const bool startsWithLetter = iswalpha(word[0]);
    if (startsWithLetter) {
        // Keep the whole token; `word` is refilled by the next extraction.
        words.push_back(std::move(word));
    }
    else if (!word.empty()) {
        // Token starts with a non-alphabetic character: keep only that character.
        words.push_back(std::wstring(1, word[0]));
    }
}