Consider the following two functions, the first one uses the Windows API functions ReadFile()
and CreateFileW()
, whereas the second function uses fopen()
and fgetws()
, to read a non-English text from a file called data.txt
.
The first function outputs garbage text, whereas the second function outputs the text from the file without any problems.
Notice that fopen()
has ccs=UTF-8
that defines what character encoding to use, whereas read_file_2()
does not have something similar.
DWORD read_file_2()
{
wchar_t wstr[512];
BOOL success = FALSE;
DWORD dwRead, total =0;
HANDLE handle = CreateFileW(L"data.txt",
GENERIC_READ,
0,
NULL,
3,
FILE_ATTRIBUTE_NORMAL,
NULL);
if (handle == INVALID_HANDLE_VALUE)
return -1;
do
{
success = ReadFile(handle, wstr, 20, &dwRead, NULL);
total += dwRead;
} while(!success || dwRead == 0);
wstr[total] = L'\0';
wprintf(L"%ls\n",wstr);
return 0;
}
void read_file_1()
{
wchar_t converted[20];
FILE * ptr;view=msvc-170
ptr = fopen("data.txt", "rt+,ccs=UTF-8");
fgetws(converted, 20, ptr);
wprintf(L"%ls\n", converted);
fclose(ptr);
}
int main()
{
_setmode(fileno(stdin), _O_U8TEXT);
_setmode(fileno(stdout), _O_U8TEXT);
read_file_1();
read_file_2();
}
How does one use ReadFile()
to read a wchar_t
string from a text file and output it to the terminal without turning it into garbage text?
Шифрование.txt ال
퀠킨톸톄킀킾킲킰킽킸♥
Actual content of data.txt
:
Шифрование.txt العربية.txt
You can use MultiByteToWideChar.
#define MALLOC( t, n ) ( ( t* )malloc( sizeof( t ) * n ) )
int total_wchars = MultiByteToWideChar(
CP_UTF8, // CodePage
0, // dwFlags
bytes, // lpMultiByteStr The bytes read using `ReadFile`/`read`.
total_bytes, // cbMultiByte No need for NUL.
NULL, // lpWideCharStr
0 // cchWideChar 0 = Get size incl NUL.
);
if ( total_wchars == 0 ) {
// Error. Use GetLastError() and such.
...
}
LPWSTR wchars = MALLOC( WCHAR, total_wchars );
MultiByteToWideChar(
CP_UTF8, // CodePage
0, // dwFlags
bytes, // lpMultiByteStr
total_bytes, // cbMultiByte
wchars, // lpWideCharStr
total_wchars // cchWideChar
);
Note that if the compiler has wchar_t
,
WCHAR
is wchar_t
LPWSTR
is wchar_t *
LPCWSTR
is const wchar_t *