I am experimenting with `wctomb` in order to convert a `wchar_t` into its UTF-8 equivalent stored in a `char[]`. It works nicely, but not for surrogate characters in the range U+D800 to U+DFFF.
// Receives the return value of each wctomb() call below.
int ret;
// Output buffer for the converted character. It is zero-initialized and one
// byte larger than the longest sequence expected here (4 bytes), so the
// converted bytes stay null-terminated.
// VS gives a warning on wctomb() for buffer overrunning on char mb[4]={0} for some reason ...
char mb[5] = { 0 };
// Request a UTF-8 locale so that wctomb() emits UTF-8.
// NOTE(review): the return value of setlocale() is not checked here.
setlocale(LC_ALL, "en-US.utf8");
// Gives 0xE2 0xAA 0x96 just fine, wctomb returns 3
ret = wctomb(mb, L'\x2A96');
// Lone surrogate: expected 0xED 0xBA 0xA0, but wctomb returns -1, i.e. invalid character
ret = wctomb(mb, L'\xDEA0');
Is there another way to get the UTF-8 form of a lone surrogate character?
I also tried `wctomb_s` (using its `errno_t` return value and passing `&ret`), but it just yields the same outcome ...
While it is not correct to encode lone surrogate codes in UTF-8, this limitation is somewhat artificial and an extension to UTF-8 called WTF-8 supports it with the same encoding method as UTF-8. The resulting byte sequence would be considered invalid by strict UTF-8 decoders. If you intend to keep the converted sequence internally and use a custom (lenient) parser to convert back to 16- or 32-bit codes, you can use these converters:
// Encode one 16-bit surrogate code (U+D800..U+DFFF) as a 3-byte WTF-8 sequence.
// The three bytes are stored at dest[0..2]; no terminator is written.
// Returns the number of bytes stored, which is always 3.
int wtf8_convert_surrogate_code(char *dest, uint16_t cc) {
    // Split the 16-bit value into 4 + 6 + 6 bits, most significant first.
    const unsigned top = cc >> 12;
    const unsigned mid = (cc >> 6) & 0x3F;
    const unsigned low = cc & 0x3F;

    dest[0] = (char)(0xE0 | top);   // lead byte of a 3-byte sequence
    dest[1] = (char)(0x80 | mid);   // first trailing byte
    dest[2] = (char)(0x80 | low);   // second trailing byte
    return 3;
}
Here is an encoder intended as a replacement for the `wctomb` function:
// Encode the code point cc as WTF-8 at dest (no terminator is written).
// Unlike strict UTF-8, every 16-bit value is accepted, lone surrogates included.
// Returns the number of bytes stored (1 to 4), or 0 when cc is above 0x10FFFF,
// in which case nothing is written. dest must have room for up to 4 bytes.
int wtf8_encode(char *dest, uint32_t cc) {
    if (cc >= 0x110000) {
        return 0;
    }
    if (cc < 0x80) {
        dest[0] = (char)cc;
        return 1;
    }

    char *out = dest;
    int len;

    // Lead byte: its prefix announces the sequence length and it carries the
    // most significant payload bits.
    if (cc < 0x800) {
        *out++ = (char)(0xC0 | (cc >> 6));
        len = 2;
    } else if (cc < 0x10000) {
        // accept all 16-bit codes, unlike UTF-8
        *out++ = (char)(0xE0 | (cc >> 12));
        len = 3;
    } else {
        *out++ = (char)(0xF0 | (cc >> 18));
        len = 4;
    }

    // Trailing bytes: remaining payload in 6-bit groups, highest group first.
    for (int shift = 6 * (len - 2); shift >= 0; shift -= 6) {
        *out++ = (char)(0x80 | ((cc >> shift) & 0x3F));
    }
    return len;
}
To convert back to Unicode code-points, you can use this lenient converter:
// Lenient WTF-8 decoder: read one encoded code point from at most max_len
// bytes at p0 and store it in *pc.
// Returns the length of the sequence in bytes (1 to 4), or 0 on an encoding
// error (empty input, invalid lead byte, bad or missing trailing byte,
// overlong form, value above 0x10FFFF). *pc is not written on error.
int wtf8_decode32(const char *p0, size_t max_len, uint32_t *pc) {
    const uint8_t *src = (const uint8_t *)p0;

    if (max_len == 0) {
        return 0;
    }

    const uint32_t lead = src[0];

    if (lead < 0x80) {
        *pc = lead;
        return 1;
    }

    // 0x80..0xC1: naked trailing byte or invalid prefix; above 0xF4: no valid sequence.
    if (lead < 0xC2 || lead > 0xF4) {
        return 0;
    }

    if (lead < 0xE0) {
        // 2-byte sequence
        if (max_len < 2 || (src[1] & 0xC0) != 0x80) {
            return 0;
        }
        *pc = ((lead & 0x1F) << 6) | (src[1] & 0x3Fu);
        return 2;
    }

    if (lead < 0xF0) {
        // 3-byte sequence
        if (max_len < 3
            || (src[1] & 0xC0) != 0x80
            || (src[2] & 0xC0) != 0x80) {
            return 0;
        }
        const uint32_t cp = ((lead & 0x0F) << 12)
                          | ((src[1] & 0x3Fu) << 6)
                          | (src[2] & 0x3Fu);
        // Surrogate codes are deliberately NOT rejected; only overlong forms are.
        if (cp < 0x800) {
            return 0;
        }
        *pc = cp;
        return 3;
    }

    // 4-byte sequence (lead is 0xF0..0xF4)
    if (max_len < 4
        || (src[1] & 0xC0) != 0x80
        || (src[2] & 0xC0) != 0x80
        || (src[3] & 0xC0) != 0x80) {
        return 0;
    }
    const uint32_t cp = ((lead & 0x07) << 18)
                      | ((src[1] & 0x3Fu) << 12)
                      | ((src[2] & 0x3Fu) << 6)
                      | (src[3] & 0x3Fu);
    // Reject overlong forms and values beyond the Unicode range.
    if (cp < 0x10000 || cp >= 0x110000) {
        return 0;
    }
    *pc = cp;
    return 4;
}
You can convert WTF-8 sequences to UTF-16 with this function:
// Convert one WTF-8 sequence to UTF-16.
// *pp points at the input, of which at most max_len bytes are examined.
// pc receives the output and must have room for 2 code units.
// Returns the number of 16-bit units stored in pc: 1 for a code point below
// 0x10000 (lone surrogates included), 2 for a surrogate pair, or 0 on an
// encoding error.
// On success *pp is advanced past the consumed bytes; on error neither *pp
// nor pc is modified.
int wtf8_decode16(const char **pp, size_t max_len, uint16_t *pc) {
    if (!max_len)
        return 0;
    const uint8_t *p = (const uint8_t *)*pp;
    uint32_t c = *p++;
    if (c < 0x80) {
        *pc = (uint16_t)c;
        *pp = (const char *)p;
        return 1;
    } else if (c < 0xC2) {
        // invalid prefix byte or naked trailing byte
    } else if (c < 0xE0) {
        if (max_len >= 2 && p[0] >= 0x80 && p[0] <= 0xBF) {
            *pc = (uint16_t)(((c - 0xC0) << 6) + (p[0] - 0x80));
            *pp = (const char *)(p + 1);
            return 1;
        }
    } else if (c < 0xF0) {
        if (max_len >= 3
            && p[0] >= 0x80 && p[0] <= 0xBF
            && p[1] >= 0x80 && p[1] <= 0xBF) {
            c = ((c - 0xE0) << 12) + ((p[0] - 0x80) << 6) + (p[1] - 0x80);
            // here we purposely DO NOT test for surrogate codes;
            // only overlong encodings (values below 0x800) are rejected
            if (c >= 0x800) {
                *pc = (uint16_t)c;
                *pp = (const char *)(p + 2);
                return 1;
            }
        }
    } else if (c <= 0xF4) {
        if (max_len >= 4
            && p[0] >= 0x80 && p[0] <= 0xBF
            && p[1] >= 0x80 && p[1] <= 0xBF
            && p[2] >= 0x80 && p[2] <= 0xBF) {
            c = ((c - 0xF0) << 18) + ((p[0] - 0x80) << 12) +
                ((p[1] - 0x80) << 6) + (p[2] - 0x80);
            if (c >= 0x10000 && c < 0x110000) {
                // Build the surrogate pair: subtract 0x10000 to get a 20-bit
                // offset, whose high 10 bits go to the lead surrogate and
                // whose low 10 bits go to the trail surrogate.
                c -= 0x10000;
                pc[0] = (uint16_t)(0xD800 + (c >> 10));
                pc[1] = (uint16_t)(0xDC00 + (c & 0x3FF));
                *pp = (const char *)(p + 3);
                return 2;
            }
        }
    }
    return 0;
}