Tags: c, utf-8, c99, surrogate-pairs

How do you convert a single surrogate character without a pair into its UTF-8 equivalent?


I am experimenting with wctomb in order to convert a wchar_t into its UTF-8 equivalent stored in a char[]. It works nicely, but not for surrogate characters ranging U+D800 to U+DFFF.

int ret;
// null-terminated
// VS gives a warning on wctomb() for buffer overrunning on char mb[4]={0} for some reason ...
char mb[5] = { 0 };
setlocale(LC_ALL, "en-US.utf8");
// Gives 0xE2 0xAA 0x96 just fine, wctomb returns 3
ret = wctomb(mb, L'\x2A96');
// expected 0xED 0xBA 0xA0, but wctomb returns -1, i.e. invalid character
ret = wctomb(mb, L'\xDEA0');

Is there another way to get the UTF-8 form of the surrogate character alone?
I also tried wctomb_s (checking the errno_t return value and retrieving the converted length via &ret), but it yields the same outcome ...


Solution

  • While it is not correct to encode lone surrogate codes in UTF-8, this limitation is somewhat artificial and an extension to UTF-8 called WTF-8 supports it with the same encoding method as UTF-8. The resulting byte sequence would be considered invalid by strict UTF-8 decoders. If you intend to keep the converted sequence internally and use a custom (lenient) parser to convert back to 16- or 32-bit codes, you can use these converters:

    // Emit the WTF-8 (UTF-8-shaped) 3-byte sequence for a single 16-bit
    // code unit, including lone surrogates U+D800..U+DFFF.
    // Always writes 3 bytes to dest and returns 3 (bytes to skip).
    int wtf8_convert_surrogate_code(char *dest, uint16_t cc) {
        dest[0] = (char)(0xE0 | (cc >> 12));         // lead byte: 1110xxxx
        dest[1] = (char)(0x80 | ((cc >> 6) & 0x3F)); // continuation: 10xxxxxx
        dest[2] = (char)(0x80 | (cc & 0x3F));        // continuation: 10xxxxxx
        return 3;
    }
    

    Here is an encoder intended as a replacement for the wctomb function:

    // Encode a code point (0..0x10FFFF) as WTF-8 at dest.
    // Unlike strict UTF-8, surrogate code points U+D800..U+DFFF are
    // accepted and encoded with the ordinary 3-byte pattern.
    // Returns the number of bytes written (1..4), or 0 if cc is out of range.
    int wtf8_encode(char *dest, uint32_t cc) {
        uint8_t *out = (uint8_t *)dest;
        int len;

        if (cc < 0x80) {                        // 1 byte: plain ASCII
            out[0] = (uint8_t)cc;
            return 1;
        }
        if (cc < 0x800) {                       // 2 bytes: 110xxxxx
            len = 2;
            out[0] = (uint8_t)(0xC0 | (cc >> 6));
        } else if (cc < 0x10000) {              // 3 bytes: 1110xxxx
            // all 16-bit codes accepted, including lone surrogates (WTF-8)
            len = 3;
            out[0] = (uint8_t)(0xE0 | (cc >> 12));
        } else if (cc < 0x110000) {             // 4 bytes: 11110xxx
            len = 4;
            out[0] = (uint8_t)(0xF0 | (cc >> 18));
        } else {
            return 0;                           // beyond Unicode range
        }
        // fill continuation bytes (10xxxxxx) from the low bits upward
        for (int i = len - 1; i > 0; i--) {
            out[i] = (uint8_t)(0x80 | (cc & 0x3F));
            cc >>= 6;
        }
        return len;
    }
    

    To convert back to Unicode code-points, you can use this lenient converter:

    // Decode one WTF-8 sequence (at most max_len bytes) into a 32-bit
    // code point stored in *pc.
    // Returns the number of bytes consumed (1..4), or 0 on malformed input;
    // *pc is only written on success.
    // Lenient on purpose: 3-byte encodings of surrogates are accepted,
    // but overlong forms and values past U+10FFFF are still rejected.
    int wtf8_decode32(const char *p0, size_t max_len, uint32_t *pc) {
        const uint8_t *s = (const uint8_t *)p0;
        uint32_t lead, c;

        if (max_len == 0)
            return 0;

        lead = s[0];
        if (lead < 0x80) {                      // 1-byte: ASCII
            *pc = lead;
            return 1;
        }
        if (lead < 0xC2)                        // stray continuation byte, or
            return 0;                           // overlong C0/C1 lead
        if (lead < 0xE0) {                      // 2-byte sequence
            if (max_len < 2 || (s[1] & 0xC0) != 0x80)
                return 0;
            *pc = ((lead - 0xC0) << 6) | (s[1] & 0x3F);
            return 2;
        }
        if (lead < 0xF0) {                      // 3-byte sequence
            if (max_len < 3
            ||  (s[1] & 0xC0) != 0x80
            ||  (s[2] & 0xC0) != 0x80)
                return 0;
            c = ((lead & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
            if (c < 0x800)                      // reject overlong form
                return 0;
            // surrogate codes are deliberately NOT rejected here
            *pc = c;
            return 3;
        }
        if (lead <= 0xF4) {                     // 4-byte sequence
            if (max_len < 4
            ||  (s[1] & 0xC0) != 0x80
            ||  (s[2] & 0xC0) != 0x80
            ||  (s[3] & 0xC0) != 0x80)
                return 0;
            c = ((lead - 0xF0) << 18) | ((s[1] & 0x3F) << 12) |
                ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
            if (c >= 0x10000 && c < 0x110000) { // reject overlong / out of range
                *pc = c;
                return 4;
            }
        }
        return 0;                               // F5..FF leads are invalid
    }
    

    You can convert WTF-8 sequences to UTF-16 with this function:

    // Decode one WTF-8 sequence from *pp into UTF-16 code units at pc.
    // On success: writes 1 unit (BMP code point, including lone surrogates)
    // or 2 units (a surrogate pair for U+10000..U+10FFFF), advances *pp past
    // the consumed bytes, and returns the number of units written.
    // On malformed input (or max_len too small): returns 0 and leaves *pp
    // unchanged, so the caller can diagnose or resynchronize.
    int wtf8_decode16(const char **pp, size_t max_len, uint16_t *pc) {
        if (!max_len)
            return 0;

        const uint8_t *p = (const uint8_t *)*pp;
        uint32_t c = *p++;
        if (c < 0x80) {
            // 1-byte: ASCII
            *pc = (uint16_t)c;
            *pp = (const char *)p;
            return 1;
        } else
        if (c < 0xC2) {
            // invalid prefix byte (overlong C0/C1) or naked trailing byte
        } else
        if (c < 0xE0) {
            // 2-byte sequence -> U+0080..U+07FF
            if (max_len >= 2 && p[0] >= 0x80 && p[0] <= 0xBF) {
                *pc = (uint16_t)(((c - 0xC0) << 6) + (p[0] - 0x80));
                *pp = (const char *)(p + 1);
                return 1;
            }
        } else
        if (c < 0xF0) {
            // 3-byte sequence -> U+0800..U+FFFF
            if (max_len >= 3
            &&  p[0] >= 0x80 && p[0] <= 0xBF
            &&  p[1] >= 0x80 && p[1] <= 0xBF) {
                c = ((c - 0xE0) << 12) + ((p[0] - 0x80) << 6) + (p[1] - 0x80);
                // here we purposely DO NOT test for surrogate codes,
                // so WTF-8-encoded lone surrogates pass through as-is
                if (c >= 0x800) {   // reject overlong forms
                    *pc = (uint16_t)c;
                    *pp = (const char *)(p + 2);
                    return 1;
                }
            }
        } else
        if (c <= 0xF4) {
            // 4-byte sequence -> U+10000..U+10FFFF, emitted as a surrogate pair
            if (max_len >= 4
            &&  p[0] >= 0x80 && p[0] <= 0xBF
            &&  p[1] >= 0x80 && p[1] <= 0xBF
            &&  p[2] >= 0x80 && p[2] <= 0xBF) {
                c = ((c - 0xF0) << 18) + ((p[0] - 0x80) << 12) +
                    ((p[1] - 0x80) << 6) + (p[2] - 0x80);
                if (c >= 0x10000 && c < 0x110000) {
                    // BUG FIX: the surrogate-pair bias is 0x10000, not 0x1000
                    // (UTF-16: high = D800 + (c-10000)>>10, low = DC00 + low 10 bits)
                    pc[0] = (uint16_t)(0xD800 + ((c - 0x10000) >> 10));
                    pc[1] = (uint16_t)(0xDC00 + ((c - 0x10000) & 0x3FF));
                    *pp = (const char *)(p + 3);
                    return 2;
                }
            }
        }
        return 0;
    }