Tags: c, utf-8, c99, surrogate-pairs

How do you convert a single surrogate character without a pair into its UTF-8 equivalent?


I am experimenting with wctomb in order to convert a wchar_t into its UTF-8 equivalent stored in a char[]. It works nicely, but not for surrogate characters ranging U+D800 to U+DFFF.

int ret;
// null-terminated
// VS gives a warning on wctomb() for buffer overrunning on char mb[4]={0} for some reason ...
char mb[5] = { 0 };
setlocale(LC_ALL, "en-US.utf8");
// Gives 0xE2 0xAA 0x96 just fine, wctomb returns 3
ret = wctomb(mb, L'\x2A96');
// expected 0xED 0xBA 0xA0, but wctomb returns -1, i.e. invalid character
ret = wctomb(mb, L'\xDEA0');

Is there another way to get the UTF-8 form of the surrogate character alone?
I also tried wctomb_s (checking the errno_t return value and retrieving the converted length via &ret), but it yields the same outcome ...


Solution

  • While it is not correct to encode lone surrogate codes in UTF-8, this limitation is somewhat artificial and an extension to UTF-8 called WTF-8 supports it with the same encoding method as UTF-8. The resulting byte sequence would be considered invalid by strict UTF-8 decoders. If you intend to keep the converted sequence internally and use a custom (lenient) parser to convert back to 16- or 32-bit codes, you can use these converters:

    // Emit the WTF-8 (UTF-8-shaped) 3-byte sequence for a single 16-bit
    // code unit, including lone surrogates U+D800..U+DFFF.
    // Always writes 3 bytes to dest and returns 3 (bytes to skip).
    int wtf8_convert_surrogate_code(char *dest, uint16_t cc) {
        dest[0] = (char)(0xE0 | (cc >> 12));         // lead byte: 1110xxxx
        dest[1] = (char)(0x80 | ((cc >> 6) & 0x3F)); // continuation: 10xxxxxx
        dest[2] = (char)(0x80 | (cc & 0x3F));        // continuation: 10xxxxxx
        return 3;
    }
    

    Here is an encoder intended as a replacement for the wctomb function:

    // Encode a code point (0..0x10FFFF) as WTF-8 at dest.
    // Unlike strict UTF-8, surrogate code points U+D800..U+DFFF are
    // accepted and encoded with the ordinary 3-byte pattern.
    // Returns the number of bytes written (1..4), or 0 if cc is out of range.
    int wtf8_encode(char *dest, uint32_t cc) {
        uint8_t *out = (uint8_t *)dest;
        int len;

        if (cc < 0x80) {                        // 1 byte: plain ASCII
            out[0] = (uint8_t)cc;
            return 1;
        }
        if (cc < 0x800) {                       // 2 bytes: 110xxxxx
            len = 2;
            out[0] = (uint8_t)(0xC0 | (cc >> 6));
        } else if (cc < 0x10000) {              // 3 bytes: 1110xxxx
            // all 16-bit codes accepted, including lone surrogates (WTF-8)
            len = 3;
            out[0] = (uint8_t)(0xE0 | (cc >> 12));
        } else if (cc < 0x110000) {             // 4 bytes: 11110xxx
            len = 4;
            out[0] = (uint8_t)(0xF0 | (cc >> 18));
        } else {
            return 0;                           // beyond Unicode range
        }
        // fill continuation bytes (10xxxxxx) from the low bits upward
        for (int i = len - 1; i > 0; i--) {
            out[i] = (uint8_t)(0x80 | (cc & 0x3F));
            cc >>= 6;
        }
        return len;
    }
    

    To convert back to Unicode code-points, you can use this lenient converter:

    // Decode one WTF-8 sequence (at most max_len bytes) into a 32-bit
    // code point stored in *pc.
    // Returns the number of bytes consumed (1..4), or 0 on malformed input;
    // *pc is only written on success.
    // Lenient on purpose: 3-byte encodings of surrogates are accepted,
    // but overlong forms and values past U+10FFFF are still rejected.
    int wtf8_decode32(const char *p0, size_t max_len, uint32_t *pc) {
        const uint8_t *s = (const uint8_t *)p0;
        uint32_t lead, c;

        if (max_len == 0)
            return 0;

        lead = s[0];
        if (lead < 0x80) {                      // 1-byte: ASCII
            *pc = lead;
            return 1;
        }
        if (lead < 0xC2)                        // stray continuation byte, or
            return 0;                           // overlong C0/C1 lead
        if (lead < 0xE0) {                      // 2-byte sequence
            if (max_len < 2 || (s[1] & 0xC0) != 0x80)
                return 0;
            *pc = ((lead - 0xC0) << 6) | (s[1] & 0x3F);
            return 2;
        }
        if (lead < 0xF0) {                      // 3-byte sequence
            if (max_len < 3
            ||  (s[1] & 0xC0) != 0x80
            ||  (s[2] & 0xC0) != 0x80)
                return 0;
            c = ((lead & 0x0F) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F);
            if (c < 0x800)                      // reject overlong form
                return 0;
            // surrogate codes are deliberately NOT rejected here
            *pc = c;
            return 3;
        }
        if (lead <= 0xF4) {                     // 4-byte sequence
            if (max_len < 4
            ||  (s[1] & 0xC0) != 0x80
            ||  (s[2] & 0xC0) != 0x80
            ||  (s[3] & 0xC0) != 0x80)
                return 0;
            c = ((lead - 0xF0) << 18) | ((s[1] & 0x3F) << 12) |
                ((s[2] & 0x3F) << 6) | (s[3] & 0x3F);
            if (c >= 0x10000 && c < 0x110000) { // reject overlong / out of range
                *pc = c;
                return 4;
            }
        }
        return 0;                               // F5..FF leads are invalid
    }
    

    You can convert WTF-8 sequences to UTF-16 with this function:

    // Decode one WTF-8 sequence from *pp into UTF-16 code units at pc.
    // On success: writes 1 unit (BMP code point, including lone surrogates)
    // or 2 units (a surrogate pair for U+10000..U+10FFFF), advances *pp past
    // the consumed bytes, and returns the number of units written.
    // On malformed input (or max_len too small): returns 0 and leaves *pp
    // unchanged, so the caller can diagnose or resynchronize.
    int wtf8_decode16(const char **pp, size_t max_len, uint16_t *pc) {
        if (!max_len)
            return 0;

        const uint8_t *p = (const uint8_t *)*pp;
        uint32_t c = *p++;
        if (c < 0x80) {
            // 1-byte: ASCII
            *pc = (uint16_t)c;
            *pp = (const char *)p;
            return 1;
        } else
        if (c < 0xC2) {
            // invalid prefix byte (overlong C0/C1) or naked trailing byte
        } else
        if (c < 0xE0) {
            // 2-byte sequence -> U+0080..U+07FF
            if (max_len >= 2 && p[0] >= 0x80 && p[0] <= 0xBF) {
                *pc = (uint16_t)(((c - 0xC0) << 6) + (p[0] - 0x80));
                *pp = (const char *)(p + 1);
                return 1;
            }
        } else
        if (c < 0xF0) {
            // 3-byte sequence -> U+0800..U+FFFF
            if (max_len >= 3
            &&  p[0] >= 0x80 && p[0] <= 0xBF
            &&  p[1] >= 0x80 && p[1] <= 0xBF) {
                c = ((c - 0xE0) << 12) + ((p[0] - 0x80) << 6) + (p[1] - 0x80);
                // here we purposely DO NOT test for surrogate codes,
                // so WTF-8-encoded lone surrogates pass through as-is
                if (c >= 0x800) {   // reject overlong forms
                    *pc = (uint16_t)c;
                    *pp = (const char *)(p + 2);
                    return 1;
                }
            }
        } else
        if (c <= 0xF4) {
            // 4-byte sequence -> U+10000..U+10FFFF, emitted as a surrogate pair
            if (max_len >= 4
            &&  p[0] >= 0x80 && p[0] <= 0xBF
            &&  p[1] >= 0x80 && p[1] <= 0xBF
            &&  p[2] >= 0x80 && p[2] <= 0xBF) {
                c = ((c - 0xF0) << 18) + ((p[0] - 0x80) << 12) +
                    ((p[1] - 0x80) << 6) + (p[2] - 0x80);
                if (c >= 0x10000 && c < 0x110000) {
                    // BUG FIX: the surrogate-pair bias is 0x10000, not 0x1000
                    // (UTF-16: high = D800 + (c-10000)>>10, low = DC00 + low 10 bits)
                    pc[0] = (uint16_t)(0xD800 + ((c - 0x10000) >> 10));
                    pc[1] = (uint16_t)(0xDC00 + ((c - 0x10000) & 0x3FF));
                    *pp = (const char *)(p + 3);
                    return 2;
                }
            }
        }
        return 0;
    }