Reading UTF-8 encoded files with fopen in C

I have created a text file containing the following characters to test UTF-8 encoding:

%gÁüĳȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒

I have also written this C program to open the file and read it:

#pragma warning(disable:4996)

#include <stdio.h>
#include <stdlib.h>

/*
 * Reproducer from the question: opens inputtest.txt, reads at most one line
 * (up to 49 bytes) with fgets() and prints the buffer twice, first with
 * puts() and then with printf().
 */
int main() {
    FILE *ptr;
    /*
     * NOTE(review): ", ccs=UTF-8" is not a standard C mode string; it looks
     * like the Microsoft CRT extension. Presumably a stream opened this way
     * expects the wide-character functions rather than the byte-oriented
     * fgets() used below -- confirm against the MSVC fopen documentation.
     */
    ptr = fopen("inputtest.txt", "r, ccs=UTF-8");
    char input[50];
    if (ptr == NULL)
        perror("Error opening file");
    else {
        if (fgets(input, 50, ptr) != NULL) {
            puts(input);
        }
        /*
         * NOTE(review): input is passed as the FORMAT string. The test file
         * starts with "%g", so printf() sees a conversion with no matching
         * argument. This call is also reached when fgets() returned NULL,
         * in which case input was never written.
         */
        printf(input);
        fclose(ptr);
    }
}

If I don't use ccs=UTF-8, I get some unreadable characters. With it, the program crashes with exit code -1073740791 (0xC0000409). Also, after switching to wchar_t and fgetws, the program's output was just %. Note: I am using Windows 11 and Visual Studio 2022, and I need to read multi-language characters.

Solution

Consider using fgetws(3) instead, and call setlocale(3) before it. If you treat each char as one character, you are limited to ASCII or, at most, a single-byte character set. And, of course, read into wchar_t instead of char.

However, UTF-8 text is just a sequence of bytes: the bytes can be read as bytes and printed as bytes. You can read and write them without interpreting them (unless, of course, you want to interpret them):

#include <stdio.h>
/*
 * Copy standard input to standard output one byte at a time.
 * The bytes are never interpreted, so multi-byte UTF-8 sequences pass
 * through unchanged.
 */
int main()
{
    int byte;

    for (byte = fgetc(stdin); byte != EOF; byte = fgetc(stdin)) {
        putchar(byte);
    }
}

This should work:

$ a.out <<EOF
> %gÁüĳȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
> EOF
%gÁüĳȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
$ _

A better approach (one that decodes its UTF-8 input and prints it as plain ASCII, turning non-ASCII characters into \uXXXX escapes) is shown below:

#include <ctype.h>
#include <stdio.h>

/* Code point (U+FFFD) printed in place of input bytes that are not valid UTF-8. */
#define INCORRECT 0xfffd

/*
 * Write one code point to stdout using ASCII characters only.
 *
 *   - '\n', '\r', '\t', '\b' and '\f' are written as they are;
 *   - a backslash and any other ASCII control character are written as a
 *     \uXXXX escape, so the output stays unambiguous;
 *   - every other value below 0x80 is written as itself;
 *   - values below 0x10000 become \uXXXX, larger ones \UXXXXXXXX.
 *
 * c: the code point to print.
 */
void out_char(unsigned c)
{
    const int is_plain_space =
        c == '\n' || c == '\r' || c == '\t' || c == '\b' || c == '\f';

    if (is_plain_space) {
        putchar(c);
        return;
    }

    /* Printable ASCII other than the backslash needs no escaping. */
    if (c < 0x80 && c != '\\' && !iscntrl(c)) {
        putchar(c);
        return;
    }

    /* Everything left is escaped; the width depends on the magnitude. */
    if (c < 0x10000) {
        printf("\\u%04x", c);
    } else {
        printf("\\U%08x", c);
    }
} /* out_char */

/*
 * Decode UTF-8 from stdin and print it to stdout as plain ASCII.
 *
 * Each decoded code point is handed to out_char().  Input that does not
 * form a valid sequence produces one INCORRECT per offending unit:
 *   - a continuation byte (0b10xxxxxx) with no sequence in progress;
 *   - a lead byte that cannot start a sequence (0xf5..0xff);
 *   - a sequence interrupted by a non-continuation byte (the interrupting
 *     byte is then handled as the start of a new character);
 *   - a sequence cut short by end of input;
 *   - a four-byte sequence that decodes to a value above U+10FFFF.
 * Overlong encodings and surrogate code points are NOT rejected; they are
 * printed with whatever value they decode to.
 *
 * Returns 0 once end of input has been reached.
 */
int to_ascii()
{
    int c;
    int in    = 0;    /* code point bits collected so far */
    int state = 0x00; /* continuation bytes still expected; 0x00 is idle */

    while ((c = getchar()) != EOF) {

        if (state != 0x00) { /* inside a multi-byte sequence */
            if ((c & 0xc0) == 0x80) { /* 0b10xxxxxx, legal */
                in <<= 6;
                in |= c & 0x3f;
                if (--state == 0x00) { /* sequence complete */
                    /* four bytes can encode values past the Unicode range */
                    if (in > 0x10ffff) {
                        out_char(INCORRECT);
                    } else {
                        out_char(in);
                    }
                }
                continue;
            }
            /* sequence interrupted: report it, then treat c as a new start */
            out_char(INCORRECT);
            state = 0x00;
        }

        /* idle state: c must be ASCII or a lead byte */
        if (c >= 0xf5) { /* 0xf5..0xff never appear in UTF-8 */
            out_char(INCORRECT);
        } else if (c >= 0xf0) { /* 0b11110xxx, four bytes seq. */
            in = c & 0x07;
            state = 0x03;
        } else if (c >= 0xe0) { /* 0b1110xxxx, three bytes seq. */
            in = c & 0x0f;
            state = 0x02;
        } else if (c >= 0xc0) { /* 0b110xxxxx, two bytes seq. */
            in = c & 0x1f;
            state = 0x01;
        } else if (c >= 0x80) { /* 0b10xxxxxx, stray continuation */
            out_char(INCORRECT);
        } else { /* 0b0xxxxxxx */
            out_char(c);
        }
    } /* while */

    /* end of input in the middle of a sequence: do not drop it silently */
    if (state != 0x00) {
        out_char(INCORRECT);
    }

    return 0;
} /* to_ascii */

/* Entry point: convert stdin to its ASCII-escaped form on stdout. */
int main()
{
    (void)to_ascii();
    return 0;
}

On the given input, this program will output:

$ a.out <<EOF
> %gÁüĳȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
> EOF
%g\u00c1\u00fc\u0133\u0210\u02a8\u0398\u040b\u042e\u0466\u0497\u0518\u0554\u05d4\u06b3\u0636\u0923\u0ba3\u0e29\u0f12\u10a3\u1383\u1867\u1b10\u205c\u20aa\u2278\u263a\u26dc\u2e9f\u3080\u30f8\u34a6\u3892
$ _