c utf-8 fopen

Reading UTF-8 encoded files with fopen in C


I have created a text file with the following characters for testing UTF-8 encoding:

%gÁüijȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒

I have also written this program in C to open the file and read it:

#pragma warning(disable:4996)

#include <stdio.h>
#include <stdlib.h>

/*
 * Repro program from the question: opens inputtest.txt, reads at most one
 * line (up to 49 bytes plus the terminator) into a char buffer and prints it.
 */
int main() {
    FILE *ptr;
    /* "ccs=UTF-8" is not part of the standard C fopen() mode syntax.
       NOTE(review): this looks like the Microsoft CRT encoding flag; confirm
       in its documentation whether byte-oriented reads such as fgets() are
       allowed on a stream opened this way. */
    ptr = fopen("inputtest.txt", "r, ccs=UTF-8");
    char input[50];
    if (ptr == NULL)
        perror("Error opening file");
    else {
        if (fgets(input, 50, ptr) != NULL) {
            puts(input);
        }
        /* NOTE(review): this call runs even when fgets() returned NULL, in
           which case `input` was never written.  It also passes the file's
           text as the format string, and the test text begins with "%g",
           which printf() treats as a conversion with no matching argument. */
        printf(input);
        fclose(ptr);
    }
}

If I don't use ccs=UTF-8, I get some unreadable characters. But with it, the program crashes with exit code -1073740791. Also, after switching to wchar_t and fgetws, the program's output was just %. Note: I am using Windows 11 and Visual Studio 2022, and I need to input multi-language characters.


Solution

  • Consider using fgetws(3) instead, and call setlocale(3) before it. With the single-byte char functions, each character is limited to ASCII or, at most, one byte. And of course, use wchar_t instead of char for the buffer.

    But if the file is UTF-8 encoded, its bytes are simply read as bytes and can be printed as bytes. You can read and write them without interpreting them (unless, of course, you want to interpret them):

    #include <stdio.h>
    /*
     * Copy standard input to standard output one byte at a time.
     * The bytes are never interpreted, so multi-byte UTF-8 sequences pass
     * through unchanged.
     */
    int main(void)
    {
        for (;;) {
            int byte = getchar();

            if (byte == EOF)
                break;
            fputc(byte, stdout);
        }
        return 0;
    }
    

    should work:

    $ a.out <<EOF
    > %gÁüijȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
    > EOF
    %gÁüijȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
    $ _
    

    A better approach (to asciify its utf8 input) is shown below:

    #include <ctype.h>
    #include <stdio.h>
    
    #define INCORRECT 0xfffd
    
    /*
     * Write one code point to stdout using ASCII characters only.
     *
     *   - newline, carriage return, tab, backspace and form feed are written
     *     as they are;
     *   - a backslash, any other ASCII control character and every value in
     *     0x80..0xffff is written as \uXXXX (four lowercase hex digits);
     *   - values of 0x10000 and above are written as \UXXXXXXXX (eight digits);
     *   - every remaining ASCII character is written as it is.
     */
    void out_char(unsigned c)
    {
        /* Layout characters go out verbatim so line structure is preserved. */
        if (c == '\n' || c == '\r' || c == '\t' || c == '\b' || c == '\f') {
            putchar(c);
            return;
        }

        if (c >= 0x10000) {
            printf("\\U%08x", c);
        } else if (c >= 0x80 || c == '\\' || iscntrl(c)) {
            /* iscntrl() is only reached for values below 0x80. */
            printf("\\u%04x", c);
        } else {
            putchar(c);
        }
    } /* out_char */
    
    /*
     * Decode UTF-8 from stdin and pass every decoded code point to out_char().
     *
     * One- to four-byte sequences are decoded (U+0000..U+10FFFF).  Each
     * malformed piece of input is reported as INCORRECT instead:
     *   - a stray continuation byte (0x80..0xbf) or a byte that can never
     *     start a sequence (0xf8..0xff);
     *   - a sequence cut short by a non-continuation byte or by end of input;
     *     the interrupting byte is then decoded as the start of new input;
     *   - a complete sequence that is an overlong encoding, a UTF-16
     *     surrogate (U+D800..U+DFFF) or a value above U+10FFFF.
     *
     * Returns 0 once end of input has been reached.
     */
    int to_ascii(void)
    {
        unsigned cp      = 0; /* code point being assembled */
        unsigned min_cp  = 0; /* smallest value the current length may encode */
        int      pending = 0; /* continuation bytes still expected */
        int      c;

        while ((c = getchar()) != EOF) {
            if (pending > 0) {
                if ((c & 0xc0) == 0x80) { /* 0b10xxxxxx, continuation */
                    cp = (cp << 6) | (unsigned)(c & 0x3f);
                    if (--pending == 0) {
                        int malformed = cp < min_cp                     /* overlong */
                                     || (cp >= 0xd800 && cp <= 0xdfff)  /* surrogate */
                                     || cp > 0x10ffff;                  /* out of range */

                        out_char(malformed ? INCORRECT : cp);
                    }
                    continue;
                }
                /* The sequence was cut short: report it, then fall through so
                 * that this byte is handled as the start of new input. */
                out_char(INCORRECT);
                pending = 0;
            }

            if (c < 0x80) {        /* 0b0xxxxxxx, plain ASCII */
                out_char((unsigned)c);
            } else if (c < 0xc0) { /* 0b10xxxxxx, stray continuation */
                out_char(INCORRECT);
            } else if (c < 0xe0) { /* 0b110xxxxx, two-byte sequence */
                cp      = (unsigned)(c & 0x1f);
                min_cp  = 0x80;
                pending = 1;
            } else if (c < 0xf0) { /* 0b1110xxxx, three-byte sequence */
                cp      = (unsigned)(c & 0x0f);
                min_cp  = 0x800;
                pending = 2;
            } else if (c < 0xf8) { /* 0b11110xxx, four-byte sequence */
                cp      = (unsigned)(c & 0x07);
                min_cp  = 0x10000;
                pending = 3;
            } else {               /* 0b11111xxx, never valid in UTF-8 */
                out_char(INCORRECT);
            }
        } /* while */

        /* Input ended in the middle of a sequence. */
        if (pending > 0)
            out_char(INCORRECT);

        return 0;
    } /* to_ascii */
    
    /* Entry point: run the UTF-8 to ASCII filter over stdin, then exit with
     * status 0.  The filter's return value is not used. */
    int main(void)
    {
        (void)to_ascii();
        return 0;
    }
    

    that, on the given input, will output:

    $ a.out <<EOF
    > %gÁüijȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
    > EOF
    %g\u00c1\u00fc\u0133\u0210\u02a8\u0398\u040b\u042e\u0466\u0497\u0518\u0554\u05d4\u06b3\u0636\u0923\u0ba3\u0e29\u0f12\u10a3\u1383\u1867\u1b10\u205c\u20aa\u2278\u263a\u26dc\u2e9f\u3080\u30f8\u34a6\u3892
    $ _