I have created a text file containing the following characters to test UTF-8 encoding:
%gÁüijȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
I have also written this C program to open the file and read it:
#pragma warning(disable:4996)
#include <stdio.h>
#include <stdlib.h>
/*
 * Read the first line of "inputtest.txt" and echo it to standard output.
 *
 * The file is opened as an ordinary byte stream. UTF-8 is a byte-oriented
 * encoding, so the narrow char functions can carry the line through
 * unchanged. NOTE(review): the MSVC-specific "ccs=UTF-8" mode flag puts the
 * stream in Unicode mode, which is meant for the wide-character functions
 * (fgetws etc.); combining it with fgets is the likely cause of the reported
 * crash, so the flag is dropped here. The console must be using a UTF-8
 * code page for the bytes to be displayed as the intended characters.
 *
 * Returns EXIT_SUCCESS, or EXIT_FAILURE if the file cannot be opened or a
 * read error occurs.
 */
int main() {
    /* Most characters of the test line occupy two or three bytes in UTF-8,
     * so the buffer is sized well above the visible character count. */
    char input[256];
    int status = EXIT_SUCCESS;
    FILE *ptr = fopen("inputtest.txt", "r");

    if (ptr == NULL) {
        perror("Error opening file");
        return EXIT_FAILURE;
    }

    if (fgets(input, sizeof input, ptr) != NULL) {
        /* Never pass file data as a printf format string: the test line
         * starts with "%g", which printf would treat as a conversion.
         * fputs writes the bytes verbatim. */
        fputs(input, stdout);
    } else if (ferror(ptr)) {
        /* The buffer is left untouched on failure, so nothing is printed. */
        perror("Error reading file");
        status = EXIT_FAILURE;
    }

    fclose(ptr);
    return status;
}
If I don't use `ccs=UTF-8`, I get some unreadable characters. But with it, the program crashes with exit code -1073740791. Also, after switching to `wchar_t` and `fgetws`, the program's output was just `%`.
Note: I am using Windows 11 and Visual Studio 2022, and I need to read multi-language characters.
Consider using `fgetws(3)` instead, and calling `setlocale(3)` before that. With single-byte `char` input you are limited to ASCII, or at most to a single-byte character set. And, of course, use `wchar_t` characters instead of `char`.
But if you use UTF-8 encoding, everything is read as bytes and can be printed as bytes. You can read and write those bytes without interpreting them (unless, of course, you want to interpret them):
#include <stdio.h>
/*
 * Copy standard input to standard output one byte at a time until end of
 * input. No byte is inspected or altered, so multi-byte UTF-8 sequences
 * pass through exactly as they were read.
 */
int main()
{
    for (;;) {
        int byte = getchar();

        if (byte == EOF)
            break;
        putchar(byte);
    }
    return 0;
}
should work:
$ a.out <<EOF
> %gÁüijȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
> EOF
%gÁüijȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
$ _
A better approach (one that converts its UTF-8 input to plain ASCII by escaping every non-ASCII character) is shown below:
#include <ctype.h>
#include <stdio.h>
#define INCORRECT 0xfffd
/*
 * Write one code point to standard output in ASCII form.
 *
 * Newline, carriage return, tab, backspace and form feed are written
 * unchanged, as is every other ASCII character for which iscntrl() is
 * false, except the backslash. Everything else is written as an escape:
 * \uXXXX (four lowercase hex digits) for values below 0x10000, and
 * \UXXXXXXXX (eight lowercase hex digits) for larger values. The backslash
 * is escaped as well, so a backslash in the output always starts an escape.
 */
void out_char(unsigned c)
{
    int verbatim;

    if (c == '\n' || c == '\r' || c == '\t' || c == '\b' || c == '\f') {
        verbatim = 1;               /* whitespace-like controls kept as-is */
    } else if (c == '\\') {
        verbatim = 0;               /* the escape introducer is itself escaped */
    } else {
        /* iscntrl() is only consulted for ASCII values */
        verbatim = (c < 0x80) && !iscntrl(c);
    }

    if (verbatim)
        putchar(c);
    else if (c < 0x10000)
        printf("\\u%04x", c);
    else
        printf("\\U%08x", c);
} /* out_char */
/*
 * Decode UTF-8 from standard input and write each decoded code point to
 * standard output through out_char().
 *
 * Sequences of one to four bytes are accepted. INCORRECT (U+FFFD) is
 * emitted in place of anything that is not well-formed UTF-8:
 *   - a continuation byte (0b10xxxxxx) with no sequence in progress;
 *   - a sequence cut short by a non-continuation byte or by end of input
 *     (the interrupting byte is then processed normally);
 *   - the bytes 0xf8..0xff, which never occur in UTF-8;
 *   - overlong encodings, UTF-16 surrogates (U+D800..U+DFFF) and values
 *     above U+10FFFF.
 *
 * Returns 0 when end of input was reached, -1 if reading stdin failed.
 */
int to_ascii(void)
{
    int c;
    unsigned in = 0;     /* code point assembled so far */
    unsigned lowest = 0; /* smallest value legal for the current sequence length */
    int pending = 0;     /* continuation bytes still expected; 0 means idle */

    while ((c = getchar()) != EOF) {
        if ((c & 0xc0) == 0x80) { /* 0b10xxxxxx, continuation byte */
            if (pending == 0) {
                out_char(INCORRECT); /* nothing to continue */
                continue;
            }
            in = (in << 6) | ((unsigned)c & 0x3f);
            if (--pending == 0) {
                /* Sequence complete: reject values that must not be
                 * encoded this way before handing the result on. */
                if (in < lowest || in > 0x10ffff
                        || (in >= 0xd800 && in <= 0xdfff)) {
                    out_char(INCORRECT);
                } else {
                    out_char(in);
                }
            }
            continue;
        }

        /* Any other byte ends an unfinished sequence prematurely. */
        if (pending != 0) {
            out_char(INCORRECT);
            pending = 0;
        }

        if (c < 0x80) {         /* 0b0xxxxxxx, plain ASCII */
            out_char((unsigned)c);
        } else if (c >= 0xf8) { /* 0b11111xxx, never valid */
            out_char(INCORRECT);
        } else if (c >= 0xf0) { /* 0b11110xxx, four bytes seq. */
            in = (unsigned)c & 0x07;
            lowest = 0x10000;
            pending = 3;
        } else if (c >= 0xe0) { /* 0b1110xxxx, three bytes seq. */
            in = (unsigned)c & 0x0f;
            lowest = 0x800;
            pending = 2;
        } else {                /* 0b110xxxxx, two bytes seq. */
            in = (unsigned)c & 0x1f;
            lowest = 0x80;
            pending = 1;
        }
    } /* while */

    /* Input ended in the middle of a sequence. */
    if (pending != 0)
        out_char(INCORRECT);

    return ferror(stdin) ? -1 : 0;
} /* to_ascii */
/*
 * Program entry point: run the stdin-to-stdout conversion performed by
 * to_ascii() and exit with status 0.
 */
int main()
{
    to_ascii();
    return 0;
}
that, on the given input, will output:
$ a.out <<EOF
> %gÁüijȐʨΘЋЮѦҗԘՔהڳضणணษ༒Ⴃᎃᡧᬐ⁜₪≸☺⛜⺟むヸ㒦㢒
> EOF
%g\u00c1\u00fc\u0133\u0210\u02a8\u0398\u040b\u042e\u0466\u0497\u0518\u0554\u05d4\u06b3\u0636\u0923\u0ba3\u0e29\u0f12\u10a3\u1383\u1867\u1b10\u205c\u20aa\u2278\u263a\u26dc\u2e9f\u3080\u30f8\u34a6\u3892
$ _