I'm running R 4.4.0 on a MacBook, with nothing locale- or encoding-related in my .Rprofile or .Renviron.
In a fresh session, Sys.getlocale() returns
"en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8"
in both the native R console and RStudio.
KOI8-R is a Cyrillic encoding that uses one byte per character. When I use reprex from RStudio, this is my output, which conforms to my expectations (note: this uses the reprex addin, which runs reprex::reprex(), itself taking its default input code from the clipboard):
ch256 <- sapply(0:255, function(x) rawToChar(as.raw(x)))
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
#> [1] "ru_RU.KOI8-R"
ch256
#> [1] "" "\001" "\002" "\003" "\004" "\005" "\006" "\a" "\b" "\t"
#> [11] "\n" "\v" "\f" "\r" "\016" "\017" "\020" "\021" "\022" "\023"
#> [21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
#> [31] "\036" "\037" " " "!" "\"" "#" "$" "%" "&" "'"
#> [41] "(" ")" "*" "+" "," "-" "." "/" "0" "1"
#> [51] "2" "3" "4" "5" "6" "7" "8" "9" ":" ";"
#> [61] "<" "=" ">" "?" "@" "A" "B" "C" "D" "E"
#> [71] "F" "G" "H" "I" "J" "K" "L" "M" "N" "O"
#> [81] "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y"
#> [91] "Z" "[" "\\" "]" "^" "_" "`" "a" "b" "c"
#> [101] "d" "e" "f" "g" "h" "i" "j" "k" "l" "m"
#> [111] "n" "o" "p" "q" "r" "s" "t" "u" "v" "w"
#> [121] "x" "y" "z" "{" "|" "}" "~" "\177" "─" "│"
#> [131] "┌" "┐" "└" "┘" "├" "┤" "┬" "┴" "┼" "▀"
#> [141] "▄" "█" "▌" "▐" "░" "▒" "▓" "⌠" "■" "∙"
#> [151] "√" "≈" "≤" "≥" " " "⌡" "°" "²" "·" "÷"
#> [161] "═" "║" "╒" "ё" "╓" "╔" "╕" "╖" "╗" "╘"
#> [171] "╙" "╚" "╛" "╜" "╝" "╞" "╟" "╠" "╡" "Ё"
#> [181] "╢" "╣" "╤" "╥" "╦" "╧" "╨" "╩" "╪" "╫"
#> [191] "╬" "©" "ю" "а" "б" "ц" "д" "е" "ф" "г"
#> [201] "х" "и" "й" "к" "л" "м" "н" "о" "п" "я"
#> [211] "р" "с" "т" "у" "ж" "в" "ь" "ы" "з" "ш"
#> [221] "э" "щ" "ч" "ъ" "Ю" "А" "Б" "Ц" "Д" "Е"
#> [231] "Ф" "Г" "Х" "И" "Й" "К" "Л" "М" "Н" "О"
#> [241] "П" "Я" "Р" "С" "Т" "У" "Ж" "В" "Ь" "Ы"
#> [251] "З" "Ш" "Э" "Щ" "Ч" "Ъ"
However, running the same code in my RStudio console prints something different (fake reprex made by copying and pasting the output):
ch256 <- sapply(0:255, function(x) rawToChar(as.raw(x)))
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
ch256
#> [1] "" "\001" "\002" "\003" "\004" "\005" "\006" "\a" "\b" "\t"
#> [11] "\n" "\v" "\f" "\r" "\016" "\017" "\020" "\021" "\022" "\023"
#> [21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
#> [31] "\036" "\037" " " "!" "\"" "#" "$" "%" "&" "'"
#> [41] "(" ")" "*" "+" "," "-" "." "/" "0" "1"
#> [51] "2" "3" "4" "5" "6" "7" "8" "9" ":" ";"
#> [61] "<" "=" ">" "?" "@" "A" "B" "C" "D" "E"
#> [71] "F" "G" "H" "I" "J" "K" "L" "M" "N" "O"
#> [81] "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y"
#> [91] "Z" "[" "\\" "]" "^" "_" "`" "a" "b" "c"
#> [101] "d" "e" "f" "g" "h" "i" "j" "k" "l" "m"
#> [111] "n" "o" "p" "q" "r" "s" "t" "u" "v" "w"
#> [121] "x" "y" "z" "{" "|" "}" "~" "\177" "�" "�"
#> [131] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [141] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [151] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [161] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [171] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [181] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [191] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [201] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [211] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [221] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [231] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [241] "�" "�" "�" "�" "�" "�" "�" "�" "�" "�"
#> [251] "�" "�" "�" "�" "�" "�"
In the R for Mac OS X GUI (R.app) it's different again: the encoding appears to be ignored and Latin-1-looking characters are printed (fake reprex made by copying and pasting the output):
ch256 <- sapply(0:255, function(x) rawToChar(as.raw(x)))
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
#> [1] "ru_RU.KOI8-R"
ch256
#> [1] "" "\001" "\002" "\003" "\004" "\005" "\006" "\a" "\b" "\t"
#> [11] "\n" "\v" "\f" "\r" "\016" "\017" "\020" "\021" "\022" "\023"
#> [21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
#> [31] "\036" "\037" " " "!" "\"" "#" "$" "%" "&" "'"
#> [41] "(" ")" "*" "+" "," "-" "." "/" "0" "1"
#> [51] "2" "3" "4" "5" "6" "7" "8" "9" ":" ";"
#> [61] "<" "=" ">" "?" "@" "A" "B" "C" "D" "E"
#> [71] "F" "G" "H" "I" "J" "K" "L" "M" "N" "O"
#> [81] "P" "Q" "R" "S" "T" "U" "V" "W" "X" "Y"
#> [91] "Z" "[" "\\" "]" "^" "_" "`" "a" "b" "c"
#> [101] "d" "e" "f" "g" "h" "i" "j" "k" "l" "m"
#> [111] "n" "o" "p" "q" "r" "s" "t" "u" "v" "w"
#> [121] "x" "y" "z" "{" "|" "}" "~" "\177" "Ä" "Å"
#> [131] "Ç" "É" "Ñ" "Ö" "Ü" "á" "à" "â" "ä" "ã"
#> [141] "å" "ç" "é" "è" "ê" "ë" "í" "ì" "î" "ï"
#> [151] "ñ" "ó" "ò" "ô" "ö" "õ" "ú" "ù" "û" "ü"
#> [161] "†" "°" "¢" "£" "§" "•" "¶" "ß" "®" "�"
#> [171] "™" "´" "¨" "≠" "Æ" "Ø" "∞" "±" "≤" "≥"
#> [181] "¥" "µ" "∂" "∑" "∏" "π" "∫" "ª" "º" "Ω"
#> [191] "æ" "ø" "¿" "¡" "¬" "√" "ƒ" "≈" "∆" "«"
#> [201] "»" "…" " " "À" "Ã" "Õ" "Œ" "œ" "–" "—"
#> [211] "“" "”" "‘" "’" "÷" "◊" "ÿ" "Ÿ" "⁄" "€"
#> [221] "‹" "›" "fi" "fl" "‡" "·" "‚" "„" "‰" "Â"
#> [231] "Ê" "Á" "Ë" "È" "Í" "Î" "Ï" "Ì" "Ó" "Ô"
#> [241] "" "Ò" "Ú" "Û" "Ù" "ı" "ˆ" "˜" "¯" "˘"
#> [251] "˙" "˚" "¸" "˝" "˛" "ˇ"
In fact I can reproduce the above with the ISO8859-1 (latin1) encoding as well: this time the native R console prints the characters correctly, like reprex, but the RStudio output is still wrong.
I know that making everything UTF-8 fixes everything, but I really want to understand what is going on.
I'm not a macOS or locale expert by any means, but this issue seems to boil down to the documented limitations of Sys.setlocale (a simple wrapper around setlocale from the Standard C Library; see man setlocale). help("Sys.setlocale") says:
Attempts to change the character set (by Sys.setlocale("LC_CTYPE", ), if that implies a different character set) during a session may not work and are likely to lead to some confusion.
IIUC that is because the application embedding R, which handles the output stream, may not be written to honor changes to the character set made by the embedded R. So you really need to read the documentation of the application embedding R.
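As a quick illustration of that split (a sketch; it assumes the ru_RU.KOI8-R locale is installed on your system), you can compare what R itself believes after the switch with what the front end renders. Sys.setlocale() succeeds and l10n_info() reports a single-byte, non-UTF-8 charset, even though the front end may keep decoding the output stream as UTF-8:

```r
# R's own notion of the charset changes with LC_CTYPE, but the front
# end rendering R's output stream may not follow.
# (Sys.setlocale() returns "" with a warning if the locale is absent.)
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
Sys.getlocale("LC_CTYPE")  # "ru_RU.KOI8-R" if the switch worked
l10n_info()                # MBCS and UTF-8 now FALSE: R assumes 1 byte per char
```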
The R for macOS FAQ says:
By default R.APP uses UTF-8 for newly created documents and for the console. When opening new documents R.APP assumes UTF-8 and only if the document violates UTF-8 rules, it will try to fallback to legacy encoding, usually Mac Roman.
Indeed, your output from R.APP seems consistent with Mac OS Roman.
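One way to check that (a sketch; it assumes your iconv supports the "macintosh" encoding name for Mac OS Roman): decode a high byte as Mac Roman and compare with the R.app output above. Byte 0xC0, which KOI8-R maps to "ю", rendered as "¿" in your R.app session, and that is exactly its Mac Roman interpretation:

```r
# Decode byte 0xC0 under both encodings and compare.
# ("macintosh" is the usual iconv name for Mac OS Roman.)
b <- rawToChar(as.raw(0xC0))
iconv(b, from = "macintosh", to = "UTF-8")  # "¿", matching the R.app output
iconv(b, from = "KOI8-R",    to = "UTF-8")  # "ю", what KOI8-R intends
```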
This Posit Support article says:
If you call Sys.setlocale with "LC_CTYPE" or "LC_ALL" to change the system locale while RStudio is running, you may run into some minor issues as RStudio assumes the system encoding doesn't change.
suggesting that the character set used by the RStudio console is fixed at startup, based on the environment at that time. Well, if we dig around in the RStudio sources, we find that it effectively requires UTF-8 even if the environment indicates a different, macOS-supported character set. (And, on my macOS, locale -a indicates that KOI8-R is supported.)
That leaves Terminal.app, which I tend to use instead of R.app because I tend to want a shell. The encoding there can be set under Settings > Profiles > Advanced > International. If that is set to UTF-8, then we see output similar to RStudio's. But if it is set to KOI8-R, then we see the "expected" output for bytes 0 through 255. Nice.
To answer some of the remaining questions:
If you know that the source encoding is KOI8-R and that the system encoding is UTF-8, then use iconv to translate the strings to the system encoding instead of trying to change the character set to match the source encoding:
iconv(ch256, from = "KOI8-R", to = "UTF-8")
If you don't know that the system encoding is UTF-8, then you could try using to = l10n_info()[["codeset"]]. I'm not sure whether that is general or portable, though ...
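Putting that together, a minimal sketch: build the 256 single-byte strings, then translate them from KOI8-R to the session's native codeset as reported by l10n_info() (available in R >= 4.1), so that the front end receives bytes it can actually decode. (If the native codeset cannot represent a character, iconv returns NA for that element.)

```r
# Translate KOI8-R bytes to whatever the console actually renders,
# instead of switching LC_CTYPE.
ch256  <- sapply(0:255, function(x) rawToChar(as.raw(x)))
target <- l10n_info()[["codeset"]]               # e.g. "UTF-8" on modern macOS
out    <- iconv(ch256, from = "KOI8-R", to = target)
out[198]  # byte 0xC5, which is "е" in KOI8-R, now valid in the native encoding
```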
Why "�"?
Under section "Single-byte locales", help("print.default") says:
If a non-printable character is encountered during output, it is represented as one of the ANSI escape sequences (\a, \b, \f, \n, \r, \t, \v, \\ and \0: see Quotes), or failing that as a 3-digit octal code: for example the UK currency pound sign in the C locale (if implemented correctly) is printed as \243. Which characters are non-printable depends on the locale.
Under section "Unicode and other multi-byte locales", it says:
It is possible to have a character string in a character vector that is not valid in the current locale. If a byte is encountered that is not part of a valid character it is printed in hex in the form \xab and this is repeated until the start of a valid character. (This will rapidly recover from minor errors in UTF-8.)
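You can see that behaviour directly in a UTF-8 session (a sketch): a lone high byte is not valid UTF-8, so in such a session print() falls back to the hex form.

```r
# A stray KOI8-R byte is not valid UTF-8 ...
x <- rawToChar(as.raw(0xC5))
validUTF8(x)  # FALSE
# ... so in a UTF-8 locale, print(x) shows it in hex, as "\xc5".
print(x)
```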
You told R to use a single-byte encoding, namely KOI8-R. In that encoding, bytes 128 through 255 are printable characters, so print.default does not attempt to format them as 3-digit octal escapes. It leaves the original single bytes alone. But those bytes do not represent valid characters in the UTF-8 encoding used by the application embedding R, so they are ultimately rendered as the standard multi-byte replacement character "�". You do not see the hex "\xab" because (again) R thinks that you are using a single-byte encoding. It has no way of knowing that the application embedding R is actually using a multi-byte encoding, in which "\xab" would be more informative than "�".
Why does reprex produce "expected" output?
I don't really know. reprex uses rmarkdown to render output, and rmarkdown seems to use UTF-8 unconditionally. My guess is that somewhere in the reprex call stack the output containing bytes 128 through 255 is translated from KOI8-R to UTF-8. But how would rmarkdown know to translate from KOI8-R? Does it somehow record the encoding in use before the R subprocess terminates? The messages emitted by this augmented code block are suggestive ...
reprex::reprex({
Sys.setlocale("LC_CTYPE", "ru_RU.KOI8-R")
sapply(0:255, function(x) rawToChar(as.raw(x)))
Sys.setlocale("LC_CTYPE", "ru_RU.UTF-8")
sapply(0:255, function(x) rawToChar(as.raw(x)))
},
std_out_err = TRUE)
Quitting from lines at lines 20-24 [unnamed-chunk-2] (soot-cub_reprex.spin.Rmd)
Error in gsub("[\n]{2,}$", "\n", x) : input string 1 is invalid
In addition: Warning messages:
1: In grepl("^\\s*$", x) :
unable to translate ' [1] "" "\001" "\002" "\003" "\004" "\005" "\006" "\a" "\b" "\t"
[11] "\n" "\v" "\f" "\r" "\016" "\017" "\020" "\021" "\022" "\023"
[21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
[31] "\036" "\037" " " ...' to a wide string
2: In grepl("^\\s*$", x) : input string 1 is invalid
3: In gsub("[\n]{2,}$", "\n", x) :
unable to translate ' [1] "" "\001" "\002" "\003" "\004" "\005" "\006" "\a" "\b" "\t"
[11] "\n" "\v" "\f" "\r" "\016" "\017" "\020" "\021" "\022" "\023"
[21] "\024" "\025" "\026" "\027" "\030" "\031" "\032" "\033" "\034" "\035"
[31] "\036" "\037" " " ...' to a wide string
Maybe one of the functions in the stack should be passing useBytes = TRUE to grep and friends. Or maybe not. It would be nice to see the traceback ...
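For what it's worth, useBytes = TRUE does sidestep the translation step that those warnings complain about (a sketch; whether it would be the right fix inside reprex/rmarkdown is another question):

```r
# A string containing a KOI8-R high byte is invalid in a UTF-8 locale,
# so regex functions that translate input to wide strings can fail on it.
bad <- rawToChar(as.raw(c(0xC5, 0x0A)))  # KOI8-R "е" plus a newline
# With useBytes = TRUE the match is done byte-by-byte, with no translation:
grepl("\n$", bad, useBytes = TRUE)             # TRUE
gsub("[\n]{2,}$", "\n", bad, useBytes = TRUE)  # no error; input unchanged
```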