I'm trying to figure out how to decode some corrupt characters I have in a spreadsheet. There is a list of website titles: some in English, some in Greek, some in other languages. For example, Greek phrase ΕΛΛΗΝΙΚΑ ΝΕΑ ΤΩΡΑ
shows as ΕΛΛΗΝΙΚΑ ΝΕΑ ΤΩΡΑ
. So the whitespaces are OK, but the actual letters gone all wrong.
I have noticed that letters got converted to pairs of symbols:
Ε
- Ε
Λ
- Λ
And so on. So it's almost always Œ
and then some other symbol after it.
I went further, removed the repeated letter and checked difference in ASCII codes of the actual phrase and what was left of the corrupted phrase: ord('ï') - ord('Ε')
and so on. The difference is almost the same all the time: `
678
678
678
676
676
677
676
678
0 (this is a whitespace)
676
678
678
0 (this is a whitespace)
765
768
753
678
I have manually decoded some of the other letters from other titles:
Greek
Œë Α
Œî Δ
Œï Ε
Œõ Λ
Œó Η
Œô Ι
Œö Κ
Œù Ν
Œ° Ρ
Œ§ Τ
Œ© Ω
Œµ ε
Œª λ
œÑ τ
ŒØ ί
Œø ο
œÑ τ
œâ ω
ŒΩ ν
Symbols
‚Äò ‘
‚Äô ’
‚Ķ …
‚Ć †
‚Äú “
Other
√© é
It's good I have a translation for this phrase, but there are a couple of others I don't have translation for. I would be glad to see any kind of advice because searching around StackOverflow didn't show me anything related.
It's a character encoding issue. The string appears to be in encoding Mac OS Roman (figured it out by educated guesses on this site). The IANA code for this encoding is macintosh
, and its Windows code page number is 100000.
Here's a Python function that will decode macintosh
to utf-8
strings:
def macToUtf8(s):
return bytes(s, 'macintosh').decode('utf-8')
print(macToUtf8('ΕΛΛΗΝΙΚΑ ΝΕΑ ΤΩΡΑ'))
# outputs: ΕΛΛΗΝΙΚΑ ΝΕΑ ΤΩΡΑ
My best guess is that your spreadsheet was saved on a Mac Computer, or perhaps saved using some Macintosh-based setting.
See also this issue: What encoding does MAC Excel use?
OP's comment mentioned Google Sheets, so it's possible others may have the same issue. Unfortunately, it does not look like Sheets supports anything other than ASCII and UTF-8, so to translate from Macintosh to UTF-8, we have to make a translation table. I did this with Python. This prints a JS object literal we can use for this purpose:
def formatUnicode(num):
return f"\\u{num:0{4}x}"
def printUtf8ByteDecodeTable(encoding):
print("{")
for i in range(256):
utf8Hex = formatUnicode(ord(i.to_bytes(1, byteorder='big').decode(encoding)))
hexByte = hex(i)
print(f" '{utf8Hex}': {hexByte},")
print("}")
printUtf8ByteDecodeTable('macintosh')
Once we have the lookup table, we can write the script:
function macToUtf8(str) {
const decodeTable = {
'\u0000': 0x0,
'\u0001': 0x1,
'\u0002': 0x2,
'\u0003': 0x3,
'\u0004': 0x4,
'\u0005': 0x5,
'\u0006': 0x6,
'\u0007': 0x7,
'\u0008': 0x8,
'\u0009': 0x9,
'\u000a': 0xa,
'\u000b': 0xb,
'\u000c': 0xc,
'\u000d': 0xd,
'\u000e': 0xe,
'\u000f': 0xf,
'\u0010': 0x10,
'\u0011': 0x11,
'\u0012': 0x12,
'\u0013': 0x13,
'\u0014': 0x14,
'\u0015': 0x15,
'\u0016': 0x16,
'\u0017': 0x17,
'\u0018': 0x18,
'\u0019': 0x19,
'\u001a': 0x1a,
'\u001b': 0x1b,
'\u001c': 0x1c,
'\u001d': 0x1d,
'\u001e': 0x1e,
'\u001f': 0x1f,
'\u0020': 0x20,
'\u0021': 0x21,
'\u0022': 0x22,
'\u0023': 0x23,
'\u0024': 0x24,
'\u0025': 0x25,
'\u0026': 0x26,
'\u0027': 0x27,
'\u0028': 0x28,
'\u0029': 0x29,
'\u002a': 0x2a,
'\u002b': 0x2b,
'\u002c': 0x2c,
'\u002d': 0x2d,
'\u002e': 0x2e,
'\u002f': 0x2f,
'\u0030': 0x30,
'\u0031': 0x31,
'\u0032': 0x32,
'\u0033': 0x33,
'\u0034': 0x34,
'\u0035': 0x35,
'\u0036': 0x36,
'\u0037': 0x37,
'\u0038': 0x38,
'\u0039': 0x39,
'\u003a': 0x3a,
'\u003b': 0x3b,
'\u003c': 0x3c,
'\u003d': 0x3d,
'\u003e': 0x3e,
'\u003f': 0x3f,
'\u0040': 0x40,
'\u0041': 0x41,
'\u0042': 0x42,
'\u0043': 0x43,
'\u0044': 0x44,
'\u0045': 0x45,
'\u0046': 0x46,
'\u0047': 0x47,
'\u0048': 0x48,
'\u0049': 0x49,
'\u004a': 0x4a,
'\u004b': 0x4b,
'\u004c': 0x4c,
'\u004d': 0x4d,
'\u004e': 0x4e,
'\u004f': 0x4f,
'\u0050': 0x50,
'\u0051': 0x51,
'\u0052': 0x52,
'\u0053': 0x53,
'\u0054': 0x54,
'\u0055': 0x55,
'\u0056': 0x56,
'\u0057': 0x57,
'\u0058': 0x58,
'\u0059': 0x59,
'\u005a': 0x5a,
'\u005b': 0x5b,
'\u005c': 0x5c,
'\u005d': 0x5d,
'\u005e': 0x5e,
'\u005f': 0x5f,
'\u0060': 0x60,
'\u0061': 0x61,
'\u0062': 0x62,
'\u0063': 0x63,
'\u0064': 0x64,
'\u0065': 0x65,
'\u0066': 0x66,
'\u0067': 0x67,
'\u0068': 0x68,
'\u0069': 0x69,
'\u006a': 0x6a,
'\u006b': 0x6b,
'\u006c': 0x6c,
'\u006d': 0x6d,
'\u006e': 0x6e,
'\u006f': 0x6f,
'\u0070': 0x70,
'\u0071': 0x71,
'\u0072': 0x72,
'\u0073': 0x73,
'\u0074': 0x74,
'\u0075': 0x75,
'\u0076': 0x76,
'\u0077': 0x77,
'\u0078': 0x78,
'\u0079': 0x79,
'\u007a': 0x7a,
'\u007b': 0x7b,
'\u007c': 0x7c,
'\u007d': 0x7d,
'\u007e': 0x7e,
'\u007f': 0x7f,
'\u00c4': 0x80,
'\u00c5': 0x81,
'\u00c7': 0x82,
'\u00c9': 0x83,
'\u00d1': 0x84,
'\u00d6': 0x85,
'\u00dc': 0x86,
'\u00e1': 0x87,
'\u00e0': 0x88,
'\u00e2': 0x89,
'\u00e4': 0x8a,
'\u00e3': 0x8b,
'\u00e5': 0x8c,
'\u00e7': 0x8d,
'\u00e9': 0x8e,
'\u00e8': 0x8f,
'\u00ea': 0x90,
'\u00eb': 0x91,
'\u00ed': 0x92,
'\u00ec': 0x93,
'\u00ee': 0x94,
'\u00ef': 0x95,
'\u00f1': 0x96,
'\u00f3': 0x97,
'\u00f2': 0x98,
'\u00f4': 0x99,
'\u00f6': 0x9a,
'\u00f5': 0x9b,
'\u00fa': 0x9c,
'\u00f9': 0x9d,
'\u00fb': 0x9e,
'\u00fc': 0x9f,
'\u2020': 0xa0,
'\u00b0': 0xa1,
'\u00a2': 0xa2,
'\u00a3': 0xa3,
'\u00a7': 0xa4,
'\u2022': 0xa5,
'\u00b6': 0xa6,
'\u00df': 0xa7,
'\u00ae': 0xa8,
'\u00a9': 0xa9,
'\u2122': 0xaa,
'\u00b4': 0xab,
'\u00a8': 0xac,
'\u2260': 0xad,
'\u00c6': 0xae,
'\u00d8': 0xaf,
'\u221e': 0xb0,
'\u00b1': 0xb1,
'\u2264': 0xb2,
'\u2265': 0xb3,
'\u00a5': 0xb4,
'\u00b5': 0xb5,
'\u2202': 0xb6,
'\u2211': 0xb7,
'\u220f': 0xb8,
'\u03c0': 0xb9,
'\u222b': 0xba,
'\u00aa': 0xbb,
'\u00ba': 0xbc,
'\u03a9': 0xbd,
'\u00e6': 0xbe,
'\u00f8': 0xbf,
'\u00bf': 0xc0,
'\u00a1': 0xc1,
'\u00ac': 0xc2,
'\u221a': 0xc3,
'\u0192': 0xc4,
'\u2248': 0xc5,
'\u2206': 0xc6,
'\u00ab': 0xc7,
'\u00bb': 0xc8,
'\u2026': 0xc9,
'\u00a0': 0xca,
'\u00c0': 0xcb,
'\u00c3': 0xcc,
'\u00d5': 0xcd,
'\u0152': 0xce,
'\u0153': 0xcf,
'\u2013': 0xd0,
'\u2014': 0xd1,
'\u201c': 0xd2,
'\u201d': 0xd3,
'\u2018': 0xd4,
'\u2019': 0xd5,
'\u00f7': 0xd6,
'\u25ca': 0xd7,
'\u00ff': 0xd8,
'\u0178': 0xd9,
'\u2044': 0xda,
'\u20ac': 0xdb,
'\u2039': 0xdc,
'\u203a': 0xdd,
'\ufb01': 0xde,
'\ufb02': 0xdf,
'\u2021': 0xe0,
'\u00b7': 0xe1,
'\u201a': 0xe2,
'\u201e': 0xe3,
'\u2030': 0xe4,
'\u00c2': 0xe5,
'\u00ca': 0xe6,
'\u00c1': 0xe7,
'\u00cb': 0xe8,
'\u00c8': 0xe9,
'\u00cd': 0xea,
'\u00ce': 0xeb,
'\u00cf': 0xec,
'\u00cc': 0xed,
'\u00d3': 0xee,
'\u00d4': 0xef,
'\uf8ff': 0xf0,
'\u00d2': 0xf1,
'\u00da': 0xf2,
'\u00db': 0xf3,
'\u00d9': 0xf4,
'\u0131': 0xf5,
'\u02c6': 0xf6,
'\u02dc': 0xf7,
'\u00af': 0xf8,
'\u02d8': 0xf9,
'\u02d9': 0xfa,
'\u02da': 0xfb,
'\u00b8': 0xfc,
'\u02dd': 0xfd,
'\u02db': 0xfe,
'\u02c7': 0xff,
};
// Translate
const bytes = [];
for (const c of str) {
bytes.push(decodeTable[c]);
}
// Apps script does not have the WHATWG standard TextDecoder class
return Utilities.newBlob("").setBytes(bytes).getDataAsString('utf-8');
}
For Node/browser, the equivalent of the last line is
return new TextDecoder().decode(new Uint8Array(bytes));