javascript utf-8 charset

How to convert a string into bytes in specific charset (encoding) using JavaScript?


I want to generate text bytes in various charsets (encodings), such as ISO-8859-1, Big5, UTF-8, UTF-16, etc., mostly for testing purposes (i.e. to make sure my app script can correctly handle the provided bytes in these charsets), which is almost like:

new TextEncoder(mycharset).encode(mystring)

Unfortunately, TextEncoder only supports converting a string into UTF-8 bytes, unlike TextDecoder, which supports converting bytes in various charsets into a string.

The behavior of TextEncoder is defined by the spec and is unlikely to change in the future... Does anyone know why the two classes behave inconsistently, and how to do the job (without manually providing a conversion table/map for all the target charsets)?

TO REVIEWER: This issue is NOT a duplicate of the related question, which asks why the parameter of TextEncoder does not take effect, and the answer says that TextEncoder does not take a parameter by the spec.

This question specifically asks:

  1. why the spec specified that TextEncoder not take a parameter, which is inconsistent with TextDecoder
  2. HOW to perform the conversion as this question asked

Neither is asked or explained in the related question and answer.


Solution

  • Thanks to @Kaiido for the idea. After some tests I finally found a way to do it natively:

    /**
     * Encode a string into bytes in the specified charset.
     *
     * UTF-8 and UTF-16 are encoded directly. Every other charset is encoded by
     * loading a temporary <iframe> document declared in that charset and letting
     * the URL parser percent-encode the text in the document's encoding, so
     * those charsets require a browser DOM.
     *
     * @param {string} str - the string to encode
     * @param {string} [charset=UTF-8] - the target charset to encode into; any
     *     label accepted by TextDecoder (case-insensitive, aliases allowed)
     * @param {*} [replacement] - the replacement for a non-encodable char. Any
     *     string is used verbatim (an empty string drops the char) and should
     *     consist of ASCII chars only, as each char becomes one output byte.
     *     Any non-string value (the default) makes a non-encodable char an error.
     * @return {Promise<Uint8Array>} The encoded bytes.
     * @throws {RangeError} (as a rejection) if the charset is not supported, or
     *     if a char cannot be encoded and no replacement string was given.
     */
    var encodeText = (() => {
      // Internal signal carrying the code point the target charset cannot encode.
      class UnencodableCharError extends Error {
        constructor(codePoint) {
          super(`Unencodable code point ${codePoint}`);
          this.name = 'UnencodableCharError';
          this.codePoint = codePoint;
        }
      }

      // Rewrite every code point as a decimal numeric character reference so the
      // generated markup stays pure ASCII whatever charset the frame is decoded in.
      function toNumericCharRefs(str) {
        const refs = [];
        for (const ch of str) {
          refs.push(`&#${ch.codePointAt(0)};`);
        }
        return refs.join('');
      }

      // Turn the percent-encoded query reported by the frame into a byte string
      // (one char per byte).
      function decodeQuery(query, replacement) {
        return unescape(query).replace(/&#(?:(\d+)|x([\dA-Fa-f]+));/g, (_, dec, hex) => {
          if (hex) {
            // Hex references are only produced by the in-frame script, for the
            // ASCII chars it had to protect from the URL parser.
            return String.fromCharCode(parseInt(hex, 16));
          }
          // Decimal references are what URL encoding emits for a char that the
          // document charset cannot represent.
          if (typeof replacement === 'string') {
            return replacement;
          }
          throw new UnencodableCharError(parseInt(dec, 10));
        });
      }

      function byteStringToU8Array(bstr) {
        const bytes = new Uint8Array(bstr.length);
        for (let i = 0; i < bstr.length; i++) {
          bytes[i] = bstr.charCodeAt(i);
        }
        return bytes;
      }

      async function encodeText(str, charset = "UTF-8", replacement = null) {
        // Resolve the label to its canonical encoding name. This both validates
        // the charset and makes aliases / padded / mixed-case labels take the
        // same path as the canonical name.
        let encoding;
        try {
          encoding = new TextDecoder(charset).encoding;
        } catch (ex) {
          throw new RangeError(`Specified charset "${charset}" is not supported.`, {cause: ex});
        }

        // Unicode transformations are handled here: a UTF-16 document would
        // URL-encode as UTF-8, so the frame trick cannot produce UTF-16 bytes.
        if (encoding === 'utf-8') {
          return new TextEncoder().encode(str);
        }
        if (encoding === 'utf-16le' || encoding === 'utf-16be') {
          const littleEndian = encoding === 'utf-16le';
          const u8ar = new Uint8Array(str.length * 2);
          const view = new DataView(u8ar.buffer);
          for (let i = 0; i < str.length; i++) {
            view.setUint16(i * 2, str.charCodeAt(i), littleEndian);
          }
          return u8ar;
        }

        // The in-frame script protects "&", "#", "%" (so they cannot be confused
        // with the references decoded later) and U+0000-U+0020 (the URL parser
        // drops tabs/newlines and trims trailing controls/spaces).
        const markup = `<!DOCTYPE html><script data-text="${toNumericCharRefs(str)}">
    function escapeHtml(str) {
      return str.replace(/[&#%\\x00-\\x20]/g, m => escape("&#x" + m.charCodeAt(0).toString(16) + ";"));
    }
    const text = escapeHtml(document.currentScript.dataset.text);
    const a = document.createElement("a");
    a.href = "https://example.com/?" + text;
    parent.postMessage(a.search.slice(1), "*");
    <\/script>`;
        const blob = new Blob([markup], {type: `text/html;charset=${encoding}`});
        const blobUrl = URL.createObjectURL(blob);
        const frame = document.createElement("iframe");
        const aborter = new AbortController();
        let query;
        try {
          // Listen before the frame starts loading so the reply cannot be missed.
          const reply = new Promise((resolve) => {
            addEventListener("message", ({source, data}) => {
              if (source === frame.contentWindow) {
                resolve(data);
              }
            }, {signal: aborter.signal});
          });
          frame.src = blobUrl;
          document.body.append(frame);
          query = await reply;
        } finally {
          // Always release the listener, the frame and the blob URL.
          aborter.abort();
          frame.remove();
          URL.revokeObjectURL(blobUrl);
        }

        try {
          return byteStringToU8Array(decodeQuery(query, replacement));
        } catch (ex) {
          if (!(ex instanceof UnencodableCharError)) {
            throw ex;
          }
          const hexCode = ex.codePoint.toString(16).toUpperCase();
          const idx = str.indexOf(String.fromCodePoint(ex.codePoint));
          throw new RangeError(`Unable to encode char U+${hexCode} at position ${idx}`);
        }
      }

      return encodeText;
    })();
    
    // tests
    (async () => {
      const sample = "中文𠀀";

      // Calls expected to succeed: [text, charset, extra arguments].
      // Each one logs the charset together with the resulting bytes.
      const succeeding = [
        [sample, "big5", ['?']],
        [sample, "shift_jis", ['']],
        [sample, "utf-8", []],
        [sample, "utf-16be", []],
        [sample, "utf-16le", []],
        // check special chars are passed safely
        ["&#123;<>%20", "big5", []],
      ];
      for (const [text, charset, extra] of succeeding) {
        const bytes = await encodeText(text, charset, ...extra);
        console.log(charset, bytes);
      }

      // Calls expected to reject: the error is reported instead of propagated.
      const rejecting = [
        // throw an error for a bad charset
        [sample, "wtf"],
        // throw an error if no replacement string
        [sample, "big5"],
      ];
      for (const [text, charset] of rejecting) {
        try {
          await encodeText(text, charset);
        } catch (ex) {
          console.error(ex);
        }
      }
    })();