Amazon's Mechanical Turk rejects CSV HIT files that contain 4-byte UTF-8 characters - such as Emoji. However, Emoji characters are an integral part of the worker tasks and I need to keep them.
I found the script at https://github.com/charman/mturk-emoji, which replaces the Emoji characters with equivalent HTML spans (e.g., `<span class="emoji-bytes" data-emoji-bytes="[240, 159, 152, 128]"></span>`). However, when feeding the preprocessed CSV to MTurk, the Emoji characters are not rendered.
I managed to solve the problem following these steps.
Convert the CSV containing UTF-8 Emoji using the script encode_emoji.py
in the linked GitHub repo. You get, say, sample_with_emoji.csv
.
In Mechanical Turk, edit your current project and go to Design Layout
. In order for the HTML span
with the emoji bytes to be properly rendered, you need to add the following code at the beginning in the HTML Editor of MTurk:
<script src="https://code.jquery.com/jquery-3.3.1.js"
integrity="sha256-2Kok7MbOyxpgUVvAk/HJ2jigOSYS2auK4Pfzbm7uH60="
crossorigin="anonymous"></script>
<script>
/**
 * jQuery `.each()` callback: decodes the JSON byte array stored in the
 * `data-emoji-bytes` attribute of `this` and sets the result as the
 * element's text.
 *
 * An element whose attribute is missing or is not valid JSON is left
 * untouched. The function always returns `undefined`, so one bad element
 * neither throws out of the `.each()` loop nor stops it early.
 */
function displayEmoji() {
  /**
   * utf8ByteArrayToString() copied from:
   * https://github.com/google/closure-library/blob/e877b1eac410c0d842bcda118689759512e0e26f/closure/goog/crypt/crypt.js
   *
   * Converts a UTF-8 byte array to JavaScript's 16-bit Unicode.
   * @param {Uint8Array|Array<number>} bytes UTF-8 byte array.
   * @return {string} 16-bit Unicode string.
   */
  var utf8ByteArrayToString = function(bytes) {
    var out = [], pos = 0, c = 0;
    while (pos < bytes.length) {
      var c1 = bytes[pos++];
      if (c1 < 128) {
        // Single byte: the value is the code unit.
        out[c++] = String.fromCharCode(c1);
      } else if (c1 > 191 && c1 < 224) {
        // Two-byte sequence: 5 payload bits + 6 payload bits.
        var c2 = bytes[pos++];
        out[c++] = String.fromCharCode((c1 & 31) << 6 | c2 & 63);
      } else if (c1 > 239 && c1 < 365) {
        // Four-byte sequence -> surrogate pair. The 365 upper bound is kept
        // verbatim from the Closure source; every lead value above 239 that
        // a byte can hold falls into this branch.
        var c2 = bytes[pos++];
        var c3 = bytes[pos++];
        var c4 = bytes[pos++];
        var u = ((c1 & 7) << 18 | (c2 & 63) << 12 | (c3 & 63) << 6 | c4 & 63) -
            0x10000;
        out[c++] = String.fromCharCode(0xD800 + (u >> 10));
        out[c++] = String.fromCharCode(0xDC00 + (u & 1023));
      } else {
        // Everything else is decoded as a three-byte sequence.
        var c2 = bytes[pos++];
        var c3 = bytes[pos++];
        out[c++] =
            String.fromCharCode((c1 & 15) << 12 | (c2 & 63) << 6 | c3 & 63);
      }
    }
    return out.join('');
  };

  var element = jQuery(this);
  var encoded = element.attr('data-emoji-bytes');
  if (encoded == null) {
    // No attribute to decode; skip this element and keep iterating.
    return;
  }

  var bytes;
  try {
    bytes = JSON.parse(encoded);
  } catch (err) {
    // A malformed attribute must not abort the surrounding .each() loop.
    if (typeof console !== 'undefined' && console.warn) {
      console.warn('displayEmoji: invalid data-emoji-bytes value', encoded, err);
    }
    return;
  }

  element.text(utf8ByteArrayToString(bytes));
}
</script>
<script>
// Once the DOM is ready, fill every <span class="emoji-bytes"> placeholder
// by running displayEmoji on it.
jQuery(document).ready(function renderEmojiSpans() {
  var emojiSpans = jQuery('span.emoji-bytes');
  emojiSpans.each(displayEmoji);
});
</script>
The above is basically the content at the bottom of the README file in the repo, with the script decode_emoji.js
added inline rather than sourced.
When you now publish a batch using the sample_with_emoji.csv
file, the emojis are properly rendered in the Preview.