I only want to decode `\uXXXX` escapes in a bytes object and leave sequences such as `\n` and `\r` untouched, so I can't use the `unicode_escape` codec directly.
My current code (below) is very slow, though. Is there a better way?
def only_transform_backslash_u(content):
    r"""Replace ``\uXXXX`` escapes in *content* with their UTF-8 encoding.

    Only a lowercase ``\u`` followed by exactly four hexadecimal digits is
    converted; every other byte (including sequences such as ``\n`` or
    ``\r``) is copied through unchanged.

    A high surrogate escape immediately followed by a low surrogate escape
    (e.g. ``\ud83d\ude00``) is combined into the single code point it
    represents. Malformed or truncated escapes and lone surrogates cannot be
    converted and are left in the output exactly as written.

    Args:
        content: The bytes to process.

    Returns:
        A new ``bytes`` object with the escapes expanded.
    """
    hex_digits = b"0123456789abcdefABCDEF"

    def _code_unit(at):
        # Value of a well-formed \uXXXX escape starting at *at*, else None.
        if content[at:at + 2] != b"\\u":
            return None
        digits = content[at + 2:at + 6]
        # Check the digits explicitly: int() alone would also accept
        # whitespace and underscores.
        if len(digits) != 4 or any(d not in hex_digits for d in digits):
            return None
        return int(digits, 16)

    # Collect slices and join once; repeated `bytes +=` is quadratic.
    pieces = []
    copied_to = 0
    search_from = 0
    while True:
        start = content.find(b"\\u", search_from)
        if start == -1:
            break
        unit = _code_unit(start)
        width = 6
        if unit is not None and 0xD800 <= unit <= 0xDBFF:
            low = _code_unit(start + 6)
            if low is not None and 0xDC00 <= low <= 0xDFFF:
                # Merge the UTF-16 surrogate pair into one code point.
                unit = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00)
                width = 12
        if unit is None or 0xD800 <= unit <= 0xDFFF:
            # Not convertible: skip past this "\u" and keep it verbatim.
            search_from = start + 2
            continue
        pieces.append(content[copied_to:start])
        pieces.append(chr(unit).encode("utf-8"))
        copied_to = search_from = start + width
    pieces.append(content[copied_to:])
    return b"".join(pieces)
I'd use a regular expression in bytes mode:
import re
# Either a UTF-16 surrogate pair (groups 1 and 2) or any single \uXXXX
# escape (group 3). Hex digits are matched case-insensitively via explicit
# character classes; re.IGNORECASE is avoided because it would also let the
# literal "u" match "U" and mangle 8-digit \UXXXXXXXX escapes.
_BACKSLASH_U = re.compile(
    rb"\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})"
    rb"|\\u([0-9a-fA-F]{4})"
)


def only_transform_backslash_u(content: bytes) -> bytes:
    r"""Replace ``\uXXXX`` escapes in *content* with their UTF-8 encoding.

    Only a lowercase ``\u`` followed by exactly four hexadecimal digits is
    converted; other backslash sequences (``\n``, ``\r``, ``\U...``) and
    malformed escapes are left untouched. A preceding backslash is not
    treated as escaping the sequence.

    A high surrogate escape immediately followed by a low surrogate escape
    (e.g. ``\ud83d\ude00``) is combined into the single code point it
    represents. A lone surrogate has no UTF-8 encoding, so its escape is
    kept verbatim instead of raising ``UnicodeEncodeError``.

    Args:
        content: The bytes to process.

    Returns:
        A new ``bytes`` object with the escapes expanded.
    """

    def _expand(match):
        high, low, single = match.groups()
        if high is not None:
            # Merge the UTF-16 surrogate pair into one code point.
            code_point = (
                0x10000
                + ((int(high, 16) - 0xD800) << 10)
                + (int(low, 16) - 0xDC00)
            )
        else:
            code_point = int(single, 16)
            if 0xD800 <= code_point <= 0xDFFF:
                # Lone surrogate: not encodable, keep the escape as written.
                return match.group(0)
        return chr(code_point).encode("utf-8")

    return _BACKSLASH_U.sub(_expand, content)
# Demo: the literal "\n" stays as two bytes (backslash + "n"), while the
# "\u2028" escape is replaced by the UTF-8 encoding of U+2028.
sample = rb"Hel\nlo\u2028World"
out = only_transform_backslash_u(sample)
print(f"{out!r}")
This prints:
b'Hel\\nlo\xe2\x80\xa8World'