I don't want to convert escapes like \n or \r, so I can't use unicode_escape directly.
My code works, but it is very slow. Is there a better way?
def only_transform_backslash_u(content: bytes) -> bytes:
    r"""Replace literal ``\uXXXX`` escape text in *content* with UTF-8 bytes.

    Only a backslash, a lowercase ``u`` and exactly four hex digits are
    decoded.  Every other byte, including textual escapes such as ``\n``
    or ``\r``, is copied through unchanged.  Two consecutive escapes that
    form a UTF-16 surrogate pair are decoded as a single code point.  A
    malformed escape (too short, non-hex digits) or an unpaired surrogate
    is left exactly as written instead of raising.

    Args:
        content: Bytes that may contain ``\uXXXX`` escape text.

    Returns:
        A new bytes object with the escapes replaced.
    """
    # Local import so this function does not depend on file-level imports.
    import re

    def _to_utf8(match):
        high, low, single = match.groups()
        if high is not None:
            # Surrogate pair: combine both halves into one code point.
            code_point = (
                0x10000
                + ((int(high, 16) - 0xD800) << 10)
                + (int(low, 16) - 0xDC00)
            )
        else:
            code_point = int(single, 16)
            if 0xD800 <= code_point <= 0xDFFF:
                # An unpaired surrogate cannot be encoded as UTF-8;
                # keep the original escape text rather than raising.
                return match.group(0)
        return chr(code_point).encode("utf-8")

    # One C-level scan instead of a byte-by-byte loop with repeated bytes
    # concatenation.  The pair alternative comes first so that a high
    # surrogate is consumed together with its low surrogate.
    return re.sub(
        rb"\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})"
        rb"|\\u([0-9a-fA-F]{4})",
        _to_utf8,
        content,
    )
Here is the sample input and expected output:
INPUT
content = br'''\u25c6\u4e00\u53ea\n\u68d5\u8272\r\u7684\u72d0
\u72f8\u8df3\u8fc7\u4e00\t\u53ea\u61d2\u60f0\u7684\u72d7'''
OUTPUT
new_content = br'''\xe2\x97\x86\xe4\xb8\x80\xe5\x8f\xaa\n\xe6\xa3\x95\xe8\x89\xb2\r\xe7\x9a\x84\xe7\x8b\x90
\xe7\x8b\xb8\xe8\xb7\xb3\xe8\xbf\x87\xe4\xb8\x80\t\xe5\x8f\xaa\xe6\x87\x92\xe6\x83\xb0\xe7\x9a\x84\xe7\x8b\x97'''
I'd use a regular expression in bytes mode:
import re
# Matches either a UTF-16 surrogate pair written as two \uXXXX escapes
# (groups 1 and 2) or a single \uXXXX escape (group 3).  Hex digits are
# matched in both cases through explicit character classes instead of
# re.IGNORECASE: that flag would also let the literal "u" match "U" and
# corrupt 8-digit \UXXXXXXXX escapes.
_BACKSLASH_U_RE = re.compile(
    rb"\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})"
    rb"|\\u([0-9a-fA-F]{4})"
)


def _backslash_u_to_utf8(match):
    """Return the UTF-8 encoding for one match of ``_BACKSLASH_U_RE``."""
    high, low, single = match.groups()
    if high is not None:
        # Surrogate pair: combine both halves into one code point.
        code_point = (
            0x10000
            + ((int(high, 16) - 0xD800) << 10)
            + (int(low, 16) - 0xDC00)
        )
    else:
        code_point = int(single, 16)
        if 0xD800 <= code_point <= 0xDFFF:
            # An unpaired surrogate cannot be encoded as UTF-8; keep the
            # escape text as written rather than raising.
            return match.group(0)
    return chr(code_point).encode("utf-8")


def only_transform_backslash_u(content: bytes) -> bytes:
    r"""Replace literal ``\uXXXX`` escape text in *content* with UTF-8 bytes.

    Only a backslash, a lowercase ``u`` and exactly four hex digits (either
    case) are decoded.  Other textual escapes such as ``\n`` stay untouched,
    as do ``\U`` sequences, malformed escapes and unpaired surrogates.  A
    surrogate pair spelled as two consecutive escapes becomes one code point.

    Args:
        content: Bytes that may contain ``\uXXXX`` escape text.

    Returns:
        A new bytes object with the escapes replaced.
    """
    return _BACKSLASH_U_RE.sub(_backslash_u_to_utf8, content)
# Demo: the textual "\n" stays as written; only the \u2028 escape is
# replaced by its UTF-8 bytes.
out = only_transform_backslash_u(rb"Hel\nlo\u2028World")
print(f"{out!r}")
prints out
b'Hel\\nlo\xe2\x80\xa8World'