Tags: python, encoding, decode

Replacing literal '\uxxxx' in string with corresponding unicode character


I don't want to convert other escape sequences such as `\n` or `\r`, so I can't use the `unicode_escape` codec directly.

My current code works, but it is very slow. Is there a better way?

    def only_transform_backslash_u(content):
        r"""Replace literal ``\uXXXX`` escapes in *content* with UTF-8 bytes.

        Only a lowercase ``\u`` followed by exactly four hex digits is
        decoded.  Every other byte, including sequences such as ``\n`` and
        ``\r`` and any truncated or malformed ``\u`` escape, is copied through
        unchanged.  A high/low surrogate pair (e.g. ``\ud83d\ude00``) is
        combined into the single code point it encodes; a lone surrogate
        cannot be encoded as UTF-8 and is kept as its original escape text.

        Args:
            content: Bytes that may contain literal ``\uXXXX`` escapes.

        Returns:
            A new ``bytes`` object with the escapes replaced.
        """
        hex_digits = b'0123456789abcdefABCDEF'

        def code_unit_at(pos):
            # Value of a well-formed \uXXXX escape starting at pos, else None.
            # Digits are checked one by one because int() alone would also
            # accept signs, whitespace and underscores.
            chunk = content[pos:pos + 6]
            if (len(chunk) == 6 and chunk[:2] == b'\\u'
                    and all(digit in hex_digits for digit in chunk[2:])):
                return int(chunk[2:], 16)
            return None

        # bytearray grows in place; repeated `bytes +=` would copy the whole
        # buffer on every step and make the loop quadratic.
        new_content = bytearray()

        idx = 0
        while idx < len(content):
            unit = code_unit_at(idx)
            if unit is None:
                new_content += content[idx:idx + 1]
                idx += 1
                continue

            width = 6
            if 0xD800 <= unit <= 0xDBFF:
                # High surrogate: try to pair it with a following low surrogate.
                low = code_unit_at(idx + 6)
                if low is not None and 0xDC00 <= low <= 0xDFFF:
                    unit = 0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00)
                    width = 12

            if 0xD800 <= unit <= 0xDFFF:
                # Lone surrogate: not encodable as UTF-8, keep the escape text.
                new_content += content[idx:idx + 6]
            else:
                new_content += chr(unit).encode('utf-8')
            idx += width
        return bytes(new_content)

Here is the sample input and expected output:

INPUT

content = br'''\u25c6\u4e00\u53ea\n\u68d5\u8272\r\u7684\u72d0
    \u72f8\u8df3\u8fc7\u4e00\u53ea\u61d2\u60f0\u7684\u72d7'''

OUTPUT

new_content = b'''\xe2\x97\x86\xe4\xb8\x80\xe5\x8f\xaa\\n\xe6\xa3\x95\xe8\x89\xb2\\r\xe7\x9a\x84\xe7\x8b\x90
    \xe7\x8b\xb8\xe8\xb7\xb3\xe8\xbf\x87\xe4\xb8\x80\xe5\x8f\xaa\xe6\x87\x92\xe6\x83\xb0\xe7\x9a\x84\xe7\x8b\x97'''

Solution

  • I'd use a regular expression in bytes mode:

    import re
    
    def only_transform_backslash_u(content: bytes) -> bytes:
        r"""Replace literal ``\uXXXX`` escapes in *content* with UTF-8 bytes.

        Only a lowercase ``\u`` followed by exactly four hex digits (either
        case) is decoded.  Other backslash sequences such as ``\n``, ``\r``
        or an uppercase ``\U...`` escape, and any truncated or malformed
        ``\u`` escape, are left untouched.  A high/low surrogate pair (e.g.
        ``\ud83d\ude00``) is combined into the single code point it encodes;
        a lone surrogate cannot be encoded as UTF-8 and is kept as its
        original escape text instead of raising ``UnicodeEncodeError``.

        Args:
            content: Bytes that may contain literal ``\uXXXX`` escapes.

        Returns:
            A new ``bytes`` object with the escapes replaced.
        """
        def to_utf8(match):
            high, low, single = match.groups()
            if single is None:
                # Surrogate pair: rebuild the supplementary-plane code point.
                code = (0x10000
                        + ((int(high, 16) - 0xD800) << 10)
                        + (int(low, 16) - 0xDC00))
            else:
                code = int(single, 16)
                if 0xD800 <= code <= 0xDFFF:
                    # Lone surrogate: keep the escape text as it is.
                    return match.group(0)
            return chr(code).encode("utf-8")

        # Hex digits are spelled out in both cases rather than using
        # re.IGNORECASE, which would also make the `u` match `U`.
        return re.sub(
            rb"\\u([dD][89abAB][0-9a-fA-F]{2})\\u([dD][c-fC-F][0-9a-fA-F]{2})"
            rb"|\\u([0-9a-fA-F]{4})",
            to_utf8,
            content,
        )
    
    
    # Demo: `\n` stays a literal backslash-n; only the `\u2028` escape is decoded.
    out = only_transform_backslash_u(rb"Hel\nlo\u2028World")
    
    print(f"{out!r}")
    

    prints out

    b'Hel\\nlo\xe2\x80\xa8World'