python encoding decode

Replacing literal '\uxxxx' in string with corresponding unicode character


I don't want to convert other escape sequences such as \n or \r, so I can't use the unicode_escape codec directly.

My current code is very slow, though. Is there a better way?

    def only_transform_backslash_u(content):
        """Replace each literal ``\\uXXXX`` escape in *content* with UTF-8 bytes.

        Only the two-byte marker backslash + ``u`` triggers a replacement, so
        other escape-looking text (a literal ``\\n``, ``\\r``, ...) is copied
        through unchanged.

        Args:
            content: The bytes to scan.

        Returns:
            A new ``bytes`` object in which every six-byte ``\\uXXXX`` sequence
            has been replaced by the UTF-8 encoding of that code point.

        Raises:
            UnicodeDecodeError: If a ``\\u`` marker is not followed by four
                hexadecimal digits (for example at the very end of the input).
            UnicodeEncodeError: If an escape names a surrogate code point,
                which has no UTF-8 encoding.
        """
        marker = b'\\u'
        pieces = []
        pos = 0
        while True:
            # Jump straight to the next escape instead of testing every byte;
            # the untouched stretch in between is copied as a single slice.
            hit = content.find(marker, pos)
            if hit == -1:
                pieces.append(content[pos:])
                break
            pieces.append(content[pos:hit])
            # The slice holds exactly one escape, so 'unicode_escape' cannot
            # touch anything but this \uXXXX sequence.
            escape = content[hit:hit + 6]
            pieces.append(escape.decode('unicode_escape').encode('utf-8'))
            pos = hit + 6
        # One join keeps the work linear; repeated ``bytes +=`` re-copies the
        # accumulated output on every step.
        return b''.join(pieces)

Solution

  • I'd use a regular expression in bytes mode:

    import re
    
    def only_transform_backslash_u(content: bytes) -> bytes:
        """Replace literal ``\\uXXXX`` escapes in *content* with UTF-8 bytes.

        Only a lowercase ``u`` followed by exactly four hexadecimal digits (in
        either case) is treated as an escape. Everything else, including
        literal ``\\n``/``\\r`` text and eight-digit ``\\UXXXXXXXX`` escapes,
        is left untouched.

        Adjacent escapes that form a UTF-16 surrogate pair (for example
        ``\\ud83d\\ude00``) are merged into the single character they denote.

        Args:
            content: The bytes to scan.

        Returns:
            A new ``bytes`` object with every escape replaced by the UTF-8
            encoding of the character it names.

        Raises:
            UnicodeEncodeError: If an escape names a surrogate that is not
                part of a valid pair, since it has no UTF-8 encoding.
        """

        def _decode_run(match):
            # Each escape is exactly six bytes: backslash, 'u', four hex digits.
            run = match.group(0)
            units = "".join(
                chr(int(run[i + 2:i + 6], 16)) for i in range(0, len(run), 6)
            )
            # A UTF-16 round trip merges high/low surrogate pairs into one
            # code point. "surrogatepass" on both sides keeps unpaired
            # surrogates intact so the UTF-8 encode below reports them.
            merged = units.encode("utf-16-le", "surrogatepass").decode(
                "utf-16-le", "surrogatepass"
            )
            return merged.encode("utf-8")

        # Match whole runs of escapes so both halves of a surrogate pair reach
        # the callback together. The hex class is spelled out rather than
        # using re.IGNORECASE, which would also let the literal 'u' match 'U'.
        return re.sub(rb"(?:\\u[0-9a-fA-F]{4})+", _decode_run, content)
    
    
    # Demo: the literal "\n" must survive as-is; only "\u2028" is replaced.
    out = only_transform_backslash_u(content=rb"Hel\nlo\u2028World")

    print(f"{out!r}")
    

    prints out

    b'Hel\\nlo\xe2\x80\xa8World'