Tags: python, special-characters, python-re

Python re.sub(): replacing content when the replacement string contains special characters


I'm working on automatically replacing content in a file. re.search() successfully extracts new_content, but it contains special characters, and when I pass it to re.sub() as the replacement I get:

error: invalid group reference 3 at position 85 (line 2, column 8)

I also tried re.escape(). The substitution then succeeds, but the inserted new_content is unusable because extra backslashes are added to it.

Here is my code :

import re
# Matches the whole u8g2_custom_font array definition: the declaration
# header, then (lazily) everything up to the first `";` that is followed by
# a whitespace character. The pattern defines no capture groups.
pattern = r'const\s+uint8_t\s+u8g2_custom_font\[\d+\] U8G2_FONT_SECTION.+?";\s'

# Read the file from which the replacement font definition is extracted.
with open('u8g2_custom_font.c', 'r') as file:
    content = file.read()

# re.DOTALL lets `.+?` span the multi-line string literal; group(0) is the
# full matched text.
# NOTE(review): re.search() returns None when nothing matches, in which case
# .group(0) raises AttributeError.
new_content = re.search(pattern, content, flags=re.DOTALL).group(0)

# `content` is rebound here to the text of the file that is to be modified.
with open('u8g2_fonts.c', 'r') as file:
    content = file.read()


# This call raises "invalid group reference": re.sub() processes backslash
# escapes in the replacement string, so a sequence such as \3 inside the font
# data is read as a reference to group 3, which the pattern does not define.
modified_content = re.sub(pattern, new_content, content, flags=re.DOTALL)

and here is the new_content :

const uint8_t u8g2_custom_font[1242] U8G2_FONT_SECTION("u8g2_custom_font") = 
  ";\0\3\2\5\5\3\5\6\20\20\0\376\16\376\13\377\0\0\1#\1\362\12 \20\322\301H\372\235\244"
  "s\322i\351 )i\16\245\203\244\244\71\64(\261\316I\347\244\223\222~ \6\0\320\243\0(\14\203"
  "\375\241J\242$\352-\312\2)\15\203\371\241\310\242,\352%J\42\0-\7$\230\242\30\2.\7B"
  "\34\242\30\2/\14F\25\242-V\303\64\254\246\0\60\21F\25\242\322\242$\324\246D\211\66\61\211\62"
  "\11\61\13E\31\242\312\244$\354\323 \62\17F\25\242\31\222PL\63-\254\246\303\0\63\20F\25\242"
  "\31\222PL\243\71\25\305dH\0\64\20F\25\242\14\265$\252dI\226\14cZ\1\71\16F\25\242"
  "\31\222\320\230\14jc\64\1:\10\342<\242\30\342!C\15F\25\242\31\222\320\332QL\206\4F\14"
  "F\25\242\70\244\325AI\273\2H\13F\25\242\10\35\207A\364\30L\11F\25\242H\373\353\60P\15"
  "F\25\242\30\224\320\70,iW\0R\20F\25\242\30\224\320\70,Q-\311\222P\14T\12G\25\242"
  "\70dq\177\3Y\16G\25\242H\325$\213\262J\32w\3_\7'\364\241\70\4a\16\6\25\242\31"
  "\222\60M\206\321\246,\1c\15\6\25\242\31\222P\355\230\14\11\0d\14f\25\242mY\264\321\233\262"
  "\4e\17\6\25\242\31\222P\34\6\265\230\14\11\0h\13f\25\242H[\26M\364\61i\13e\31\242"
  "\12sD\354\323 j\14\245\325\241\254#b\37\245H\2m\22\7\25\242X\224(\222\42)\222\42)"
  "\222\42\251\0n\12\6\25\242H\26M\364\61o\14\6\25\242\31\222\320\307dH\0p\16F\325\241H"
  "\26Mt\334\224%M\1r\13\6\25\242H\26MT\273\2s\14\6\25\242\31\222PvL\206\4u"
  "\12\6\25\242\10\375\246,\1w\21\7\25\242H\245H\212\244H\212\244H\252X\0\334\15\306\25\242\211"
  "\332\341\320\77&C\2\0\0\0\4\377\377 &\12G\24\242\210\42\251\0\60\253\31\252\371\301\314\261\34"
  "\12\303A\31\264\260[\330\226FY\22\245:\222\1\60\267\26j\35\302\321\261\234\244cq\216\344H\216"
  "\344H\30\311:\0\60\350\26i\35\302\34\216\71\222#\341\20m\71\222#\71\222\16\321\30\60\353\30k"
  "\31\302\315\263<\312\243<\312JY\251V\312*Y\250\245!\0N\12\37\357\361\301\316\361\34\317\361\34"
  "\317\361\34\37\206\34\311\361\34\317\361\34\317\361\34\317\221\341\3W\30)\15\326\301\370\240\246\312pQS"
  "i\30\42\251I\32\206Hj\222\206!R\263D\31.z\246\14\27)\314\264H\33\36\4[P\37\357"
  "\321\301\31\336\341\34\316\341\34\325\341|\370\236\343\71\236\343\71\236\343\71\232\344p\216\0[\244\42\357\361"
  "\301\316)\71\62<(\71\252\243\331p\320s\70\315\207;V\307rh\70\350P\216\347\303\7e\234."
  "\17\322\301\314\343\60J\223\60\11\263J\226\206Q\62H\345\64\211\323$\33\206p\14\207\60\251\206I\24"
  "fQ\61\312\362\60\311\323\34\211\0h\177*\17\322\301\313\361l\30\302,\15\263aH\206%\15\263a"
  "\310tx\7\223\312\60DI\261\24\347\321p\213\363\70\217\363\70\4i\5/\17\322\301\212\363\70\317\206"
  "AK\223p\220\262\64K\303h\70$;\222)\311\20U\262\250%\213\212YT\314\206(\314\241\60O"
  "\302\34\311\0j\2-\17\322\301\12\323\254\34e\203\324\222%\321\20e\311\224\15RK\226D\203\222%"
  "C\226\14b\236\17_\223:\324\254e\222\232\352\71\0mt*\17\322\301\317\322\260\32\305a\222Ur"
  "$\256\244\265\70J+\271\224\15C\264\245a\226\206Y\32fi\230\15C\216\244\21\0v\204%\15\326"
  "\301J\343\264\34\16\323\60&\241\230\204V\61\312\206!\213\304,\22c\61\26\343a\210\305,\311\301\10"
  "}\332\62\17\322\301Ks$\314\221p\30\262$J\243\322\60$C\226\206\331\60dy\32\245\322\260,"
  "J\232%u\250\22%\245\226nQ\22FI\226C)\0\215\357,\17\322\301\207\342A\213\263l\310J"
  "YK\343 \346i\22\207Y\232\245J\62\15Z\22f\225\60\253\204Ye\312\242u\320\261\22\0\226}"
  "/\356\325\301\30\244a\311\242\60\211\262a\211\262\60)\16K\224cQ\64\334\242\70\213\206AK\242$"
  "YJ\25%\214\42\65\312\364\244\71\213\0\226\373(\17\322\301\32\16:\224#\303\203\222\306\322\222,\71"
  "\222cK\262\223\207\203\330\70\34\304\306\341\240Cq\36\347\310\60\4\234\345#\15\326\301\315\301\34\33\216"
  "y\70\34\363p\70\346\350p\220rt\70hQK\324Rj\321\221:\30\1\0";

I would like to replace the matched text with new_content without making it unusable.


Solution

  • The reason re.escape doesn't work is in the documentation:

    re.escape(pattern)
    ...
    This function must not be used for the replacement string in sub() and subn(), only backslashes should be escaped.

    Use:

    modified_content = re.sub(pattern, new_content.replace('\\', r'\\'), content, flags=re.DOTALL)
    #                                             ^^^^^^^^^^^^^^^^^^^^^
    

    Here is a simplified example:

    import re
    
    # Shortened version of the question's pattern; it defines no capture
    # groups, so any group reference in a replacement string is invalid.
    pattern = r'const\s+uint8_t\s+font\[\d+\] FONT.+?";\s'
    
    # Raw string: the backslashes in "\1\2\3\4" are literal characters.
    new_content = r'''const uint8_t font[123] FONT("u8g2_custom_font") =
      "\1\2\3\4";
    '''
    
    # Raw string standing in for the file whose font definition is replaced.
    content = r'''
    double abc;
    const uint8_t font[123] FONT("u8g2_custom_font") = "abcd";
    int x = 123;
    '''
    
    # reproduces OP exception: \1 in the replacement string is parsed as a
    # reference to group 1, which does not exist in the pattern
    try:
        print(re.sub(pattern, new_content, content, flags=re.DOTALL))
    except re.error as e:
        print(e)
    
    # incorrect result using re.escape: it escapes more than backslashes
    # (spaces, brackets, parentheses, newlines), and those extra backslashes
    # end up in the substituted text
    print(re.sub(pattern, re.escape(new_content), content, flags=re.DOTALL))
    
    # correct result escaping backslashes only: re.sub() turns each doubled
    # backslash back into a single literal backslash in the output
    print(re.sub(pattern, new_content.replace('\\', r'\\'), content, flags=re.DOTALL))
    

    Output:

    invalid group reference 1 at position 55 (line 2, column 5)
    
    double abc;
    const\ uint8_t\ font\[123\]\ FONT\("u8g2_custom_font"\)\ =\
    \ \ "\1\2\3\4";\
    int x = 123;
    
    
    double abc;
    const uint8_t font[123] FONT("u8g2_custom_font") =
      "\1\2\3\4";
    int x = 123;