pythonpandasdataframe

Why am I getting "raise source.error("multiple repeat", re.error: multiple repeat at position 2" when trying to save data frames to csv files?


The code is attached below. It works fine until it gets to ai: df_ai in the database dict.

data = pd.read_csv('survey_results_public.csv')

df_demographics = data[['ResponseId', 'MainBranch', 'Age', 'Employment', 'EdLevel', 'YearsCode', 'Country']]

df_learn_code = data[['ResponseId', 'LearnCode']]

df_language = data[['ResponseId', 'LanguageAdmired']]

df_ai = data[['ResponseId', 'AISelect', 'AISent', 'AIAcc', 'AIComplex', 'AIThreat', 'AIBen', 'AIToolCurrently Using']]

database = {'demographics': df_demographics, 'learn_code': df_learn_code, 'language': df_language, 'ai': df_ai}

def find_semicolons(dataframe):
    result = []

    firstFifty = dataframe.head(50)

    for column in firstFifty.columns:
        if firstFifty[column].apply(lambda x: ';' in str(x)).any():
            result.append(column)

    return result


def transform_dataframe(dataframe):
    result = find_semicolons(dataframe)

    for column in result:
        values = [str(x).split(';') for x in dataframe[column].unique().tolist()]
        flat_values = []
        for x in values:
            flat_values.extend(x)
        flat_values = set(flat_values)
        for x in flat_values:
            dataframe[x] = dataframe[column].str.contains(x, na=False).astype(int)



for x in database:
    transform_dataframe(database.get(x))
    database.get(x).to_csv(x + '.csv')

Here's the traceback

Traceback (most recent call last):
  File "/Users/shalim/PycharmProjects/work/stackoverflow.py", line 45, in <module>
    transform_dataframe(database.get(x))
  File "/Users/shalim/PycharmProjects/work/stackoverflow.py", line 40, in transform_dataframe
    dataframe[x] = dataframe[column].str.contains(x, na=False).astype(int)
  File "/Users/shalim/PycharmProjects/work/venv/lib/python3.9/site-packages/pandas/core/strings/accessor.py", line 137, in wrapper
    return func(self, *args, **kwargs)
  File "/Users/shalim/PycharmProjects/work/venv/lib/python3.9/site-packages/pandas/core/strings/accessor.py", line 1327, in contains
    if regex and re.compile(pat).groups:
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/re.py", line 252, in compile
    return _compile(pattern, flags)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/re.py", line 304, in _compile
    p = sre_compile.compile(pattern, flags)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_compile.py", line 764, in compile
    p = sre_parse.parse(p, flags)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py", line 948, in parse
    p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py", line 443, in _parse_sub
    itemsappend(_parse(source, state, verbose, nested + 1,
  File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py", line 671, in _parse
    raise source.error("multiple repeat",
re.error: multiple repeat at position 2

Solution

  • Pandas .str.contains performs a regex search rather than a substring search, by default. That means that characters like * or + get treated as regex metacharacters instead of a literal asterisk or plus sign.

    It looks like you're trying to perform a substring search, not a regex search. Your x isn't a valid regex, and even if it was, it wouldn't mean what you want. You need to specify regex=False:

    dataframe[x] = dataframe[column].str.contains(x, na=False, regex=False).astype(int)