The code is attached below. It works fine until it gets to ai: df_ai
in the database
dict.
data = pd.read_csv('survey_results_public.csv')
df_demographics = data[['ResponseId', 'MainBranch', 'Age', 'Employment', 'EdLevel', 'YearsCode', 'Country']]
df_learn_code = data[['ResponseId', 'LearnCode']]
df_language = data[['ResponseId', 'LanguageAdmired']]
df_ai = data[['ResponseId', 'AISelect', 'AISent', 'AIAcc', 'AIComplex', 'AIThreat', 'AIBen', 'AIToolCurrently Using']]
database = {'demographics': df_demographics, 'learn_code': df_learn_code, 'language': df_language, 'ai': df_ai}
def find_semicolons(dataframe):
result = []
firstFifty = dataframe.head(50)
for column in firstFifty.columns:
if firstFifty[column].apply(lambda x: ';' in str(x)).any():
result.append(column)
return result
def transform_dataframe(dataframe):
result = find_semicolons(dataframe)
for column in result:
values = [str(x).split(';') for x in dataframe[column].unique().tolist()]
flat_values = []
for x in values:
flat_values.extend(x)
flat_values = set(flat_values)
for x in flat_values:
dataframe[x] = dataframe[column].str.contains(x, na=False).astype(int)
for x in database:
transform_dataframe(database.get(x))
database.get(x).to_csv(x + '.csv')
Here's the traceback
Traceback (most recent call last):
File "/Users/shalim/PycharmProjects/work/stackoverflow.py", line 45, in <module>
transform_dataframe(database.get(x))
File "/Users/shalim/PycharmProjects/work/stackoverflow.py", line 40, in transform_dataframe
dataframe[x] = dataframe[column].str.contains(x, na=False).astype(int)
File "/Users/shalim/PycharmProjects/work/venv/lib/python3.9/site-packages/pandas/core/strings/accessor.py", line 137, in wrapper
return func(self, *args, **kwargs)
File "/Users/shalim/PycharmProjects/work/venv/lib/python3.9/site-packages/pandas/core/strings/accessor.py", line 1327, in contains
if regex and re.compile(pat).groups:
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/re.py", line 252, in compile
return _compile(pattern, flags)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/re.py", line 304, in _compile
p = sre_compile.compile(pattern, flags)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_compile.py", line 764, in compile
p = sre_parse.parse(p, flags)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py", line 948, in parse
p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py", line 443, in _parse_sub
itemsappend(_parse(source, state, verbose, nested + 1,
File "/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.9/lib/python3.9/sre_parse.py", line 671, in _parse
raise source.error("multiple repeat",
re.error: multiple repeat at position 2
Pandas .str.contains
performs a regex search rather than a substring search, by default. That means that characters like *
or +
get treated as regex metacharacters instead of a literal asterisk or plus sign.
It looks like you're trying to perform a substring search, not a regex search. Your x
isn't a valid regex, and even if it was, it wouldn't mean what you want. You need to specify regex=False
:
dataframe[x] = dataframe[column].str.contains(x, na=False, regex=False).astype(int)