I'm working with the Stack Overflow 2024 survey. In the CSV file there are several multi-valued variables (values separated by ;). I want to apply one-hot encoding to the variables Employment and LanguageAdmired using MultiLabelBinarizer.
However, my code works only for the first one. It fails for the second one.
import pandas as pd
import gdown # pip install gdown
from sklearn.preprocessing import MultiLabelBinarizer
# Reproduction script: download the survey CSV, split two multi-valued columns
# into lists, and one-hot encode each of them with MultiLabelBinarizer.
# Google Drive file id of the CSV and the download URL built from it.
file_id = '1ul_F8Moo9jIGG5pAhUtYz-dIktQXp1Wf'
url = f'https://drive.google.com/uc?id={file_id}'
output = 'survey_results_public.csv'
gdown.download(url, output, quiet=False) # It takes some seconds (150MB csv file)
df = pd.read_csv(output)
# Drop the id column, then keep only the first occurrence of each duplicated row.
# Rows that survive the filter keep their original index labels, so any removed
# row leaves a gap in df's index.
df.drop('ResponseId', axis=1, inplace=True)
df=df[~df.duplicated(keep='first')].copy()
#df['LanguageAdmired'].fillna('Other', inplace=True)
# Replace missing answers with 'Other' so every row can be split into a list of labels.
df['LanguageAdmired'] = df['LanguageAdmired'].fillna('Other')
df['LanguageAdmired'] = df['LanguageAdmired'].str.split(';')
df['Employment'] = df['Employment'].str.split(';')
# Create instance of binarizer and fit_transform the 'Employment' column
mlb = MultiLabelBinarizer()
# Apply one-hot encoding to the 'Employment' column
binary_labels = mlb.fit_transform(df['Employment'])
# Convert the 'Employment' binary labels to a DataFrame
# NOTE(review): no index= is passed, so df_labels gets a default 0..n-1 index
# rather than df's index.
df_labels = pd.DataFrame(binary_labels, columns=['Employment_' + c for c in mlb.classes_])
# Concatenate the original DataFrame with the new one containing binary labels
# NOTE(review): concat(axis=1) aligns rows on index labels; labels present in only
# one of the two frames produce rows with NaN in the other frame's columns.
df = pd.concat([df, df_labels], axis=1).copy()
# Create instance of binarizer and fit_transform the 'LanguageAdmired' column
mlb2 = MultiLabelBinarizer()
# Apply one-hot encoding to the 'LanguageAdmired' column
binary_labels = mlb2.fit_transform(df['LanguageAdmired'])
# Convert the 'LanguageAdmired' binary labels to a DataFrame
df_labels = pd.DataFrame(binary_labels, columns=['LanguageAdmired_' + c for c in mlb2.classes_])
# Concatenate the original DataFrame with the new one containing binary labels
df = pd.concat([df, df_labels], axis=1)
df.shape
It fails here:
binary_labels = mlb2.fit_transform(df['LanguageAdmired'])
The error:
826 class_mapping = defaultdict(int)
827 class_mapping.default_factory = class_mapping.__len__
--> 828 yt = self._transform(y, class_mapping)
830 # sort classes and reorder columns
831 tmp = sorted(class_mapping, key=class_mapping.get)
...
--> 901 for label in labels:
902 try:
903 index.add(class_mapping[label])
TypeError: 'float' object is not iterable
Checking df.shape before and after concat() shows that concat() adds new rows.
The problem is caused by drop() together with the ~df.duplicated() filter, which remove some rows and therefore leave gaps in the index of the original df. Later, df_labels is created with a fresh default index, so it contains index labels that no longer exist in the original df. concat() then creates new rows for the index labels that do not match, which puts NaN in the LanguageAdmired column, and the next MultiLabelBinarizer cannot handle these NaN values.
The fix is to give the label DataFrames the original index by passing index=df.index:
# Reuse df's own index so that concat(axis=1) lines the label rows up with the
# original rows instead of creating extra NaN rows for non-matching labels.
df_labels1 = pd.DataFrame(
    binary_labels1,
    columns=['Employment_' + c for c in mlb1.classes_],
    index=df.index,
)
# Column names must come from mlb2, the binarizer fitted on 'LanguageAdmired';
# using mlb1.classes_ here would label the columns with Employment classes.
df_labels2 = pd.DataFrame(
    binary_labels2,
    columns=['LanguageAdmired_' + c for c in mlb2.classes_],
    index=df.index,
)
Full working code used for tests:
import pandas as pd
#import gdown # pip install gdown
from sklearn.preprocessing import MultiLabelBinarizer
# The CSV was downloaded manually beforehand from
# https://drive.google.com/uc?id=1ul_F8Moo9jIGG5pAhUtYz-dIktQXp1Wf
# (about 150MB; gdown.download(url, output, quiet=False) fetches the same file).
output = 'survey_results_public.csv'

df = pd.read_csv(output)
print('shape:', df.shape)

# Remove the id column, then keep only the first occurrence of each repeated row.
# The explicit copy keeps later column assignments from targeting a slice.
df = df.drop(columns='ResponseId')
is_repeat = df.duplicated(keep='first')
df = df[~is_repeat].copy()

# Turn the ';'-separated answers into lists of labels; missing
# 'LanguageAdmired' answers become the single label 'Other'.
df['LanguageAdmired'] = df['LanguageAdmired'].fillna('Other').str.split(';')
df['Employment'] = df['Employment'].str.split(';')
print('shape:', df.shape)

# One-hot encode 'Employment'. The label frame reuses df.index so that the
# final concat matches rows one-to-one.
mlb1 = MultiLabelBinarizer()
binary_labels1 = mlb1.fit_transform(df['Employment'])
df_labels1 = pd.DataFrame(
    binary_labels1,
    index=df.index,
    columns=[f'Employment_{label}' for label in mlb1.classes_],
)
print(len(mlb1.classes_))
print(df_labels1.shape)

# One-hot encode 'LanguageAdmired' the same way, with its own binarizer.
mlb2 = MultiLabelBinarizer()
binary_labels2 = mlb2.fit_transform(df['LanguageAdmired'])
df_labels2 = pd.DataFrame(
    binary_labels2,
    index=df.index,
    columns=[f'LanguageAdmired_{label}' for label in mlb2.classes_],
)
print(len(mlb2.classes_))
print(df_labels2.shape)

# Append both sets of indicator columns to the original frame in one step.
df = pd.concat([df, df_labels1, df_labels2], axis=1)
print('shape:', df.shape)