import pandas as pd
data = [['4/04/2023','Floors','Tables','Roof','Paint','Paint'],['4/05/2023','Roof','Floors','Tables','Roof','Paint'],['4/06/2023','Paint','','Floors','Tables','Roof'],['4/07/2023','Roof','Paint','','Floors','Tables']]
df = pd.DataFrame(data, columns=['Date', 'Jim','Bob','Ed','James','Joe'])
df2 = df.melt(id_vars="Date", value_name='Items').dropna().reset_index(drop=True)
df3 = df2.pivot(index="Date", columns='Items', values='variable') df3 = df3[["Floors","Tables","Roof","Paint"]] #change the order
print(df)
print(df3)
I would like Pandas to detect duplicates and make an extra column where there are duplicates rather than throwing the following error:
raise ValueError("Index contains duplicate entries, cannot reshape")
ValueError: Index contains duplicate entries, cannot reshape
You can use:
data = [['4/04/2023','Floors','Tables','Roof','Paint','Paint'],
['4/05/2023','Roof','Floors','Tables','Roof','Paint'],
['4/06/2023','Paint','','Floors','Tables','Roof'],
['4/07/2023','Roof','Paint','','Floors','Tables']]
df = pd.DataFrame(data, columns=['Date', 'Jim','Bob','Ed','James','Joe'])
out = (
df.replace('', np.nan) # get rid of empty strings
# reshape to long
.melt(id_vars="Date", value_name='Items').dropna().reset_index(drop=True)
# deduplicate
.assign(n=lambda d: d.groupby(['Date', 'Items']).cumcount().add(1))
.pivot(index="Date", columns=['Items', 'n'], values='variable')
# flatten the MultiIndex
.pipe(lambda d: d.set_axis(d.columns.map(lambda x: f'{x[0]}_{x[1]}'), axis=1))
# fill NaN with empty strings and reset index
.fillna('').reset_index()
)
print(out)
Output:
Date Floors_1 Roof_1 Paint_1 Tables_1 Roof_2 Paint_2
0 4/04/2023 Jim Ed James Bob Joe
1 4/05/2023 Bob Jim Joe Ed James
2 4/06/2023 Ed Joe Jim James
3 4/07/2023 James Jim Bob Joe