I am working with an Excel data set that looks like this
Col 1 | Col 2 | Col 3 | Col 4 | Col 5 | Col 6 | Col 7 | Col 8 |
---|---|---|---|---|---|---|---|
1 | 2 | A\n \nB | C\n \nD | E\n \nF | G\n \nH | I\n \nJ | K\n \nL |
3 | 4 | ||||||
5 | 6 | a\n \nb | c | e\n \nf | g\n \nh | i\n \nj | k\n \nl |
I want to use Col 3 to determine how many new rows need to be created, and then split Col 3 through Col 8 based on "\n \n". My 2 issues are when Col 3 is empty and when there is a length mismatch between Col 3 and Col 4.
My goal is to have a data set that looks like this
Col 1 | Col 2 | Col 3 | Col 4 | Col 5 | Col 6 | Col 7 | Col 8 |
---|---|---|---|---|---|---|---|
1 | 2 | A | C | E | G | I | K |
1 | 2 | B | D | F | H | J | L |
5 | 6 | a | c | e | g | i | k |
5 | 6 | b | c | f | h | j | l |
You can see that in the goal data set Col 1 and Col 2 are replicated for the new row created, the row that contained an empty Col 3 was deleted, and the row that had a length mismatch between Col 3 and Col 4 just copied the value for the new row since there was nothing to split.
The following is the script that I have been working with. You can see the commented out section of if/elif/else that I have tried to use to solve my two issues. There are no issues when Col 3 is populated and the index lengths are the same across Col 3 - Col 8.
def splitappendix(moduleName):
tableHeaders = ["A", "B", "C", "D", "E", "F", "G", "H"]
splitfunctionHeaders = ["Col 1", "Col 2", "Col 3", "Col 4", "Col 5", "Col 6", "Col 7", "Col 8"]
df = pd.read_excel(f'Output\\{moduleName}')
df.columns = splitfunctionHeaders
PrintConsole(df)
for i, row in df.iterrows():
df.dropna(axis = 0, subset = ['Col 3'], inplace = True)
PrintConsole(df)
df_rep = pd.DataFrame(columns =[f'Col {i}' for i in range(1,9)])
for i, row in enumerate(df.index):
'''
if df.isnull().iloc[i,2]:
df.dropna(axis = 0, subset = ['Col 3'], inplace = True)
if df['Col 3'][i] == "NaN":
df.dropna(axis = 0, subset = ['Col 3'], inplace = True)
elif len(df['Col 3'][i]) != len(df['Col 4'][i]):
single_rep = pd.DataFrame(index = range(len(df['Col 3'][i].split('\n \n'))), columns =[f'Col {i}' for i in range(1,9)])
single_rep['Col 1'] = df.iat[i,0]
single_rep['Col 2'] = df.iat[i,1]
single_rep['Col 4'] = df.iat[i,3]
PrintConsole(single_rep)
for j in range(3,3) and range(5,9):
single_rep['Col {}'.format(j)] = df['Col {}'.format(j)][i].split('\n \n')
df_rep = pd.concat([df_rep, single_rep])
PrintConsole(df_rep)
'''
#else:
single_rep = pd.DataFrame(index = range(len(df['Col 3'][i].split('\n \n'))), columns =[f'Col {i}' for i in range(1,9)])
single_rep['Col 1'] = df.iat[i,0]
single_rep['Col 2'] = df.iat[i,1]
PrintConsole(single_rep)
for j in range(3,9):
single_rep['Col {}'.format(j)] = df['Col {}'.format(j)][i].split('\n \n')
df_rep = pd.concat([df_rep, single_rep])
PrintConsole(df_rep)
df_rep.reset_index(drop=True, inplace=True)
df_rep.columns = tableHeaders
PrintConsole(df_rep)
#df_rep.to_excel(f'Output\\{moduleName}')
if __name__ == "__main__":
moduleName = 'Appendix.xlsx'
splitappendix(moduleName)
PrintConsole("Split Complete")
ExitCode()
Assuming this input:
df = pd.DataFrame({'Col 1': [1, 3, 5],
'Col 2': [2, 4, 6],
'Col 3': ['A\\n \\nB', None, 'a\\n \\nb'],
'Col 4': ['C\\n \\nD', None, 'c'],
'Col 5': ['E\\n \\nF', None, 'e\\n \\nf'],
'Col 6': ['G\\n \\nH', None, 'g\\n \\nh'],
'Col 7': ['I\\n \\nJ', None, 'i\\n \\nj'],
'Col 8': ['K\\n \\nL', None, 'k\\n \\nl'],
})
You could split the strings with a custom function, then explode with de-duplication as described here:
def explode_dedup(s):
s = s.explode()
return s.set_axis(
pd.MultiIndex.from_arrays([s.index, s.groupby(level=0).cumcount()])
)
def split(x):
if isinstance(x, str):
return l if len(l:=x.split('\\n \\n'))>1 else l[0]
return x
m = df['Col 3'].notna()
out = (pd.concat({c: explode_dedup(df.loc[m, c].map(split)) for c in df}, axis=1)
.sort_index()
.groupby(level=0).ffill()
.droplevel(1).convert_dtypes()
)
Output:
Col 1 Col 2 Col 3 Col 4 Col 5 Col 6 Col 7 Col 8
0 1 2 A C E G I K
0 1 2 B D F H J L
2 5 6 a c e g i k
2 5 6 b c f h j l