python, pandas

Skipping empty rows and fixing length mismatches when splitting columns


I am working with an Excel data set that looks like this:

Col 1  Col 2  Col 3    Col 4    Col 5    Col 6    Col 7    Col 8
1      2      A\n \nB  C\n \nD  E\n \nF  G\n \nH  I\n \nJ  K\n \nL
3      4
5      6      a\n \nb  c        e\n \nf  g\n \nh  i\n \nj  k\n \nl

I want to use Col 3 to determine how many new rows need to be created, and then split Col 3 through Col 8 based on "\n \n". My 2 issues are when Col 3 is empty and when there is a length mismatch between Col 3 and Col 4.
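
To be explicit about the separator: each populated cell holds its values joined by a newline, a space, and another newline, so a plain string split already yields the pieces that need to become new rows. For example:

cell = "A\n \nB"              # what a populated Col 3 cell contains
print(cell.split("\n \n"))    # ['A', 'B'] -> this row needs two output rows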

My goal is to have a data set that looks like this:

Col 1  Col 2  Col 3  Col 4  Col 5  Col 6  Col 7  Col 8
1      2      A      C      E      G      I      K
1      2      B      D      F      H      J      L
5      6      a      c      e      g      i      k
5      6      b      c      f      h      j      l

You can see that in the goal data set, Col 1 and Col 2 are replicated for each new row that is created, the row with an empty Col 3 is deleted entirely, and the row with a length mismatch between Col 3 and Col 4 simply repeats its single Col 4 value in the new row, since there was nothing to split.
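
For reference, here is the goal data set built by hand as a DataFrame (the index itself does not matter to me):

import pandas as pd

expected = pd.DataFrame({'Col 1': [1, 1, 5, 5],
                         'Col 2': [2, 2, 6, 6],
                         'Col 3': ['A', 'B', 'a', 'b'],
                         'Col 4': ['C', 'D', 'c', 'c'],
                         'Col 5': ['E', 'F', 'e', 'f'],
                         'Col 6': ['G', 'H', 'g', 'h'],
                         'Col 7': ['I', 'J', 'i', 'j'],
                         'Col 8': ['K', 'L', 'k', 'l']})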

The following is the script I have been working with. You can see the commented-out if/elif/else section that I have tried to use to solve my two issues. There are no issues when Col 3 is populated and the split lengths are the same across Col 3 - Col 8.

import pandas as pd


def splitappendix(moduleName):
    tableHeaders = ["A", "B", "C", "D", "E", "F", "G", "H"]
    splitfunctionHeaders = ["Col 1", "Col 2", "Col 3", "Col 4", "Col 5", "Col 6", "Col 7", "Col 8"]


    df = pd.read_excel(f'Output\\{moduleName}')
    df.columns = splitfunctionHeaders

    PrintConsole(df) 

    # drop rows where Col 3 is empty
    df.dropna(axis = 0, subset = ['Col 3'], inplace = True)

    
    PrintConsole(df) 
    
    df_rep = pd.DataFrame(columns =[f'Col {i}' for i in range(1,9)])


    for i, row in enumerate(df.index):
        '''
        if df.isnull().iloc[i,2]:
            df.dropna(axis = 0, subset = ['Col 3'], inplace = True)

        if df['Col 3'][i] == "NaN":
            df.dropna(axis = 0, subset = ['Col 3'], inplace = True) 

        elif len(df['Col 3'][i]) != len(df['Col 4'][i]):
            single_rep = pd.DataFrame(index = range(len(df['Col 3'][i].split('\n \n'))), columns =[f'Col {i}' for i in range(1,9)])
            single_rep['Col 1'] = df.iat[i,0]
            single_rep['Col 2'] = df.iat[i,1]
            single_rep['Col 4'] = df.iat[i,3]
            PrintConsole(single_rep)
            for j in range(3,3) and range(5,9):
                single_rep['Col {}'.format(j)] = df['Col {}'.format(j)][i].split('\n \n')
            df_rep = pd.concat([df_rep, single_rep])
            PrintConsole(df_rep)
        '''
        #else:
        single_rep = pd.DataFrame(index = range(len(df['Col 3'][i].split('\n \n'))), columns =[f'Col {i}' for i in range(1,9)])
        single_rep['Col 1'] = df.iat[i,0]
        single_rep['Col 2'] = df.iat[i,1]
        PrintConsole(single_rep)
        for j in range(3,9):
            single_rep['Col {}'.format(j)] = df['Col {}'.format(j)][i].split('\n \n')
        df_rep = pd.concat([df_rep, single_rep])
        PrintConsole(df_rep)
    df_rep.reset_index(drop=True, inplace=True)

    df_rep.columns = tableHeaders

    PrintConsole(df_rep)

    #df_rep.to_excel(f'Output\\{moduleName}')



if __name__ == "__main__":

    moduleName = 'Appendix.xlsx'
    splitappendix(moduleName)
    PrintConsole("Split Complete")
    ExitCode()

Solution

  • Assuming this input:

    df = pd.DataFrame({'Col 1': [1, 3, 5],
                       'Col 2': [2, 4, 6],
                       'Col 3': ['A\n \nB', None, 'a\n \nb'],
                       'Col 4': ['C\n \nD', None, 'c'],
                       'Col 5': ['E\n \nF', None, 'e\n \nf'],
                       'Col 6': ['G\n \nH', None, 'g\n \nh'],
                       'Col 7': ['I\n \nJ', None, 'i\n \nj'],
                       'Col 8': ['K\n \nL', None, 'k\n \nl'],
                      })
    

    You could split the strings with a custom function, then explode with de-duplication as described here:

    def explode_dedup(s):
        # explode list cells, then key each piece by (original row label, position within the split)
        s = s.explode()
        return s.set_axis(
            pd.MultiIndex.from_arrays([s.index, s.groupby(level=0).cumcount()])
        )
    
    def split(x):
        # split a cell on the separator; return a scalar when there is only one piece
        if isinstance(x, str):
            return l if len(l := x.split('\n \n')) > 1 else l[0]
        return x
    
    m = df['Col 3'].notna()   # keep only the rows where Col 3 is populated
    
    out = (pd.concat({c: explode_dedup(df.loc[m, c].map(split)) for c in df}, axis=1)
             .sort_index()
             .groupby(level=0).ffill()   # copy unsplit values (Col 1, Col 2, the lone 'c') into the new rows
             .droplevel(1).convert_dtypes()
          )
    

    Output:

       Col 1  Col 2 Col 3 Col 4 Col 5 Col 6 Col 7 Col 8
    0      1      2     A     C     E     G     I     K
    0      1      2     B     D     F     H     J     L
    2      5      6     a     c     e     g     i     k
    2      5      6     b     c     f     h     j     l
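
    The length mismatch is handled by split returning a scalar when there is only one piece: the lone 'c' in Col 4 then exists only at position 0 of its row, and groupby(level=0).ffill() copies it (together with Col 1 and Col 2) down into the second row. If you want to slot this into your splitappendix function, a minimal sketch of the final steps could look like the following (it assumes out from above plus the moduleName and tableHeaders values from your script):

    # drop the duplicated original index, relabel with the table headers,
    # and write the result back to the Output folder
    result = out.reset_index(drop=True)
    result.columns = ["A", "B", "C", "D", "E", "F", "G", "H"]
    result.to_excel(f'Output\\{moduleName}', index=False)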