Tags: python, pandas, dataframe

Create new dataframe rows when column has comma delimited values


Example dataframe:

name      col1    col2     col3
bob       bird     78       1000
alice     cat      55       500,600,700
rob       dog      333      20,30

Desired DataFrame, with one row per value wherever col3 holds a comma-delimited string:

name     col1      col2     col3
bob      bird       78      1000
alice    cat        55      500
alice    cat        55      600
alice    cat        55      700
rob      dog        333      20
rob      dog        333      30

Any suggestions are appreciated. Thanks!


Solution

  • import pandas as pd
    import numpy as np
    
    
    class DataFrameExpander:
        """
        Expand a DataFrame so that each comma-separated value in a column gets its own row.

        The wrapped DataFrame is never modified: ``expand_column`` works on a copy
        and returns a new DataFrame.
        """

        def __init__(self, dataframe):
            """
            Store the DataFrame to expand.
            :param dataframe: The source DataFrame
            """
            self.dataframe = dataframe

        @staticmethod
        def _split_cell(value):
            """
            Split a single cell on commas, passing missing scalars through unchanged.
            :param value: The cell value (any type; non-strings are converted with str())
            :return: A list of string pieces, or the original value if it is missing
            """
            # Without this guard str(None) / str(nan) would turn missing cells into
            # the literal strings 'None' / 'nan'. The is_scalar check comes first
            # because pd.isna returns an array (not a bool) for list-like cells.
            if pd.api.types.is_scalar(value) and pd.isna(value):
                return value
            return str(value).split(',')

        def expand_column(self, column_name):
            """
            Return a new DataFrame with one row per comma-separated value in a column.

            The other columns are repeated for every piece, the index is reset to
            0..n-1, and missing cells in the column stay missing (a single row).
            :param column_name: The name of the column to split comma-separated values
            :return: The new extended DataFrame
            :raises KeyError: If ``column_name`` is not a column of the DataFrame
            """
            # Work on a copy so the caller's DataFrame keeps its original column.
            df = self.dataframe.copy()
            df[column_name] = df[column_name].apply(self._split_cell)
            expanded_df = df.explode(column_name, ignore_index=True)

            # Convert the column back to a numeric dtype when every piece parses;
            # otherwise keep the split strings. An explicit try/except replaces
            # errors='ignore', which pandas has deprecated.
            try:
                expanded_df[column_name] = pd.to_numeric(expanded_df[column_name])
            except (ValueError, TypeError):
                pass

            return expanded_df
    
    
    if __name__ == "__main__":
        # Demo: rebuild the example table from the question (col3 holds the
        # comma-delimited strings) and print it with one row per col3 value.
        sample = pd.DataFrame(
            {
                'name': ['bob', 'alice', 'rob'],
                'col1': ['bird', 'cat', 'dog'],
                'col2': [78, 55, 333],
                'col3': ['1000', '500,600,700', '20,30'],
            }
        )
        print(DataFrameExpander(sample).expand_column('col3'))
    
    
    
    Output:

        name  col1  col2  col3
    0    bob  bird    78  1000
    1  alice   cat    55   500
    2  alice   cat    55   600
    3  alice   cat    55   700
    4    rob   dog   333    20
    5    rob   dog   333    30