pythonpandas

Split columns containing lists from CSV into separate CSV files with pandas


I have CSV files with multiple columns of data retrieved from APIs, where each cell may contain either a single value or a list/array. The size of these lists is consistent across each column (e.g., a column named ALPHANUMS having a row containing a list like "['A', 'B', '4']" has the same list size of a column named COLOR having a row containing a list "['red', 'blue', 'green']", but the list sizes can vary per CSV file depending on the API response. I would like to use pandas to create separate CSV files for each element in a list column, while retaining the rest of the data in each file.

Here's an example of what the data might look like from this mockup function:

import random
import csv

# Predefined lists for NAME, CARS, and PHONE OS
NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
PHONE_OS = ["Android", "iOS"]

def create_csv(file_name, num_records):
    cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        
        writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])

        for i in range(num_records):

            record = {
                "id" : i + 1,
                "name": [NAMES[n] for n in range(cur_random_list_size)],
                "month": random.randint(1,12),
                "cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
                "phone": random.choice(PHONE_OS)
            }
            writer.writerow(record.values())
    
    print(f"CSV file '{file_name}' created with {num_records} records.")

create_csv("people_data.csv", 5)
ID NAME MONTH CARS PHONE OS
1 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 2 "['Toyota', 'Nissan', 'Nissan', 'Nissan', 'Audi', 'Honda']" iOS
2 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 4 "['Nissan', 'Ford', 'Honda', 'Toyota', 'Ford', 'Honda']" iOS
3 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 8 "['BMW', 'Honda', 'Tesla', 'Tesla', 'Tesla', 'Nissan']" Android
4 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 3 "['Tesla', 'Audi', 'Chevrolet', 'Audi', 'Chevrolet', 'BMW']" iOS
5 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 8 "['Ford', 'Tesla', 'BMW', 'Toyota', 'Nissan', 'Ford']" Android

And ideally, I'd like to separate this into five individual csv files, as an example for john_doe_people_data.csv:

ID NAME MONTH CARS PHONE OS
1 John Doe 2 Toyota iOS
2 John Doe 4 Nissan iOS
3 John Doe 8 BMW Android
4 John Doe 3 Tesla iOS
5 John Doe 8 Ford Android

All in all, how can I use pandas to create separate CSV files for each element in a list column, while keeping the rest of the data in each file?


Solution

  • I ended up using a combination of explode,map, and ast.literal_eval to break out the columns with string lists into different CSV files.

    Instead of hard-coding column names like NAME or CARS, the program now dynamically checks which columns contain string representations of lists. This is done by iterating over all columns and using the map_check_if_list_literal function to identify list-like columns and later convert to literals with map_convert_list applied element-wise :

    import random
    import csv
    import pandas as pd
    import ast
    
    # Predefined lists for NAME, CARS, and PHONE OS
    NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
    CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
    PHONE_OS = ["Android", "iOS"]
    
    def create_csv(file_name, num_records):
        cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
        with open(file_name, mode='w', newline='') as file:
            writer = csv.writer(file)
            
            writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])
    
            for i in range(num_records):
    
                record = {
                    "id" : i + 1,
                    "name": [NAMES[n] for n in range(cur_random_list_size)],
                    "month": random.randint(1,12),
                    "cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
                    "phone": random.choice(PHONE_OS)
                }
                writer.writerow(record.values())
        
        print(f"CSV file '{file_name}' created with {num_records} records.")
    
    def map_check_if_list_literal(element):
        if isinstance(element,str):
            try:
                data = ast.literal_eval(element)
                if isinstance(data, list):
                    return True
                else:
                    return False
            except Exception as e:
                return False
        else:
            return False
    
    def map_convert_list_literal(element):
        if isinstance(element,str):
            try:
                data = ast.literal_eval(element)
                if isinstance(data, list):
                    return data
                else:
                    return element
            except Exception as e:
                return element
        else:
            return element
        
    if __name__ == "__main__":
        create_csv("people_data.csv", 5)
    
        file_name = "people_data.csv"
        df = pd.read_csv(file_name)
        temp_df = df.map(map_check_if_list_literal)
        columns_w_list = []
        for c in temp_df.columns:
            if temp_df[c].any():
                columns_w_list.append(c)
        
        new_df = df.map(map_convert_list_literal)
        new_df = new_df.explode(columns_w_list)
        
        #this is column of interest
        reference_column = ast.literal_eval(df["NAME"].mode()[0])
    
        for name in reference_column:
            mask = new_df["NAME"] == name
            unique_df = new_df[mask]
            unique_df.to_csv(f"{name}_{file_name}.csv", index=False)