I have CSV files with multiple columns of data retrieved from APIs, where each cell may contain either a single value or a list/array. The size of these lists is consistent across each column (e.g., a column named ALPHANUMS
having a row containing a list like "['A', 'B', '4']"
has the same list size of a column named COLOR
having a row containing a list "['red', 'blue', 'green']"
, but the list sizes can vary per CSV file depending on the API response. I would like to use pandas
to create separate CSV files for each element in a list column, while retaining the rest of the data in each file.
Here's an example of what the data might look like from this mockup function:
import random
import csv
# Predefined lists for NAME, CARS, and PHONE OS
NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
PHONE_OS = ["Android", "iOS"]
def create_csv(file_name, num_records):
cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
with open(file_name, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])
for i in range(num_records):
record = {
"id" : i + 1,
"name": [NAMES[n] for n in range(cur_random_list_size)],
"month": random.randint(1,12),
"cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
"phone": random.choice(PHONE_OS)
}
writer.writerow(record.values())
print(f"CSV file '{file_name}' created with {num_records} records.")
create_csv("people_data.csv", 5)
ID | NAME | MONTH | CARS | PHONE OS |
---|---|---|---|---|
1 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 2 | "['Toyota', 'Nissan', 'Nissan', 'Nissan', 'Audi', 'Honda']" | iOS |
2 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 4 | "['Nissan', 'Ford', 'Honda', 'Toyota', 'Ford', 'Honda']" | iOS |
3 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 8 | "['BMW', 'Honda', 'Tesla', 'Tesla', 'Tesla', 'Nissan']" | Android |
4 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 3 | "['Tesla', 'Audi', 'Chevrolet', 'Audi', 'Chevrolet', 'BMW']" | iOS |
5 | "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" | 8 | "['Ford', 'Tesla', 'BMW', 'Toyota', 'Nissan', 'Ford']" | Android |
And ideally, I'd like to separate this into five individual csv files, as an example for john_doe_people_data.csv
:
ID | NAME | MONTH | CARS | PHONE OS |
---|---|---|---|---|
1 | John Doe | 2 | Toyota | iOS |
2 | John Doe | 4 | Nissan | iOS |
3 | John Doe | 8 | BMW | Android |
4 | John Doe | 3 | Tesla | iOS |
5 | John Doe | 8 | Ford | Android |
All in all, how can I use pandas to create separate CSV files for each element in a list column, while keeping the rest of the data in each file?
I ended up using a combination of explode
,map
, and ast.literal_eval
to break out the columns with string lists into different CSV files.
Instead of hard-coding column names like NAME
or CARS
, the program now dynamically checks which columns contain string representations of lists. This is done by iterating over all columns and using the map_check_if_list_literal
function to identify list-like columns and later convert to literals with map_convert_list
applied element-wise :
import random
import csv
import pandas as pd
import ast
# Predefined lists for NAME, CARS, and PHONE OS
NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
PHONE_OS = ["Android", "iOS"]
def create_csv(file_name, num_records):
cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
with open(file_name, mode='w', newline='') as file:
writer = csv.writer(file)
writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])
for i in range(num_records):
record = {
"id" : i + 1,
"name": [NAMES[n] for n in range(cur_random_list_size)],
"month": random.randint(1,12),
"cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
"phone": random.choice(PHONE_OS)
}
writer.writerow(record.values())
print(f"CSV file '{file_name}' created with {num_records} records.")
def map_check_if_list_literal(element):
if isinstance(element,str):
try:
data = ast.literal_eval(element)
if isinstance(data, list):
return True
else:
return False
except Exception as e:
return False
else:
return False
def map_convert_list_literal(element):
if isinstance(element,str):
try:
data = ast.literal_eval(element)
if isinstance(data, list):
return data
else:
return element
except Exception as e:
return element
else:
return element
if __name__ == "__main__":
create_csv("people_data.csv", 5)
file_name = "people_data.csv"
df = pd.read_csv(file_name)
temp_df = df.map(map_check_if_list_literal)
columns_w_list = []
for c in temp_df.columns:
if temp_df[c].any():
columns_w_list.append(c)
new_df = df.map(map_convert_list_literal)
new_df = new_df.explode(columns_w_list)
#this is column of interest
reference_column = ast.literal_eval(df["NAME"].mode()[0])
for name in reference_column:
mask = new_df["NAME"] == name
unique_df = new_df[mask]
unique_df.to_csv(f"{name}_{file_name}.csv", index=False)