I have a CSV file containing 720,000 rows and 10 columns; the columns that are relevant to the problem are
['timestamp_utc', 'looted_by__name', 'item_id', 'quantity']
This file is a log of items people loot off the ground in a game. The problem is that sometimes the loot logger bugs out and records the person looting the same item twice, in two different rows (those two rows could be separated by up to 5 rows), with a slight difference in the timestamp_utc column; otherwise ['looted_by__name', 'item_id', 'quantity'] are the same. An example of this would be:
2024-06-23T11:40:43.2187312Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:43.4588316Z,Georgeeto,T4_SOUL,2
where in this example 2024-06-23T11:40:43.2187312Z would be the timestamp_utc, 'Georgeeto' would be the looted_by__name, T4_SOUL would be the item_id, and 2 would be the quantity.
What I am trying to do here is check whether ['looted_by__name', 'item_id', 'quantity'] are equal in both rows, and if they are, subtract the two rows' timestamps from one another; if the difference is less than 0.5 seconds, I copy both corrupted lines into a Corrupted.csv file and put only one of the lines in a Clean.csv file.
The way I went about doing this is the following:
import pandas as pd
import time
from datetime import datetime
start_time = time.time()
combined_df_3 = pd.read_csv("Processing/combined_file_refined.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
combined_df_4 = pd.read_csv("Processing/combined_file_refined.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
bugged_item_df = pd.DataFrame()
clean_item_df = pd.DataFrame()
bugged_item_list = []
clean_item_list = []
date_format = '%Y-%m-%dT%H:%M:%S.%f'
for index1, row1 in combined_df_3.iterrows():
    n = 0
    time_stamp_1 = datetime.strptime(row1['timestamp_utc'][:26], date_format)
    name_1 = row1['looted_by__name']
    item_id_1 = row1['item_id']
    quantity_1 = row1['quantity']
    for index2, row2 in combined_df_4.iterrows():
        print(str(n))
        n += 1
        if n > 5:
            break
        time_stamp_2 = datetime.strptime(row2['timestamp_utc'][:26], date_format)
        name_2 = row2['looted_by__name']
        item_id_2 = row2['item_id']
        quantity_2 = row2['quantity']
        if time_stamp_1 == time_stamp_2 and name_1 == name_2 and item_id_1 == item_id_2 and quantity_1 == quantity_2:
            break  # get out of for loop here
        elif name_1 == name_2 and item_id_1 == item_id_2 and quantity_1 == quantity_2:
            if time_stamp_1 > time_stamp_2:
                date_diff = abs(time_stamp_1 - time_stamp_2)
                date_diff_sec = date_diff.total_seconds()
            elif time_stamp_1 < time_stamp_2:
                date_diff = abs(time_stamp_2 - time_stamp_1)
                date_diff_sec = date_diff.total_seconds()
            if date_diff_sec < 0.5:
                bugged_item_df = bugged_item_df._append(row1, ignore_index=True)
                bugged_item_df = bugged_item_df._append(row2, ignore_index=True)  # add both lines into a csv file and do not write one of them into the final csv file
            elif date_diff_sec > 0.5:
                pass  # type line into a csv file normally
        else:
            pass  # type line into a csv file normally
bugged_item_df.to_csv("test.csv", index=False)
clean_item_df.to_csv('test2.csv', index=False)
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
The way I'm doing it 'technically' works, but it takes about 6-13 hours to go through the entire file, so I came here to ask if there is a way to optimize it to run faster.
Note: the code is not finished yet, but you can get the idea from it.
Update: Thanks to the advice of AKZ (I love you, man), I was able to reduce the time from 13.4 hours to 32 minutes. I also realised that the for loop in the code I posted was done wrong, so I went with the following answer:
import time
import pandas as pd
from datetime import datetime
# organizing the rows
df = pd.read_csv("processing/combined_file_refined.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
df = df.groupby(['looted_by__name', 'timestamp_utc']).sum().reset_index()
df.to_csv("test.csv", index=False)
bugged_item_df = pd.DataFrame()
clean_item_df = pd.DataFrame()
df1 =pd.read_csv("test.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
date_format = '%Y-%m-%dT%H:%M:%S.%f'
n = 0
num_of_runs = 0
start_time = time.time()
for index1, row1 in df.iterrows():
    num_of_runs += 1
    n += 1
    try:
        row2 = df1.iloc[n]
    except IndexError:
        clean_item_df = clean_item_df._append(row1, ignore_index=True)
        break
    time_stamp_1 = datetime.strptime(row1['timestamp_utc'][:26], date_format)
    name_1 = row1['looted_by__name']
    item_id_1 = row1['item_id']
    quantity_1 = row1['quantity']
    time_stamp_2 = datetime.strptime(row2['timestamp_utc'][:26], date_format)
    name_2 = row2['looted_by__name']
    item_id_2 = row2['item_id']
    quantity_2 = row2['quantity']
    if name_1 != name_2 or item_id_1 != item_id_2 or quantity_1 != quantity_2:
        # add row 1 to df
        continue
    elif time_stamp_1 > time_stamp_2:
        date_diff_1 = abs(time_stamp_1 - time_stamp_2)
        date_diff_sec_1 = date_diff_1.total_seconds()
        if date_diff_sec_1 < 0.5:
            # do not add row 1 to df; add row 1 and row 2 to the bugged item list
            bugged_item_df = bugged_item_df._append(row1, ignore_index=True)
            bugged_item_df = bugged_item_df._append(row2, ignore_index=True)
            pass
        elif date_diff_sec_1 > 0.5:
            clean_item_df = clean_item_df._append(row1, ignore_index=True)
            # add row 1 to df
            continue
    elif time_stamp_1 < time_stamp_2:
        date_diff_2 = abs(time_stamp_2 - time_stamp_1)
        date_diff_sec_2 = date_diff_2.total_seconds()
        if date_diff_sec_2 < 0.5:
            bugged_item_df = bugged_item_df._append(row1, ignore_index=True)
            bugged_item_df = bugged_item_df._append(row2, ignore_index=True)
            # do not add row 1 to df; add row 1 and row 2 to the bugged item list
            pass
        elif date_diff_sec_2 > 0.5:
            clean_item_df = clean_item_df._append(row1, ignore_index=True)
            # add row 1 to df
            continue
bugged_item_df.to_csv("bugged.csv", index=False)
clean_item_df.to_csv("clean.csv", index=False)
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")
If someone has a better answer than the one I came up with, please post it; I would greatly appreciate it.
Update 2: I edited the code again and realised I could just remove the bugged lines instead; now it does it in 60 seconds.
import time
import pandas as pd
from datetime import datetime
# organizing the rows
combined_df_3 = pd.read_csv("processing/combined_file_refined.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
combined_df_3 = combined_df_3.groupby(['looted_by__name', 'timestamp_utc']).sum().reset_index()
combined_df_3.to_csv("processing/combined_file_orgnized.csv", index=False)
bugged_item_df = pd.DataFrame()
bugged_item_2df = pd.DataFrame()
combined_df_4 =pd.read_csv("processing/combined_file_orgnized.csv", delimiter= ',', usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
date_format = '%Y-%m-%dT%H:%M:%S.%f'
num_of_runs = 0
for index1, row1 in combined_df_3.iterrows():
    num_of_runs += 1
    try:
        row2 = combined_df_4.iloc[num_of_runs]
    except IndexError:
        break
    time_stamp_1 = datetime.strptime(row1['timestamp_utc'][:26], date_format)
    name_1 = row1['looted_by__name']
    item_id_1 = row1['item_id']
    quantity_1 = row1['quantity']
    time_stamp_2 = datetime.strptime(row2['timestamp_utc'][:26], date_format)
    name_2 = row2['looted_by__name']
    item_id_2 = row2['item_id']
    quantity_2 = row2['quantity']
    if name_1 != name_2 or item_id_1 != item_id_2 or quantity_1 != quantity_2:
        continue
    elif time_stamp_1 > time_stamp_2:
        date_diff_1 = abs(time_stamp_1 - time_stamp_2)
        date_diff_sec_1 = date_diff_1.total_seconds()
        if date_diff_sec_1 < 0.5:
            # do not add row 1 to df; add row 1 and row 2 to the bugged item list
            bugged_item_df = bugged_item_df._append(row1, ignore_index=True)
            bugged_item_df = bugged_item_df._append(row2, ignore_index=True)
            bugged_item_2df = bugged_item_2df._append(row1, ignore_index=True)
    elif time_stamp_1 < time_stamp_2:
        date_diff_2 = abs(time_stamp_2 - time_stamp_1)
        date_diff_sec_2 = date_diff_2.total_seconds()
        if date_diff_sec_2 < 0.5:
            bugged_item_df = bugged_item_df._append(row1, ignore_index=True)
            bugged_item_df = bugged_item_df._append(row2, ignore_index=True)
            bugged_item_2df = bugged_item_2df._append(row1, ignore_index=True)
            # do not add row 1 to df; add row 1 and row 2 to the bugged item list
bugged_item_df.to_csv("bugged.csv", index=False)
print('here')
# anti-join: keep only the rows of combined_df_3 that do not appear in bugged_item_2df
clean_item_df = combined_df_3.merge(
    bugged_item_2df,
    on=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'],
    how='left',
    indicator=True,
).query('_merge == "left_only"').drop('_merge', axis=1)
clean_item_df.to_csv("clean.csv", index=False)
If someone knows how to improve it beyond 30 seconds, feel free to add another way.
Pandas was not designed to iterate rows: Don't iterate rows in Pandas. That answer really goes down the rabbit hole in terms of performance and alternatives, but I think a good takeaway for you is that you need a better tool for the job.
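(If you do want to stay in Pandas, a rough vectorized sketch of the same dedup is below. It is an assumption-laden illustration rather than your exact spec: it assumes the log is already in chronological order, reuses the column names from your question, and writes only the later row of each near-duplicate pair to the corrupted file. The tool I would actually reach for comes next.)

import pandas as pd

df = pd.read_csv("combined_file_refined.csv",
                 usecols=['timestamp_utc', 'looted_by__name', 'item_id', 'quantity'])
ts = pd.to_datetime(df['timestamp_utc'])          # ISO-8601 strings parse directly
key = ['looted_by__name', 'item_id', 'quantity']

# Time since the previous row with the same (name, item, quantity) key;
# NaT for the first occurrence of each key, and NaT < Timedelta is False.
gap = ts.groupby([df[c] for c in key]).diff()
is_dupe = gap < pd.Timedelta(milliseconds=500)

df[~is_dupe].to_csv("clean.csv", index=False)     # first row of each pair is kept
df[is_dupe].to_csv("corrupted.csv", index=False)  # later near-duplicates only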
Enter Python's csv module and its humble but very fast reader: nothing beats the reader in terms of row-reading performance (and probably will remain that way as the Python contributors have optimized this in C over the years).
reader = csv.reader(open("input.csv", newline=""))
clean_w = csv.writer(open("output-clean.csv", "w", newline=""))
corrupt_w = csv.writer(open("output-corrupt.csv", "w", newline=""))
Granted, a row is just a list of strings, but this actually works to your advantage, because for this problem you only need to parse one field, the timestamp; the other three fields work fine just as strings because you use them for their identity, not their value. "2" or 2? It doesn't matter: this problem doesn't require doing math with the quantity, you only care about the quantity "two". I bring up this idea of identity-vs-value because your Pandas code spends some time parsing "2" → 2, when "2" works just fine.
I expanded on your sample rows:
2024-06-23T11:40:43.2187312Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:43.3962342Z,Alicechen,XXYY,3
2024-06-23T11:40:43.4588316Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:44.5634358Z,Bobbiejjj,AABB,1
With that, I get a clean CSV like:
2024-06-23T11:40:43.2187312Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:43.3962342Z,Alicechen,XXYY,3
2024-06-23T11:40:44.5634358Z,Bobbiejjj,AABB,1
I ran your final (to date) Pandas code against that input and got a clean CSV like:
Alicechen,2024-06-23T11:40:43.3962342Z,XXYY,3
Bobbiejjj,2024-06-23T11:40:44.5634358Z,AABB,1
Georgeeto,2024-06-23T11:40:43.4588316Z,T4_SOUL,2
Similar, though in yours Georgeeto's row is out of chronological order and the columns come back in a different order.
My corrupt CSV looks like yours:
2024-06-23T11:40:43.2187312Z,Georgeeto,T4_SOUL,2
2024-06-23T11:40:43.4588316Z,Georgeeto,T4_SOUL,2
My program uses a last_seen dict to keep track of a row and its timestamp, keyed to a tuple of username, item, count (or, in your own terms, looted_by__name, item_id, quantity). I made a lightweight dataclass to hold the timestamp and the complete row, and created a type alias for the key:
@dataclass
class Entry:
    ts: datetime
    row: list[str]

Key = tuple[str, str, str]  # combo of username,item,quantity
then, the dict looks like:
last_seen: dict[Key, Entry] = {}
As the reader loops through the rows, it records every row by its key. If the current row's key already exists in the dict, there may be a duplication, which is determined by subtracting the two timestamps. If the two rows do represent a duplicate (corrupt) entry, the current row gets marked as unclean, and a final check of the clean flag determines whether to write to the clean or the corrupt CSV.
This allows the program to only have to loop over the input once:
max_delta = timedelta(milliseconds=500)
last_seen: dict[Key, Entry] = {}

for row in reader:
    this_ts = datetime.strptime(row[0], "%Y-%m-%dT%H:%M:%S.%f")
    name = row[1]
    item = row[2]
    count = row[3]
    key = (name, item, count)

    clean = True
    last = Entry(datetime(1, 1, 1), [])  # get around "possibly unbound" error
    if key in last_seen:
        last = last_seen[key]
        delta = this_ts - last.ts
        if delta < max_delta:
            clean = False

    if clean:
        clean_w.writerow(row)
    else:
        corrupt_w.writerow(last.row)
        corrupt_w.writerow(row)

    last_seen[key] = Entry(this_ts, row)
I created a 720K-row test file and ran both of our programs against it. Yours ran in 28s and used about 239MB of memory; mine ran in under 4s and used about 14MB of memory. If I swap datetime.strptime(row[0], "%Y-%m-%dT%H:%M:%S.%f") for datetime.fromisoformat(row[0]), that shaves another 2s off mine, down to under 2s (probably because it doesn't have to interpret the format string 720K times).
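If you want to sanity-check that difference on your own machine, here is a hypothetical micro-benchmark (numbers will of course vary). The trimmed, 6-digit timestamp is deliberate: strptime's %f accepts at most six fractional digits, which is why your code sliced the field with [:26].

import timeit
from datetime import datetime

ts = "2024-06-23T11:40:43.218731"   # trimmed to 6 fractional digits, no trailing Z
print(timeit.timeit(lambda: datetime.strptime(ts, "%Y-%m-%dT%H:%M:%S.%f"), number=720_000))
print(timeit.timeit(lambda: datetime.fromisoformat(ts), number=720_000))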
My complete program:
import csv
from dataclasses import dataclass
from datetime import datetime, timedelta

reader = csv.reader(open("big.csv", newline=""))
clean_w = csv.writer(open("output-clean.csv", "w", newline=""))
corrupt_w = csv.writer(open("output-corrupt.csv", "w", newline=""))

# Copy header from reader to output writers
header = next(reader)
clean_w.writerow(header)
corrupt_w.writerow(header)

# Create small class, and a separate type, for cleaner code w/
# type safety.
@dataclass
class Entry:
    ts: datetime
    row: list[str]

Key = tuple[str, str, str]  # username,item,quantity

# Precompute delta (saves about .2s over 720K iterations)
max_delta = timedelta(milliseconds=500)

# Initialize dict
last_seen: dict[Key, Entry] = {}

# Iterate reader, parse row, check for previous key in
# last_seen and determine clean status, write accordingly,
# save current row to key in last_seen.
for row in reader:
    this_ts = datetime.fromisoformat(row[0])
    name = row[1]
    item = row[2]
    count = row[3]
    key = (name, item, count)

    clean = True
    last = Entry(datetime(1, 1, 1), [])  # get around "possibly unbound" error
    if key in last_seen:
        last = last_seen[key]
        delta = this_ts - last.ts
        if delta < max_delta:
            clean = False

    if clean:
        clean_w.writerow(row)
    else:
        corrupt_w.writerow(last.row)
        corrupt_w.writerow(row)

    last_seen[key] = Entry(this_ts, row)
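One optional tweak, purely a suggestion and not needed for the timings above: the three files are opened at module level and left for the interpreter to close, so if you fold this into a larger script you might prefer context managers so the outputs are flushed and closed deterministically. A sketch of just the setup:

import csv

# Same reader/writer setup as above, but with the handles managed by `with`
# (the header copy and the dedup loop would sit inside this block unchanged).
with open("big.csv", newline="") as f_in, \
     open("output-clean.csv", "w", newline="") as f_clean, \
     open("output-corrupt.csv", "w", newline="") as f_corrupt:
    reader = csv.reader(f_in)
    clean_w = csv.writer(f_clean)
    corrupt_w = csv.writer(f_corrupt)
    # ... header copy and dedup loop from the program above ...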