I have a dataframe column that contains strings, and a separate list of substrings. For every substring, I want to test it against each string in the dataframe column, returning True if the substring occurs in the string. The following works but is very slow.
import pandas as pd
import time
# Time the regex-alternation approach on a small sample frame.
t0 = time.time()
df = pd.DataFrame(
    {
        'FullName': [
            'C:/historical Dog analysis/Digger.doc',
            'C:/historical Dog analysis/Roscoe.doc',
            'C:/2024/Budgie requests/pipsqueak.csv',
            'C:/text4.doc',
            'C:/text5.doc',
        ],
    }
)
# One case-insensitive regex; a row is flagged when any alternative occurs in it.
pattern = (
    "/historical Dog analysis/|"
    "/Budgie requests/|"
    "Dog analysis/best practices"
)
new_columns = {
    "_Outreach/Website design": df['FullName'].str.contains(pattern, case=False),
}
new_df = pd.DataFrame(new_columns)
# Attach the flag column next to the original data, keeping the original row order.
combined = pd.concat([df, new_df], axis=1)
df = combined.reindex(df.index)
t1 = time.time()
print(t1 - t0)
print(df)
In an effort to find a faster approach, I tried `isin`, but it only appears to work when matching a whole string against a whole string, not a substring against a string.
# Time the isin() attempt on the same sample frame.
t0 = time.time()
df = pd.DataFrame(
    {
        'FullName': [
            'C:/historical Dog analysis/Digger.doc',
            'C:/historical Dog analysis/Roscoe.doc',
            'C:/2024/Budgie requests/pipsqueak.csv',
            'C:/text4.doc',
            'C:/text5.doc',
        ],
    }
)
# Works, but is not useful here because isin() requires a full-string match.
exact_names = [
    "C:/historical Dog analysis/Digger.doc",
    "C:/2024/Budgie requests/pipsqueak.csv",
]
new_columns = df["FullName"].isin(exact_names)
# Doesn't work (returns all False in the next column):
# new_columns = df["FullName"].isin([".*/historical Dog analysis/.*"])
new_df = pd.DataFrame(new_columns)
# The boolean Series keeps the name "FullName", so the result has two columns with that name.
combined = pd.concat([df, new_df], axis=1)
df = combined.reindex(df.index)
t1 = time.time()
print(t1 - t0)
print(df)
I also tried `filter`, but it seems that it can only take one substring input at a time.
# Pull the column out as a plain Python list so it can be filtered directly.
col_one_list = df['FullName'].tolist()
# Doesn't work: TypeError: 'in <string>' requires string as left operand, not list
# b = ["/historical Dog analysis/","/Budgie requests/"]
# Doesn't work: TypeError: unsupported operand type(s) for |: 'str' and 'str'
# b = ("/historical Dog analysis/"|"/Budgie requests/")
# Works, but can only search for one substring at a time.
b = "/historical Dog analysis/"
# Keep only the names that contain the substring (a list of strings, not booleans).
new_columns = [name for name in col_one_list if b in name]
print(new_columns)
new_df = pd.DataFrame(new_columns)
combined = pd.concat([df, new_df], axis=1)
df = combined.reindex(df.index)
t1 = time.time()
print(t1 - t0)
print(df)
Does anyone know a fast way to match a list of substrings to strings?
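You can do the substring matching with a list comprehension that uses any()
to test each string against every substring, and store the resulting booleans in a numpy
array. Note that the matching loop itself runs in plain Python; numpy only holds the result.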
# Lower-case the substrings once up front instead of once per (row, substring) pair.
lowered_substrings = [substring.lower() for substring in substrings]
# Plain Python loop: a row is True when any lower-cased substring occurs in the
# lower-cased name. NumPy is only the container for the boolean result.
matches = np.array(
    [
        any(substring in lowered_name for substring in lowered_substrings)
        for lowered_name in (name.lower() for name in fullNames)
    ]
)
I have tested this solution against yours for 15,000 iterations and here is the output:
FullName Matches
0 C:/historical Dog analysis/Digger.doc True
1 C:/historical Dog analysis/Roscoe.doc True
2 C:/2024/Budgie requests/pipsqueak.csv True
3 C:/text4.doc False
4 C:/text5.doc False
Average time taken for substring matching: 0.000446 seconds.
FullName _Outreach/Website design
0 C:/historical Dog analysis/Digger.doc True
1 C:/historical Dog analysis/Roscoe.doc True
2 C:/2024/Budgie requests/pipsqueak.csv True
3 C:/text4.doc False
4 C:/text5.doc False
Average time taken for substring matching: 0.000811 seconds.
FullName FullName
0 C:/historical Dog analysis/Digger.doc True
1 C:/historical Dog analysis/Roscoe.doc False
2 C:/2024/Budgie requests/pipsqueak.csv True
3 C:/text4.doc False
4 C:/text5.doc False
Average time taken for substring matching: 0.000710 seconds.
Below is the complete code for comparing the functions, where Func1
represents the suggested approach, and the other two functions are yours.
import time
import pandas as pd
import numpy as np
def Func1(t):
    """Benchmark case-insensitive substring matching done with a list comprehension.

    Each of the ``t`` iterations builds the five-row sample frame, flags every
    ``FullName`` that contains at least one of the substrings (ignoring case),
    and stores the flags in a ``Matches`` column. Afterwards the frame from the
    last iteration and the mean time per iteration are printed.

    Args:
        t: Number of timed iterations; must be at least 1.

    Raises:
        ValueError: If ``t`` is smaller than 1, because no average can be
            computed and there is no frame to print.
    """
    if t < 1:
        raise ValueError("t must be a positive number of iterations")

    total_time = 0.0
    for _ in range(t):
        # perf_counter() is a monotonic, high-resolution clock; time.time()
        # can be too coarse for the sub-millisecond intervals measured here.
        t0 = time.perf_counter()
        df = pd.DataFrame(
            {
                'FullName': [
                    'C:/historical Dog analysis/Digger.doc',
                    'C:/historical Dog analysis/Roscoe.doc',
                    'C:/2024/Budgie requests/pipsqueak.csv',
                    'C:/text4.doc',
                    'C:/text5.doc',
                ]
            }
        )
        substrings = [
            "/historical Dog analysis/",
            "/Budgie requests/",
            "Dog analysis/best practices",
        ]
        fullNames = df["FullName"].values.astype(str)
        # Lower-case the substrings once instead of once per row.
        lowered_substrings = [substring.lower() for substring in substrings]
        # The matching itself is a plain Python loop; NumPy only stores the
        # resulting booleans.
        matches = np.array(
            [
                any(substring in lowered_name for substring in lowered_substrings)
                for lowered_name in (name.lower() for name in fullNames)
            ]
        )
        df["Matches"] = matches
        t1 = time.perf_counter()
        total_time += (t1 - t0)
    avgTime = total_time / t
    print(df)
    print(f"Average time taken for substring matching: {avgTime:.6f} seconds.")
def Func2(t):
    """Benchmark the ``Series.str.contains`` approach with a regex alternation.

    Each of the ``t`` iterations builds the five-row sample frame, evaluates a
    single case-insensitive pattern against ``FullName``, and concatenates the
    boolean result as the ``_Outreach/Website design`` column. Afterwards the
    frame from the last iteration and the mean time per iteration are printed.

    Args:
        t: Number of timed iterations; must be at least 1.

    Raises:
        ValueError: If ``t`` is smaller than 1, because no average can be
            computed and there is no frame to print.
    """
    if t < 1:
        raise ValueError("t must be a positive number of iterations")

    # The alternatives are joined with "|" so one regex covers all substrings.
    pattern = (
        "/historical Dog analysis/|"
        "/Budgie requests/|"
        "Dog analysis/best practices"
    )
    total_time = 0.0
    for _ in range(t):
        # perf_counter() is a monotonic, high-resolution clock; time.time()
        # can be too coarse for the sub-millisecond intervals measured here.
        t0 = time.perf_counter()
        df = pd.DataFrame(
            {
                'FullName': [
                    'C:/historical Dog analysis/Digger.doc',
                    'C:/historical Dog analysis/Roscoe.doc',
                    'C:/2024/Budgie requests/pipsqueak.csv',
                    'C:/text4.doc',
                    'C:/text5.doc',
                ],
            }
        )
        new_columns = {
            "_Outreach/Website design": df['FullName'].str.contains(
                pattern, case=False
            ),
        }
        new_df = pd.DataFrame(new_columns)
        # Kept as concat + reindex so the benchmarked approach stays the same.
        df = pd.concat([df, new_df], axis=1).reindex(df.index)
        t1 = time.perf_counter()
        total_time += (t1 - t0)
    avgTime = total_time / t
    print(df)
    print(f"Average time taken for substring matching: {avgTime:.6f} seconds.")
def Func3(t):
    """Benchmark the ``Series.isin`` approach, which matches whole strings only.

    Each of the ``t`` iterations builds the five-row sample frame, flags the
    rows whose ``FullName`` equals one of two full paths, and concatenates the
    boolean result to the frame. Afterwards the frame from the last iteration
    and the mean time per iteration are printed.

    Because ``isin`` compares complete values, the flags differ from the
    substring-based results. The boolean Series keeps the name ``FullName``,
    so the printed frame has two columns with that name.

    Args:
        t: Number of timed iterations; must be at least 1.

    Raises:
        ValueError: If ``t`` is smaller than 1, because no average can be
            computed and there is no frame to print.
    """
    if t < 1:
        raise ValueError("t must be a positive number of iterations")

    exact_names = [
        "C:/historical Dog analysis/Digger.doc",
        "C:/2024/Budgie requests/pipsqueak.csv",
    ]
    total_time = 0.0
    for _ in range(t):
        # perf_counter() is a monotonic, high-resolution clock; time.time()
        # can be too coarse for the sub-millisecond intervals measured here.
        t0 = time.perf_counter()
        df = pd.DataFrame(
            {
                'FullName': [
                    'C:/historical Dog analysis/Digger.doc',
                    'C:/historical Dog analysis/Roscoe.doc',
                    'C:/2024/Budgie requests/pipsqueak.csv',
                    'C:/text4.doc',
                    'C:/text5.doc',
                ],
            }
        )
        new_columns = df["FullName"].isin(exact_names)
        new_df = pd.DataFrame(new_columns)
        # Kept as concat + reindex so the benchmarked approach stays the same.
        df = pd.concat([df, new_df], axis=1).reindex(df.index)
        t1 = time.perf_counter()
        total_time += (t1 - t0)
    avgTime = total_time / t
    print(df)
    print(f"Average time taken for substring matching: {avgTime:.6f} seconds.")
if __name__ == "__main__":
    # Run the three benchmarks in order, each with the same iteration count.
    for benchmark in (Func1, Func2, Func3):
        benchmark(15000)