python string filter substring contains

Is there a fast way to match a column of strings to each substring in a list?


I have a dataframe column which is comprised of strings. I also have a list of substrings. For every substring, I want to test it against each string in the dataframe column, returning True if the substring is in the string. The following works but is very slow.

import pandas as pd
import time

# Take the start timestamp first so building the DataFrame is part of the measurement.
t0 = time.time()

df = pd.DataFrame(
    {
        'FullName': [
            'C:/historical Dog analysis/Digger.doc',
            'C:/historical Dog analysis/Roscoe.doc',
            'C:/2024/Budgie requests/pipsqueak.csv',
            'C:/text4.doc',
            'C:/text5.doc',
        ],
    }
)

# A single regex of '|'-separated alternatives; case=False makes the search
# case-insensitive.
pattern = (
    "/historical Dog analysis/|"
    "/Budgie requests/|"
    "Dog analysis/best practices"
)
is_match = df['FullName'].str.contains(pattern, case=False)
new_columns = {"_Outreach/Website design": is_match}

new_df = pd.DataFrame(new_columns)

# Put the flag column next to the original column, keeping the original row order.
combined = pd.concat([df, new_df], axis=1)
df = combined.reindex(df.index)

t1 = time.time()
print(t1 - t0)
print(df)

In an effort to find a faster approach, I tried isin. But it only appears to work when matching string to string, not string to substring.

# Start timing before the DataFrame is built, as in the first attempt.
t0 = time.time()

full_paths = [
    'C:/historical Dog analysis/Digger.doc',
    'C:/historical Dog analysis/Roscoe.doc',
    'C:/2024/Budgie requests/pipsqueak.csv',
    'C:/text4.doc',
    'C:/text5.doc',
]
df = pd.DataFrame({'FullName': full_paths})

# Works, but is not useful here because isin() requires a full string match.
exact_targets = [
    "C:/historical Dog analysis/Digger.doc",
    "C:/2024/Budgie requests/pipsqueak.csv",
]
new_columns = df["FullName"].isin(exact_targets)
# Doesn't work (returns False for every row in the next column):
# new_columns = df["FullName"].isin([".*/historical Dog analysis/.*"])

new_df = pd.DataFrame(new_columns)

# Attach the boolean result beside the original column, keeping the row order.
combined = pd.concat([df, new_df], axis=1)
df = combined.reindex(df.index)

t1 = time.time()
print(t1 - t0)
print(df)

I also tried filter, but it seems that it can only take one substring input at a time.

# Restart the timer and rebuild the DataFrame so this attempt is measured on
# its own and does not depend on whatever an earlier snippet left in `df`.
t0 = time.time()

df = pd.DataFrame({
    'FullName': ['C:/historical Dog analysis/Digger.doc', 'C:/historical Dog analysis/Roscoe.doc', 'C:/2024/Budgie requests/pipsqueak.csv', 'C:/text4.doc', 'C:/text5.doc'],
})

col_one_list = df['FullName'].tolist()
#doesn't work:TypeError: 'in <string>' requires string as left operand, not list
# b = ["/historical Dog analysis/","/Budgie requests/"]
#doesn't work: TypeError: unsupported operand type(s) for |: 'str' and 'str'
# b = ("/historical Dog analysis/"|"/Budgie requests/")
#works, but can only search one substring at a time
b = "/historical Dog analysis/"
# filter() keeps only the matching strings, so the result is a shorter list of
# paths rather than one boolean per row.
new_columns = list(filter(lambda x: b in x, col_one_list))
print(new_columns)

new_df = pd.DataFrame(new_columns)

# The filtered list is shorter than df, so rows without a counterpart get NaN.
df = pd.concat([df, new_df], axis=1).reindex(df.index)
t1 = time.time()
print(t1-t0)
print(df)

Does anyone know a fast way to match a list of substrings to strings?


Solution

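  • You can do the substring matching with a list comprehension that uses any() to test every substring against each name, and store the result in a NumPy array. Note that this is an ordinary Python loop rather than a truly vectorized NumPy operation; np.array() only holds the resulting booleans.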

    # Lower-case the substrings once up front instead of once per name.
    loweredSubstrings = [substring.lower() for substring in substrings]

    # Each name is lower-cased once, then tested against every substring;
    # any() stops at the first hit. np.array() only stores the booleans.
    matches = np.array(
      [
        any(substring in loweredName for substring in loweredSubstrings)
        for loweredName in (name.lower() for name in fullNames)
      ]
    )
    

    I have tested this solution against yours for 15,000 iterations and here is the output:

                                    FullName  Matches
    0  C:/historical Dog analysis/Digger.doc     True
    1  C:/historical Dog analysis/Roscoe.doc     True
    2  C:/2024/Budgie requests/pipsqueak.csv     True
    3                           C:/text4.doc    False
    4                           C:/text5.doc    False
    Average time taken for substring matching: 0.000446 seconds.
                                    FullName  _Outreach/Website design
    0  C:/historical Dog analysis/Digger.doc                      True
    1  C:/historical Dog analysis/Roscoe.doc                      True
    2  C:/2024/Budgie requests/pipsqueak.csv                      True
    3                           C:/text4.doc                     False
    4                           C:/text5.doc                     False
    Average time taken for substring matching: 0.000811 seconds.
                                    FullName  FullName
    0  C:/historical Dog analysis/Digger.doc      True
    1  C:/historical Dog analysis/Roscoe.doc     False
    2  C:/2024/Budgie requests/pipsqueak.csv      True
    3                           C:/text4.doc     False
    4                           C:/text5.doc     False
    Average time taken for substring matching: 0.000710 seconds.
    

    Below is the complete code for comparing the functions, where Func1 represents the suggested approach, and the other two functions are yours.

    import time
    import pandas as pd
    import numpy as np
    
    
    def Func1(t):
      """Benchmark case-insensitive substring matching done with a list comprehension.

      Each iteration builds a five-row sample DataFrame, flags every
      ``FullName`` that contains at least one of the substrings
      (case-insensitively) in a boolean ``Matches`` column, and times the
      whole iteration. The DataFrame from the last iteration and the average
      time per iteration are printed.

      Args:
        t: Number of iterations to average over; must be at least 1.

      Raises:
        ValueError: If ``t`` is less than 1, since there is nothing to average.
      """
      if t < 1:
        raise ValueError("t must be a positive integer")

      avgTime = 0.0
      for _ in range(t):
        # perf_counter() is monotonic and high-resolution, so very short
        # iterations are not rounded down to zero as they can be with time.time().
        t0 = time.perf_counter()
        df = pd.DataFrame(
          {
            'FullName': [
              'C:/historical Dog analysis/Digger.doc',
              'C:/historical Dog analysis/Roscoe.doc',
              'C:/2024/Budgie requests/pipsqueak.csv',
              'C:/text4.doc',
              'C:/text5.doc'
            ]
          }
        )
        substrings = [
          "/historical Dog analysis/",
          "/Budgie requests/",
          "Dog analysis/best practices"
        ]
        fullNames = df["FullName"].values.astype(str)

        # Lower-case the substrings once per iteration, not once per name.
        loweredSubstrings = [substring.lower() for substring in substrings]

        # This is an ordinary Python loop: each name is lower-cased once and
        # any() stops at the first hit. np.array() only stores the booleans.
        matches = np.array(
          [
            any(substring in loweredName for substring in loweredSubstrings)
            for loweredName in (name.lower() for name in fullNames)
          ]
        )
        df["Matches"] = matches
        t1 = time.perf_counter()
        avgTime += (t1 - t0)

      avgTime /= t
      print(df)
      print(f"Average time taken for substring matching: {avgTime:.6f} seconds.")
    
    
    def Func2(t):
      """Benchmark the regex-based ``Series.str.contains`` approach.

      Each iteration builds a five-row sample DataFrame, evaluates one
      case-insensitive regex of ``|``-separated alternatives against
      ``FullName``, and appends the boolean result as the
      ``_Outreach/Website design`` column via ``pd.concat``. The DataFrame
      from the last iteration and the average time per iteration are printed.

      Args:
        t: Number of iterations to average over; must be at least 1.

      Raises:
        ValueError: If ``t`` is less than 1, since there is nothing to average.
      """
      if t < 1:
        raise ValueError("t must be a positive integer")

      avgTime = 0.0
      for _ in range(t):
        # perf_counter() is monotonic and high-resolution, so very short
        # iterations are not rounded down to zero as they can be with time.time().
        t0 = time.perf_counter()
        df = pd.DataFrame(
          {
            'FullName': ['C:/historical Dog analysis/Digger.doc', 'C:/historical Dog analysis/Roscoe.doc',
                         'C:/2024/Budgie requests/pipsqueak.csv', 'C:/text4.doc', 'C:/text5.doc'],
          }
        )
        new_columns = {
          "_Outreach/Website design": df['FullName'].str.contains(
            "/historical Dog analysis/|"
            "/Budgie requests/|"
            "Dog analysis/best practices",
            case=False,
          )
        }
        new_df = pd.DataFrame(new_columns)
        # The concat + reindex step is kept on purpose: it is part of the
        # approach being benchmarked.
        df = pd.concat([df, new_df], axis=1).reindex(df.index)

        t1 = time.perf_counter()
        avgTime += (t1 - t0)

      avgTime /= t

      print(df)

      print(f"Average time taken for substring matching: {avgTime:.6f} seconds.")
    
    
    def Func3(t):
      """Benchmark the exact-match ``Series.isin`` approach.

      Each iteration builds a five-row sample DataFrame, tests ``FullName``
      for membership in a list of two complete paths, and appends the boolean
      result (a second column also named ``FullName``) via ``pd.concat``. The
      DataFrame from the last iteration and the average time per iteration
      are printed. This matches whole strings only, not substrings.

      Args:
        t: Number of iterations to average over; must be at least 1.

      Raises:
        ValueError: If ``t`` is less than 1, since there is nothing to average.
      """
      if t < 1:
        raise ValueError("t must be a positive integer")

      avgTime = 0.0
      for _ in range(t):
        # perf_counter() is monotonic and high-resolution, so very short
        # iterations are not rounded down to zero as they can be with time.time().
        t0 = time.perf_counter()

        df = pd.DataFrame(
          {
            'FullName': [
              'C:/historical Dog analysis/Digger.doc',
              'C:/historical Dog analysis/Roscoe.doc',
              'C:/2024/Budgie requests/pipsqueak.csv',
              'C:/text4.doc', 'C:/text5.doc'
            ],
          }
        )

        new_columns = df["FullName"].isin(
          [
            "C:/historical Dog analysis/Digger.doc",
            "C:/2024/Budgie requests/pipsqueak.csv"
          ]
        )

        # The Series keeps its name, so the new column is also called FullName.
        new_df = pd.DataFrame(new_columns)

        df = pd.concat([df, new_df], axis=1).reindex(df.index)
        t1 = time.perf_counter()
        avgTime += (t1 - t0)

      avgTime /= t

      print(df)

      print(f"Average time taken for substring matching: {avgTime:.6f} seconds.")
    
    
    if __name__ == "__main__":
      # Give every benchmark the same iteration count so the averages are comparable.
      for benchmark in (Func1, Func2, Func3):
        benchmark(15000)