I have large datasets (ranging from 100k to 4 million rows) where I am looking for different relevant codes across multiple columns. For example, to flag each row where any of the code columns starts with one of a given set of codes, I would do:
import polars as pl
df = pl.DataFrame({
    'Codes_1': ['302E513', '301E513', '302E512'],
    'Codes_2': ['303E513', '306E510', '302E512']}).lazy()
conditions = ['302E513', '306E510']
column_names = ['Codes_1', 'Codes_2']
# create new column
df = df.with_columns(
    pl.when(pl.any_horizontal(
        pl.col(column_names).str.starts_with(conditions[0]),
        pl.col(column_names).str.starts_with(conditions[1])))
    .then(1.0)
    .otherwise(0.0)
    .alias('Column_name')
)
It gets really tedious when I am looking for, say, 4 codes instead of 2 and have to type out a line for each code to build the new column:
import polars as pl
df = pl.DataFrame({
    'Codes_1': ['302E513', '301E513', '302E512'],
    'Codes_2': ['303E513', '306E510', '302E512']}).lazy()
conditions = ['302E513', '306E510', '5164E23', '302E514']
column_names = ['Codes_1', 'Codes_2']
# create new column
df = df.with_columns(
    pl.when(pl.any_horizontal(
        # Tedious part
        pl.col(column_names).str.starts_with(conditions[0]),
        pl.col(column_names).str.starts_with(conditions[1]),
        pl.col(column_names).str.starts_with(conditions[2]),
        pl.col(column_names).str.starts_with(conditions[3])
    ))
    .then(1.0)
    .otherwise(0.0)
    .alias('Column_name')
)
I know that this can be done in pandas by updating a mask in a for loop:
import pandas as pd
df = pd.DataFrame({
    'Codes_1': ['302E513', '301E513', '302E512'],
    'Codes_2': ['303E513', '306E510', '302E512']})
conditions = ['302E513', '306E510']
column_names = ['Codes_1', 'Codes_2']
# loop to create new column (prefix match, to mirror str.starts_with above)
mask = False
for code in conditions:
    mask |= df[column_names].apply(lambda col: col.str.startswith(code)).any(axis=1)
df['Column_name'] = 0.0
df.loc[mask, 'Column_name'] = 1.0
print(df['Column_name'])
Here I can change conditions to hold any number of codes and the code still runs. However, I would much rather use Polars, as it is faster and does not overflow the RAM on my machine for the larger datasets. Any help is appreciated.
You could replace the multiple str.starts_with calls with a single regex and str.contains:
df.with_columns(
    pl.when(pl.any_horizontal(
        pl.col(column_names).str.contains(f"^({'|'.join(conditions)})"),
    ))
    .then(1.0)
    .otherwise(0.0)
    .alias('Column_name')
)
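One caveat: the codes are spliced into the pattern verbatim, so if a code could ever contain a regex metacharacter, escape it first. A minimal sketch using Python's re.escape (everything else unchanged):
import re
# escape each code in case it contains regex metacharacters like '.', '(' or '+'
pattern = f"^({'|'.join(re.escape(c) for c in conditions)})"
df.with_columns(
    pl.when(pl.any_horizontal(
        pl.col(column_names).str.contains(pattern),
    ))
    .then(1.0)
    .otherwise(0.0)
    .alias('Column_name')
)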
Or use a loop:
df.with_columns(
    pl.when(pl.any_horizontal(
        pl.col(column_names).str.starts_with(c)
        for c in conditions
    ))
    .then(1.0)
    .otherwise(0.0)
    .alias('Column_name')
)
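Note that df in your example is a LazyFrame, so you need .collect() to materialize the result; a small usage sketch with the loop version:
result = df.with_columns(
    pl.when(pl.any_horizontal(
        pl.col(column_names).str.starts_with(c)
        for c in conditions
    ))
    .then(1.0)
    .otherwise(0.0)
    .alias('Column_name')
).collect()  # run the lazy query
print(result)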
Intermediate:
# f"^({'|'.join(conditions)})"
'^(302E513|306E510|5164E23|302E514)'
Output (non-lazy):
┌─────────┬─────────┬─────────────┐
│ Codes_1 ┆ Codes_2 ┆ Column_name │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ f64 │
╞═════════╪═════════╪═════════════╡
│ 302E513 ┆ 303E513 ┆ 1.0 │
│ 301E513 ┆ 306E510 ┆ 1.0 │
│ 302E512 ┆ 302E512 ┆ 0.0 │
└─────────┴─────────┴─────────────┘
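As an aside, pl.any_horizontal already returns a boolean expression, so if you only need the 1.0/0.0 encoding you can cast it directly instead of going through when/then/otherwise (an equivalent alternative, not a requirement):
df.with_columns(
    pl.any_horizontal(
        pl.col(column_names).str.starts_with(c)
        for c in conditions
    )
    .cast(pl.Float64)  # True/False -> 1.0/0.0
    .alias('Column_name')
)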