How should one join two pl.LazyFrame
using two columns from each pl.LazyFrame
based on content in the columns of the left pl.LazyFrame
?
import polars as pl
lf1 = pl.LazyFrame(
data={
"col_1": ["a", "b", "c"],
"col_2": ["d", None, None],
"col_3": [None, "e", None],
},
)
lf2 = pl.LazyFrame(
data={
"col_a": ["d", "xyz"],
"col_b": ["xyz", "e"],
"col_c": ["relevant_info_1", "relevant_info_2"],
},
)
Pseudo-code of desired join :
lf1.join(lf2,
when(col("col_2").is_not_null().then(left_on="col_2", right_on="col_a")
when(col("col_3").is_not_null().then(left_on="col_3", right_on="col_b")
otherwise(do_nothing)
)
Expected result :
shape: (3, 4)
┌───────┬───────┬───────┬─────────────────┐
│ col_1 ┆ col_2 ┆ col_3 ┆ col_c │
│ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str │
╞═══════╪═══════╪═══════╪═════════════════╡
│ a ┆ d ┆ null ┆ relevant_info_1 │
│ b ┆ null ┆ e ┆ relevant_info_2 │
│ c ┆ null ┆ null ┆ null │
└───────┴───────┴───────┴─────────────────┘
One way to do it is to first make the two joins inner separately.
Then we can filter LF1
to select only rows with col_2
and col_3
being both None
. Finally, we can concatenate the three pl.LazyFrame
to obtain final result.
That solution uses only 2 joins instead of 3 for the solution provided by @robertdj. Thus might explain the faster execution of following foo1()
.
import polars as pl
LF1 = pl.LazyFrame(
data={
"col_1": ["a", "b", "c"],
"col_2": ["d", None, None],
"col_3": [None, "e", None],
},
)
LF2 = pl.LazyFrame(
data={
"col_a": ["d", "xyz"],
"col_b": ["xyz", "e"],
"col_c": ["relevant_info_1", "relevant_info_2"],
},
)
def foo1() -> pl.DataFrame:
lf_joined_on_col_2 = (
LF1.join(other=LF2, left_on=["col_2"], right_on=["col_a"])
.with_columns(pl.lit(None).cast(pl.String).alias("col_a"))
.select("col_1", "col_2", "col_3", "col_c")
)
lf_joined_on_col_3 = (
LF1.join(other=LF2, left_on=["col_3"], right_on=["col_b"])
.with_columns(pl.lit(None).cast(pl.String).alias("col_b"))
.select("col_1", "col_2", "col_3", "col_c")
)
lf_rows_with_null_on_col_2_and_col_3 = LF1.filter(
pl.col("col_2").is_null() & pl.col("col_3").is_null()
).with_columns(pl.lit(None).cast(pl.String).alias("col_c"))
return pl.concat(
[lf_joined_on_col_2, lf_joined_on_col_3, lf_rows_with_null_on_col_2_and_col_3]
).collect()
def foo2() -> pl.DataFrame:
lf_joined_on_col_2 = LF1.join(
other=LF2, left_on=["col_2"], right_on=["col_a"], how="left"
)
lf_joined_on_col_3 = LF1.join(
other=LF2, left_on=["col_3"], right_on=["col_b"], how="left"
)
return (
lf_joined_on_col_2.join(
lf_joined_on_col_3.select("col_1", "col_c"), on=["col_1"]
)
.with_columns(pl.coalesce("col_c", "col_c_right").alias("col_c"))
.select("col_1", "col_2", "col_3", "col_c")
).collect()
Solution foo2()
provided by @robertdj is slightly slower than my solution foo1()
:
> python -m timeit -n 1000 -s "import test1" "test1.foo1()"
1000 loops, best of 5: 4.76 msec per loop
> python -m timeit -n 1000 -s "import test1" "test1.foo2()"
1000 loops, best of 5: 6.57 msec per loop