I find myself doing this:
import polars as pl
import sys
red_data = pl.DataFrame(
[
pl.Series("id", [0, 1, 2], dtype=pl.UInt8()),
pl.Series("red_data", [1, 0, 1], dtype=pl.UInt8()),
]
)
blue_data = pl.DataFrame(
[
pl.Series("id", [0, 2, 3], dtype=pl.UInt8()),
pl.Series("blue_data", [0, 1, 1], dtype=pl.UInt8()),
]
)
# in both red and blue
red_and_blue = red_data.join(blue_data, on=["id"])
# in red, but not blue
red_not_blue = red_data.join(blue_data, on=["id"], how="anti").with_columns(
blue_data=pl.lit(None, dtype=pl.UInt8())
)
# in blue, but not red
blue_not_red = blue_data.join(red_data, on=["id"], how="anti").with_columns(
red_data=pl.lit(None, dtype=pl.UInt8())
)
columns = ["id", "red_data", "blue_data"]
sys.displayhook(
pl.concat(
[
red_and_blue.select(columns),
red_not_blue.select(columns),
blue_not_red.select(columns),
]
)
)
shape: (4, 3)
┌─────┬──────────┬───────────┐
│ id ┆ red_data ┆ blue_data │
│ --- ┆ --- ┆ --- │
│ u8 ┆ u8 ┆ u8 │
╞═════╪══════════╪═══════════╡
│ 0 ┆ 1 ┆ 0 │
│ 2 ┆ 1 ┆ 1 │
│ 1 ┆ 0 ┆ null │
│ 3 ┆ null ┆ 1 │
└─────┴──────────┴───────────┘
It seems like you are looking for a simple pl.DataFrame.join
with how="full"
and coalesce=True
.
red_data.join(blue_data, on="id", how="full", coalesce=True)
shape: (4, 3)
┌─────┬──────────┬───────────┐
│ id ┆ red_data ┆ blue_data │
│ --- ┆ --- ┆ --- │
│ u8 ┆ u8 ┆ u8 │
╞═════╪══════════╪═══════════╡
│ 0 ┆ 1 ┆ 0 │
│ 1 ┆ 0 ┆ null │
│ 2 ┆ 1 ┆ 1 │
│ 3 ┆ null ┆ 1 │
└─────┴──────────┴───────────┘