I want to efficiently find, for each row, the distance from the current row to the previous occurrence of the same value in a column (column `b` in the example below). I know polars doesn't have indexes, but the formula would roughly be:
if prior_occurrence {
(current_row_index - prior_occurrence_index - 1)
} else {
-1
}
This is the input dataframe:
// Sample input: an integer column `a` and a string column `b`.
let df_a = df!(
    "a" => [1, 2, 2, 1, 4, 1],
    "b" => ["c", "a", "b", "c", "c", "a"]
)
.unwrap();
// Print the frame so the input table can be inspected.
println!("{df_a}");
┌─────┬─────┐
│ a ┆ b │
│ --- ┆ --- │
│ i32 ┆ str │
╞═════╪═════╡
│ 1 ┆ c │
│ 2 ┆ a │
│ 2 ┆ b │
│ 1 ┆ c │
│ 4 ┆ c │
│ 1 ┆ a │
└─────┴─────┘
Wanted output:
┌─────┬─────┬────────┐
│ a ┆ b ┆ b_dist │
│ --- ┆ --- ┆ --- │
│ i32 ┆ str ┆ i32 │
╞═════╪═════╪════════╡
│ 1 ┆ c ┆ -1 │
│ 2 ┆ a ┆ -1 │
│ 2 ┆ b ┆ -1 │
│ 1 ┆ c ┆ 2 │
│ 4 ┆ c ┆ 0 │
│ 1 ┆ a ┆ 3 │
└─────┴─────┴────────┘
What's the most efficient way to go about this?
# Gap between each row and the previous row of the same "a" group.
# The first row of a group has no predecessor (null after the shift),
# which is filled with 0 and then turned into -1 by the subtraction.
row_idx = pl.col("index")
gap_to_previous = (row_idx - row_idx.shift(1)).cast(pl.Int32).fill_null(0) - 1
df.with_row_index().with_columns(
    gap_to_previous.over("a").alias("a_distance_to_a")
)
fn func1() -> PolarsResult<()> {
let df_a = df![
"a" => [1, 2, 2, 1, 4, 1],
"b" => ["c","a", "b", "c", "c","a"]
]?;
let out = df_a
.lazy()
.with_row_count("idx", None)
.with_columns([((col("idx") - col("idx").shift(1))
.cast(DataType::Int32)
.fill_null(0)
- lit(1))
.over("a")
.alias("a_distance_to_a")])
.collect()?;
Ok(())
shape: (6, 4)
┌─────┬─────┬─────┬─────────────────┐
│ idx ┆ a ┆ b ┆ a_distance_to_a │
│ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ i64 ┆ str ┆ i32 │
╞═════╪═════╪═════╪═════════════════╡
│ 0 ┆ 1 ┆ c ┆ -1 │
│ 1 ┆ 2 ┆ a ┆ -1 │
│ 2 ┆ 2 ┆ b ┆ 0 │
│ 3 ┆ 1 ┆ c ┆ 2 │
│ 4 ┆ 4 ┆ c ┆ -1 │
│ 5 ┆ 1 ┆ a ┆ 1 │
└─────┴─────┴─────┴─────────────────┘