rust peek rust-polars

Peek at the next value in a rust-polars LazyFrame column while still working on the current one


I guess "peeking ahead in a LazyFrame column" is a conceptual oxymoron ... maybe one of you can enlighten me on how best to do it.

I want to put the result of this for each date into a new column:

Ok( (next_weekday_number - current_weekday_number) == 1 )

Here is the sample code to help me find an answer:

// PLEASE remember to add the needed feature flags to your Cargo.toml file

use polars::export::arrow::temporal_conversions::date32_to_date;
use polars::prelude::*;

/// Builds a small frame of date strings, parses them into a `Date` column,
/// derives a numeric weekday column (`0` = Sunday … `6` = Saturday, as produced
/// by the `%w` format specifier) and prints the collected result.
///
/// The open question is shown as pseudo code near the end: how to compare each
/// row's weekday number with the weekday number of the following row.
///
/// # Errors
///
/// Returns an error if the sample `DataFrame` cannot be constructed.
fn main() -> Result<()> {
    let days = df!(
        "date_string" => &["1900-01-01", "1900-01-02", "1900-01-03", "1900-01-04", "1900-01-05",
        "1900-01-06", "1900-01-07", "1900-01-09", "1900-01-10"])?;

    let options = StrpTimeOptions {
        date_dtype: DataType::Date,   // the result column-datatype
        fmt: Some("%Y-%m-%d".into()), // the source format of the date-string
        strict: false,
        exact: true,
    };

    // convert date_string into dtype(date) and put into new column "date_type"
    // we convert the days DataFrame to a LazyFrame ...
    // because in my real-world example I am getting a LazyFrame
    let mut new_days = days.lazy().with_column(
        col("date_string")
            .alias("date_type")
            .str()
            .strptime(options),
    );

    // This is what I wanted to do ... but I get a string result .. need u32
    // let o = GetOutput::from_type(DataType::Date);
    // new_days = new_days.with_column(
    //     col("date_type")
    //         .alias("weekday_number")
    //         .map(|x| Ok(x.strftime("%w").unwrap()), o.clone()),
    // );

    // This is the convoluted workaround: format every date with `%w` and parse
    // the single digit back into a number.
    //
    // The closure produces a UInt32 column, so UInt32 is the output dtype that
    // has to be declared to the lazy engine; declaring `Date` would make the
    // planned schema disagree with the data that is actually produced.
    new_days = new_days.with_column(col("date_type").alias("weekday_number").map(
        |x| {
            // `?` propagates a dtype mismatch as an error instead of panicking.
            Ok(x.date()?
                .into_iter()
                .map(|opt_day: Option<i32>| {
                    opt_day.map(|days_since_epoch: i32| {
                        date32_to_date(days_since_epoch)
                            .format("%w")
                            .to_string()
                            .parse::<u32>()
                            .expect("`%w` always renders a single digit 0-6")
                    })
                })
                .collect::<UInt32Chunked>()
                .into_series())
        },
        GetOutput::from_type(DataType::UInt32),
    ));

    // Here is where my challenge is ..
    // I need to get the weekday_number of the following day to determine a condition
    // my pseudo code:
    // new_days = new_days.with_column(
    //     col("weekday_number")
    //         .alias("cold_day")
    //         .map(
    //             |x| Ok( (next_weekday_number - current_weekday_number) == 1 ),
    //             GetOutput::from_type(DataType::Boolean),
    //         ),
    // );

    // `collect` consumes the LazyFrame; the `Result` itself is printed so a
    // failure while executing the plan is shown rather than aborting.
    println!("{:?}", new_days.collect());

    Ok(())
}

Solution

  • Ok, I could not find a way to do everything with a LazyFrame, thus I converted the LazyFrame to an eager DataFrame and was able to process two columns at the same time.

    So it's working for now. Maybe someone can help me realize a solution using only a LazyFrame.

    Here is the working code:

    use polars::export::arrow::temporal_conversions::date32_to_date;
    
    use polars::prelude::*;
    
    /// Parses a list of date strings, derives each date's weekday number
    /// (`0` = Sunday … `6` = Saturday, as produced by `%w`), "peeks" at the next
    /// row's weekday number via a shifted column, and adds a boolean column that
    /// is true where the difference between the two is exactly 2.
    ///
    /// Everything up to and including the shifted column is done lazily; the
    /// final difference/comparison is done on the collected `DataFrame`.
    ///
    /// # Errors
    ///
    /// Returns an error if the frame cannot be built, the lazy plan fails to
    /// execute, a column lookup or cast fails, or the new column cannot be added.
    fn main() -> Result<()> {
        let days = df!(
            "date_string" => &["1900-01-01", "1900-01-02", "1900-01-03", "1900-01-04", "1900-01-05",
            "1900-01-06", "1900-01-07", "1900-01-09", "1900-01-10"])?;

        let options = StrpTimeOptions {
            date_dtype: DataType::Date,   // the result column-datatype
            fmt: Some("%Y-%m-%d".into()), // the source format of the date-string
            strict: false,
            exact: true,
        };

        // convert date_string into dtype(date) and put into new column "date_type"
        // we convert the days DataFrame to a LazyFrame ...
        // because in my real-world example I am getting a LazyFrame
        let mut new_days_lf = days.lazy().with_column(
            col("date_string")
                .alias("date_type")
                .str()
                .strptime(options),
        );

        // Getting the weekday as a number:
        // This is what I wanted to do ... but I get a string result .. need u32
        // let o = GetOutput::from_type(DataType::Date);
        // new_days_lf = new_days_lf.with_column(
        //     col("date_type")
        //         .alias("weekday_number")
        //         .map(|x| Ok(x.strftime("%w").unwrap()), o.clone()),
        // );

        // This is the convoluted workaround for getting the weekday as a number:
        // format every date with `%w` and parse the single digit back.
        //
        // The closure produces a UInt32 column, so UInt32 is the output dtype
        // that has to be declared to the lazy engine; declaring `Date` would
        // make the planned schema disagree with the data actually produced.
        new_days_lf = new_days_lf.with_column(col("date_type").alias("weekday_number").map(
            |x| {
                // `?` propagates a dtype mismatch as an error instead of panicking.
                Ok(x.date()?
                    .into_iter()
                    .map(|opt_day: Option<i32>| {
                        opt_day.map(|days_since_epoch: i32| {
                            date32_to_date(days_since_epoch)
                                .format("%w")
                                .to_string()
                                .parse::<u32>()
                                .expect("`%w` always renders a single digit 0-6")
                        })
                    })
                    .collect::<UInt32Chunked>()
                    .into_series())
            },
            GetOutput::from_type(DataType::UInt32),
        ));

        // The "peek" ==> add a shifted column.
        // Shifting by -1 moves every value one row up; the last row has no
        // successor and is filled with 9999.
        new_days_lf = new_days_lf.with_column(
            col("weekday_number")
                .shift_and_fill(-1, 9999)
                .alias("next_weekday_number"),
        );

        // now we convert the LazyFrame into a normal DataFrame for further processing:
        let mut new_days_df = new_days_lf.collect()?;

        // Fetch both columns by name (this needs the collected DataFrame) and
        // cast them to a signed type before subtracting: the weekday numbers are
        // unsigned, and e.g. Saturday (6) followed by Sunday (0) would underflow
        // in unsigned arithmetic.
        let current = new_days_df
            .column("weekday_number")?
            .cast(&DataType::Int64)?;
        let next = new_days_df
            .column("next_weekday_number")?
            .cast(&DataType::Int64)?;

        // now I can use series-arithmetics
        let diff = &next - &current;

        // create a bool column based on "element == 2"
        // add bool column to DataFrame
        new_days_df.replace_or_add("weekday diff eq(2)", diff.equal(2)?.into_series())?;

        println!("{:?}", new_days_df);

        Ok(())
    }