rustdtype

rust, polars, polars Decimal type in scheme converts to f64


I have a CSV file with the following format:

2008-03-19 10:15:00,69.00,56.37,59.50,59.50,34803018,34793218,
2008-03-19 10:16:00,64.00,58.05,59.50,59.50,38839940,4024922,

I want to read this file with dtypes Datetime, Decimal, Decimal, Decimal, Decimal, UInt64, UInt64

This code works:

    // Build the schema one column at a time; the first argument of
    // `insert_at_index` is the position of the column within the schema.
    let mut scheme = pl::Schema::new();
    scheme
        .insert_at_index(
            0,
            "timestamp".into(),
            pl::DataType::Datetime(pl::TimeUnit::Milliseconds, None),
        )
        .unwrap();
    // The four price columns share the same Decimal dtype (shown as
    // `decimal[12,4]` in the resulting frame).
    scheme
        .insert_at_index(1, "open".into(), pl::DataType::Decimal(Some(12), Some(4)))
        .unwrap();
    scheme
        .insert_at_index(2, "high".into(), pl::DataType::Decimal(Some(12), Some(4)))
        .unwrap();
    scheme
        .insert_at_index(3, "low".into(), pl::DataType::Decimal(Some(12), Some(4)))
        .unwrap();
    scheme
        .insert_at_index(4, "close".into(), pl::DataType::Decimal(Some(12), Some(4)))
        .unwrap();
    // Both counters are UInt64, matching the intended dtypes and the `u64`
    // columns reported in the result.
    scheme
        .insert_at_index(5, "volume".into(), pl::DataType::UInt64)
        .unwrap();
    scheme
        .insert_at_index(6, "open_interest".into(), pl::DataType::UInt64)
        .unwrap();

    // Read the CSV with the explicit schema. `visa` is still a `Result` here;
    // the caller decides how to handle a read failure.
    let visa = pl::CsvReader::from_path("src/data/V.csv")
        .expect("expected path to the csv file")
        .with_separator(b',')
        .truncate_ragged_lines(true)
        .with_n_threads(Some(2))
        .with_dtypes(Some(Arc::new(scheme)))
        .with_skip_rows(1_600_000)
        .finish();

And I get the following result dtypes:

│ datetime[ms] ┆ decimal[12,4] ┆ decimal[12,4] ┆ decimal[12,4] ┆ decimal[12,4] ┆ u64 ┆ u64│

But I thought this was inconvenient and tried a different approach:

    // Describe every column as a standalone `Field` (name + dtype).
    let f1 = pl::Field::new(
        "timestamp",
        pl::DataType::Datetime(pl::TimeUnit::Milliseconds, None),
    );
    let f2 = pl::Field::new("open", pl::DataType::Decimal(Some(12), Some(4)));
    // The field itself still reports the Decimal dtype at this point.
    println!("{}", f2.data_type()); // ===> decimal[12,4]
    let f3 = pl::Field::new("high", pl::DataType::Decimal(Some(12), Some(4)));
    let f4 = pl::Field::new("low", pl::DataType::Decimal(Some(12), Some(4)));
    let f5 = pl::Field::new("close", pl::DataType::Decimal(Some(12), Some(4)));
    let f6 = pl::Field::new("volume", pl::DataType::UInt64);
    let f7 = pl::Field::new("open_interest", pl::DataType::UInt64);
    // Collect the fields into a schema in column order.
    let sc = pl::Schema::from_iter(vec![f1, f2, f3, f4, f5, f6, f7]);
    // After going through `Schema::from_iter`, the same column prints as f64.
    println!("{}", sc.get("open").unwrap()); // ===> f64????????? Why ????
    // Read the CSV with the schema built above; `visa` is a `Result`.
    let visa = pl::CsvReader::from_path("src/data/V.csv")
        .expect("expected path to the csv file")
        .with_separator(b',')
        .truncate_ragged_lines(true)
        .with_dtypes(Some(Arc::new(sc)))
        .with_skip_rows(1_600_000)
        .with_n_threads(Some(2))
        .finish();

    // Panics if the read failed; otherwise prints the resulting frame.
    println!("{:?}", visa.unwrap());

That seems much better, but it gives me incorrect dtypes for some reason...

│ datetime[ms] ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ u64 ┆ u64 │

Is it a bug, or did I miss something?


Solution

  • It's because polars converts Decimal to Float64 in Schema::from_iter if $POLARS_ACTIVATE_DECIMAL is not set to "1"

               #[cfg(feature = "dtype-decimal")]
               let fld = match fld.dtype {
                   DataType::Decimal(_, _) => {
                       if crate::config::decimal_is_active() {
                           fld
                       } else {
                           let mut fld = fld.clone();
                           fld.coerce(DataType::Float64);
                           fld
                       }
                   },
                   _ => fld,
               };
    

    You can either run with an appropriate environment, for example with something like `POLARS_ACTIVATE_DECIMAL=1 your_executable`, or set that environment variable from your code (see footnote 1) to preserve the datatype:

    use polars::prelude::*;
    // Minimal reproduction showing that a Decimal field survives
    // `Schema::from_iter` once decimal support is switched on.
    fn main() {
        // Must happen before the schema is built: `Schema::from_iter` keeps a
        // Decimal field only when this variable is "1" and otherwise coerces
        // it to Float64.
        std::env::set_var("POLARS_ACTIVATE_DECIMAL", "1");
    
        let open = Field::new("open", DataType::Decimal(Some(1), Some(1)));
        let sc = Schema::from_iter(vec![open]);
        // Print the stored field to confirm its dtype is still Decimal.
        dbg!(sc.get_field("open"));
    }
    

    outputs

    sc.get_field("open") = Some(
        Field {
            name: "open",
            dtype: Decimal(
                Some(
                    1,
                ),
                Some(
                    1,
                ),
            ),
        },
    )
    

    as expected.


    1: Overriding the environment from inside the program like this is likely to surprise the user, and is therefore discouraged.