rust rust-polars

Transform JSON Key into a Polars DataFrame


I was wondering how to read a JSON file into a Polars DataFrame in Rust, building the frame from the contents of the "data" key. However, I believe the structure of my JSON file makes this hard to achieve.

Here is the structure of the JSON file, in which each entry contains a dataType.

{
  "data": [
    {
      "dataItemName": "TICKER",
      "result": [
        "AAPL",
        "MSFT",
        "TSLA"
      ],
      "dataType": "STRING",
      "error": 0
    },
    {
      "dataItemName": "SALES",
      "result": [ 
        259968,
        143015,
        24578
      ],
      "dataType": "DOUBLE",
      "error": 0
    },
    {
      "dataItemName": "CNAME",
      "result": [
        "Apple Inc.",
        "Microsoft Corporation",
        "Tesla Inc"
      ],
      "dataType": "STRING",
      "error": 0
    },
    {
      "dataItemName": "PRICE",
      "result": [
        115.98,
        214.22,
        430.83
      ],
      "dataType": "DOUBLE",
      "error": 0
    },
    {
      "dataItemName": "ASSETS",
      "result": [
        338516,
        301311,
        34309
      ],
      "dataType": "DOUBLE",
      "error": 0
    }
  ]
}

Here is what I have tried in Rust.

use polars::prelude::*;


/// Opens `data/test_merged.json`, feeds it to polars' `JsonReader`, and prints
/// the resulting DataFrame using its `Debug` representation.
///
/// Panics if the file cannot be opened or the reader fails.
fn main() {
    let reader = JsonReader::new(std::fs::File::open("data/test_merged.json").unwrap());
    let frame = reader.finish().unwrap();
    println!("{:?}", frame);
}

Here is an example of the Rust output, which is a DataFrame with a single column and a single row:

shape: (1, 1)
┌───────────────────────────────────┐
│ data                              │
│ ---                               │
│ list[struct[63]]                  │
╞═══════════════════════════════════╡
│ [{0.0,0.530558,3.38631,"2023-06-… │
└───────────────────────────────────┘

There are only 3 data types: strings, floats, and integers.

Here is a similar question for the Python version: "transform json to polars dataframe".


Solution

  • If performance is important, then @BallpointBen's version is not the fastest you can get; here's a more performant version:

    pub fn convert(json: &str) -> Result<DataFrame, Box<dyn Error>> {
        use serde::Deserialize;
    
        #[derive(Debug, Deserialize)]
        #[serde(untagged)]
        enum Values {
            String(Vec<String>),
            Double(Vec<f64>),
        }
    
        #[derive(Debug, Deserialize)]
        #[serde(rename_all = "UPPERCASE")]
        enum DataType {
            String,
            Double,
        }
    
        #[derive(Debug, Deserialize)]
        #[serde(rename_all = "camelCase")]
        struct Column {
            data_item_name: String,
            result: Values,
            data_type: DataType,
        }
    
        #[derive(Debug, Deserialize)]
        struct Data {
            data: Vec<Column>,
        }
    
        let data = serde_json::from_str::<Data>(json)?;
        let df = data
            .data
            .into_iter()
            .map(|column| match column.data_type {
                DataType::String => {
                    let Values::String(values) = column.result else {
                        return Err("column type mismatch");
                    };
                    Ok(Series::new(&column.data_item_name, values))
                }
                DataType::Double => {
                    let Values::Double(values) = column.result else {
                        return Err("column type mismatch");
                    };
                    Ok(Series::from_vec(&column.data_item_name, values))
                }
            })
            .collect::<Result<DataFrame, _>>()?;
    
        Ok(df)
    }
    

    Benchmark with 1,000 random entries:

    BallpointBen            time:   [338.41 µs 340.05 µs 341.85 µs]
    Found 2 outliers among 100 measurements (2.00%)
      2 (2.00%) high mild
    
    Mine                    time:   [195.82 µs 196.79 µs 197.95 µs]
    Found 11 outliers among 100 measurements (11.00%)
      8 (8.00%) high mild
      3 (3.00%) high severe