python-polars

Storing Duration in spreadsheet


What is the ideal way to store duration datatype of polars in a spreadsheet, as it is getting converted on its own, and I do not get the data like “1d 2m” when I export to spreadsheet?

Convert

1d 2m 2h 1m

To either a string or a time format


Solution

  • Update: .dt.to_string() support for Duration types was added in Polars 1.14.0

    df.with_columns(
        pl.col("duration").dt.to_string(format="polars")
          .name.suffix("_str")
    )
    
    shape: (1, 2)
    ┌───────────────────┬───────────────────┐
    │ duration          ┆ duration_str      │
    │ ---               ┆ ---               │
    │ duration[μs]      ┆ str               │
    ╞═══════════════════╪═══════════════════╡
    │ 1d 2m 3s 987654µs ┆ 1d 2m 3s 987654µs │
    └───────────────────┴───────────────────┘
    

    Original answer

    There is an open feature request for keeping the Duration format when casted to String:

    df = pl.select(duration = pl.duration(days=1, seconds=123, microseconds=987654))
    
    df.with_columns(duration_string = pl.col("duration").cast(str))
    
    shape: (1, 2)
    ┌───────────────────┬─────────────────┐
    │ duration          ┆ duration_string │
    │ ---               ┆ ---             │
    │ duration[μs]      ┆ str             │
    ╞═══════════════════╪═════════════════╡
    │ 1d 2m 3s 987654µs ┆ 86523987654     │
    └───────────────────┴─────────────────┘
    

    The code for the frame repr formatting is here:

    It is possible to attempt to port it to Polars expressions.

    Probably not something one would use, but may serve as an interesting example of what is possible with expressions.

    def duration_to_string(col, time_unit="us"):
        names = ["d", "h", "m", "s"]
        
        sizes = {
            "us": [864e8,  36e8,  6e7,  1e6],
            "ns": [864e11, 36e11, 6e10, 1e9],
            "ms": [864e5,  36e5,  6e4,  1e3]
        }[time_unit]
        
        remainder = {
            "us": 
                (pl.when(col % 1e3 != 0).then(pl.format("{}µs", (col % 1e6).cast(int)))
                   .when(col % 1e6 != 0).then(pl.format("{}ms", ((col % 1e6) / 1e3).cast(int)))),
            "ns": 
                (pl.when(col % 1e3 != 0).then(pl.format("{}ns", (col % 1e9).cast(int)))
                   .when(col % 1e6 != 0).then(pl.format("{}µs", ((col % 1e9) / 1e3).cast(int)))
                   .when(col % 1e9 != 0).then(pl.format("{}ms", ((col % 1e9) / 1e6).cast(int)))),
            "ms":
                (pl.when(col % 1e3 != 0).then(pl.format("{}ms", (col % 1e3).cast(int))))
        }[time_unit]
    
        blank, space = pl.lit(""), pl.lit(" ")
        
        def build_fmt(name, whole_num, size):
            valid = (whole_num <= -1) | (whole_num >= 1)
            pad = pl.when(col % size != 0).then(space).otherwise(blank)
            fmt = pl.format("{}{}{}", whole_num.cast(int), pl.lit(name), space)
            return pl.when(valid).then(fmt).otherwise(blank)
            
        size = sizes[0]
        whole_num = col / sizes[0]
        expr = build_fmt(names[0], whole_num, size)
        
        for i in range(1, 4):
            size = sizes[i]
            whole_num = (col % sizes[i - 1]) / size
            expr += build_fmt(names[i], whole_num, size)
        
        expr += remainder.otherwise(blank)
                     
        return (
             pl.when(col == 0)
               .then(pl.lit("0" + time_unit.replace("u", "µ")))
               .otherwise(expr)
        )
    

    And call it with Expr.pipe()

    df.with_columns(
       pl.col("duration").pipe(duration_to_string, time_unit=df.schema["duration"].time_unit)
         .name.suffix("_string")
    )
    
    shape: (1, 2)
    ┌───────────────────┬───────────────────┐
    │ duration          ┆ duration_string   │
    │ ---               ┆ ---               │
    │ duration[μs]      ┆ str               │
    ╞═══════════════════╪═══════════════════╡
    │ 1d 2m 3s 987654µs ┆ 1d 2m 3s 987654µs │
    └───────────────────┴───────────────────┘