pythonpython-polars

python-polars: string columns to lists to integer columns


I am trying to convert a TSV file in BED12 format to a polars data frame. I got two columns encoded as strings containing coma separated integers. My solution (simplified) involves going through structs:

df = pl.DataFrame(
    {
        "chrom": ["1", "1", "2", "X"],
        "blockSizes": ["10,29,", "20,22,", "30,25,", "40,23,"],
        "blockStarts": ["0,50,", "0,45,", "0,60,", "0,70,"]
    })

df.with_columns(pl.col("blockSizes").str.split(by=",")
.list.slice(0, 2)
.list.to_struct()
).unnest("blockSizes").with_columns(
    pl.col("field_0").cast(pl.Int32).alias("blockSizes_0"),
    pl.col("field_1").cast(pl.Int32).alias("blockSizes_1"),
).drop("field_0", "field_1")

Obviously I can just continue with doing the same with blockStarts column but I hope there is some simpler way to do it.


Solution

  • The current approach could be modified:

    cols = "blockSizes", "blockStarts"
    
    (df.with_columns(
        pl.col(col)
          .str.strip_chars(",")
          .str.split(",")
          .cast(pl.List(pl.Int32))
          .list.to_struct(
             n_field_strategy = "max_width", 
             fields = lambda idx, col=col: f"{col}_{idx}"
          )
        for col in cols
     )
     .unnest(cols)
    )
    
    shape: (4, 5)
    ┌───────┬──────────────┬──────────────┬───────────────┬───────────────┐
    │ chrom ┆ blockSizes_0 ┆ blockSizes_1 ┆ blockStarts_0 ┆ blockStarts_1 │
    │ ---   ┆ ---          ┆ ---          ┆ ---           ┆ ---           │
    │ str   ┆ i32          ┆ i32          ┆ i32           ┆ i32           │
    ╞═══════╪══════════════╪══════════════╪═══════════════╪═══════════════╡
    │ 1     ┆ 10           ┆ 29           ┆ 0             ┆ 50            │
    │ 1     ┆ 20           ┆ 22           ┆ 0             ┆ 45            │
    │ 2     ┆ 30           ┆ 25           ┆ 0             ┆ 60            │
    │ X     ┆ 40           ┆ 23           ┆ 0             ┆ 70            │
    └───────┴──────────────┴──────────────┴───────────────┴───────────────┘
    

    Perhaps creating a function would neaten things up.

    def csv_to_struct(col):
        expr = pl.col(col).str.strip_chars(",").str.split(",")
        expr = expr.cast(pl.List(pl.Int32))
        return expr.list.to_struct(
            n_field_strategy = "max_width", 
            fields = lambda idx: f"{col}_{idx}"
        )
    
    cols = "blockSizes", "blockStarts"
    
    df.with_columns(map(csv_to_struct, cols)).unnest(cols)
    

    unpivot + pivot

    Another approach could be to .unpivot() - generate the column names, and .pivot() back.

    cols = "blockSizes", "blockStarts"
    
    (df.with_row_index()
       .with_columns(
          pl.col(cols).str.strip_chars(",").str.split(",").cast(pl.List(int))
       )
       .explode(cols)
       .unpivot(index=["index", "chrom"], variable_name="name")
       .with_columns(
          pl.format("{}_{}", 
             "name", 
             pl.col("index").cum_count().over("index", "name") - 1
          )
       )
       .pivot(on="name", index=["index", "chrom"])
    )
    
    shape: (4, 6)
    ┌───────┬───────┬──────────────┬──────────────┬───────────────┬───────────────┐
    │ index ┆ chrom ┆ blockSizes_0 ┆ blockSizes_1 ┆ blockStarts_0 ┆ blockStarts_1 │
    │ ---   ┆ ---   ┆ ---          ┆ ---          ┆ ---           ┆ ---           │
    │ u32   ┆ str   ┆ i64          ┆ i64          ┆ i64           ┆ i64           │
    ╞═══════╪═══════╪══════════════╪══════════════╪═══════════════╪═══════════════╡
    │ 0     ┆ 1     ┆ 10           ┆ 29           ┆ 0             ┆ 50            │
    │ 1     ┆ 1     ┆ 20           ┆ 22           ┆ 0             ┆ 45            │
    │ 2     ┆ 2     ┆ 30           ┆ 25           ┆ 0             ┆ 60            │
    │ 3     ┆ X     ┆ 40           ┆ 23           ┆ 0             ┆ 70            │
    └───────┴───────┴──────────────┴──────────────┴───────────────┴───────────────┘