I am trying to convert a TSV file in BED12 format to a polars DataFrame. I have two columns encoded as strings containing comma-separated integers. My solution (simplified) involves going through structs:
import polars as pl

df = pl.DataFrame({
    "chrom": ["1", "1", "2", "X"],
    "blockSizes": ["10,29,", "20,22,", "30,25,", "40,23,"],
    "blockStarts": ["0,50,", "0,45,", "0,60,", "0,70,"],
})
df.with_columns(
    pl.col("blockSizes")
    .str.split(by=",")
    .list.slice(0, 2)
    .list.to_struct()
).unnest("blockSizes").with_columns(
    pl.col("field_0").cast(pl.Int32).alias("blockSizes_0"),
    pl.col("field_1").cast(pl.Int32).alias("blockSizes_1"),
).drop("field_0", "field_1")
Obviously I could just repeat the same steps for the blockStarts column, but I hope there is a simpler way to do it.
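For reference, the simplified frame above stands in for the real file; reading the BED12 TSV itself would look something like this (the filename is a placeholder, and the column names are the standard BED12 ones):

import polars as pl

bed12_cols = [
    "chrom", "chromStart", "chromEnd", "name", "score", "strand",
    "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "blockStarts",
]

df = pl.read_csv(
    "transcripts.bed",       # placeholder path
    separator="\t",
    has_header=False,        # BED files have no header row
    new_columns=bed12_cols,
)
# blockSizes and blockStarts are read as plain strings here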
The current approach could be modified with:

- .str.strip_chars() to remove the trailing comma
- .cast(pl.List(pl.Int32)) to cast everything at once
- .list.to_struct(fields=) to rename the fields "dynamically"

cols = "blockSizes", "blockStarts"
(df.with_columns(
    pl.col(col)
    .str.strip_chars(",")
    .str.split(",")
    .cast(pl.List(pl.Int32))
    .list.to_struct(
        n_field_strategy="max_width",
        # col=col binds the loop variable so each lambda keeps its own column name
        fields=lambda idx, col=col: f"{col}_{idx}"
    )
    for col in cols
)
.unnest(cols)
)
shape: (4, 5)
┌───────┬──────────────┬──────────────┬───────────────┬───────────────┐
│ chrom ┆ blockSizes_0 ┆ blockSizes_1 ┆ blockStarts_0 ┆ blockStarts_1 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i32 ┆ i32 ┆ i32 ┆ i32 │
╞═══════╪══════════════╪══════════════╪═══════════════╪═══════════════╡
│ 1 ┆ 10 ┆ 29 ┆ 0 ┆ 50 │
│ 1 ┆ 20 ┆ 22 ┆ 0 ┆ 45 │
│ 2 ┆ 30 ┆ 25 ┆ 0 ┆ 60 │
│ X ┆ 40 ┆ 23 ┆ 0 ┆ 70 │
└───────┴──────────────┴──────────────┴───────────────┴───────────────┘
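If it helps to see the intermediate stage, the strip/split/cast part on its own already yields a clean list[i32] column; a standalone check (not needed for the final result) could be:

df.select(
    pl.col("blockSizes")
    .str.strip_chars(",")     # "10,29," -> "10,29"
    .str.split(",")           # -> ["10", "29"]
    .cast(pl.List(pl.Int32))  # -> [10, 29]
)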
Perhaps creating a function would neaten things up.
def csv_to_struct(col):
    expr = pl.col(col).str.strip_chars(",").str.split(",")
    expr = expr.cast(pl.List(pl.Int32))
    return expr.list.to_struct(
        n_field_strategy="max_width",
        fields=lambda idx: f"{col}_{idx}"
    )
cols = "blockSizes", "blockStarts"
df.with_columns(map(csv_to_struct, cols)).unnest(cols)
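This should give the same (4, 5) frame as above; .with_columns() accepts any iterable of expressions, so the map object can be passed in directly.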
Another approach could be to .unpivot(), generate the column names, and .pivot() back.
cols = "blockSizes", "blockStarts"
(df.with_row_index()
   .with_columns(
       pl.col(cols).str.strip_chars(",").str.split(",").cast(pl.List(int))
   )
   .explode(cols)
   .unpivot(index=["index", "chrom"], variable_name="name")
   .with_columns(
       # number the exploded values per row/column to build the _0, _1 suffixes
       pl.format("{}_{}",
           "name",
           pl.col("index").cum_count().over("index", "name") - 1
       )
   )
   .pivot(on="name", index=["index", "chrom"])
)
shape: (4, 6)
┌───────┬───────┬──────────────┬──────────────┬───────────────┬───────────────┐
│ index ┆ chrom ┆ blockSizes_0 ┆ blockSizes_1 ┆ blockStarts_0 ┆ blockStarts_1 │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ u32 ┆ str ┆ i64 ┆ i64 ┆ i64 ┆ i64 │
╞═══════╪═══════╪══════════════╪══════════════╪═══════════════╪═══════════════╡
│ 0 ┆ 1 ┆ 10 ┆ 29 ┆ 0 ┆ 50 │
│ 1 ┆ 1 ┆ 20 ┆ 22 ┆ 0 ┆ 45 │
│ 2 ┆ 2 ┆ 30 ┆ 25 ┆ 0 ┆ 60 │
│ 3 ┆ X ┆ 40 ┆ 23 ┆ 0 ┆ 70 │
└───────┴───────┴──────────────┴──────────────┴───────────────┴───────────────┘
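(The helper index column only exists to keep the rows distinct for the pivot; it can be dropped afterwards with .drop("index").)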