Tags: python, python-polars, delta-lake, delta

DeltaTable map type


Using Spark, I can create a Delta table with a map column type: MAP<STRING, TIMESTAMP>.
How do I create a Delta table with a map type without Spark?
I have tried multiple approaches and none of them work.

import pyarrow as pa
from deltalake import write_deltalake

# Create a sample Arrow Table with a map type
data = {
    "id": pa.array([1, 2, 3]),
    "name": pa.array(["Alice", "Bob", "Charlie"]),
    "attributes": pa.array([
        pa.array([("age", 30)], type=pa.map_(pa.string(), pa.int32())),
        pa.array([("age", 25)], type=pa.map_(pa.string(), pa.int32())),
        pa.array([("age", 35)], type=pa.map_(pa.string(), pa.int32())),
    ])
}

# Create an Arrow Table
table = pa.Table.from_pydict(data)

# Define the path where the Delta table will be stored
delta_table_path = "./tmp/delta_map"

# Write the Arrow Table to a Delta table
write_deltalake(delta_table_path, data=table, mode="overwrite")

pyarrow throws: pyarrow.lib.ArrowTypeError: Could not convert 'a' with type str: was expecting tuple of (key, value) pair
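
For reference, the ArrowTypeError comes from the shape of the data: with a map type, pa.array expects a list of rows where each row is itself a list of (key, value) tuples (or a dict), so the per-row pa.array calls above are missing one level of nesting. A minimal sketch of the construction pyarrow expects (this only covers building the Arrow table, not the Delta write itself; see the solution below for that):

import pyarrow as pa

# One pa.array call for the whole column: each row is a list of
# (key, value) tuples (a dict per row also works)
attributes = pa.array(
    [[("age", 30)], [("age", 25)], [("age", 35)]],
    type=pa.map_(pa.string(), pa.int32()),
)

table = pa.table({
    "id": pa.array([1, 2, 3]),
    "name": pa.array(["Alice", "Bob", "Charlie"]),
    "attributes": attributes,
})
print(table.schema)  # the attributes field shows up as map<string, int32>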

from deltalake import Schema, Field, DeltaTable, WriterProperties, write_deltalake
from deltalake.schema import PrimitiveType, MapType

# Define the schema for the Delta table
schema = Schema([
    Field("id", PrimitiveType("string")),
    Field("data", MapType("integer", "string", value_contains_null=False))
])

# Create a list of data to write to the Delta table
data = [
    {"id": "1", "data": {"key1": "value1", "key2": "value2"}},
    {"id": "2", "data": {"key3": "value3", "key4": "value4"}}
]

# Create a Delta table
delta_table = write_deltalake(
    table_or_uri="./tmp/delta_map",
    data=data,
    schema=schema,
    mode="append",
    writer_properties=WriterProperties(compression="ZSTD")
)

# Write the data to the Delta table
delta_table.write_data(data)

deltalake throws: NotImplementedError: ArrowSchemaConversionMode.passthrough is not implemented to work with DeltaSchema, skip passing a schema or pass an arrow schema

Thanks!


Solution

  • To create a Delta table with a map column:

    import pyarrow as pa
    from deltalake import write_deltalake

    table_path = "./tmp/my_table"

    # The map column is just a Python dict in each row
    payload = [{"id": 1, "account_id": {17: "100.01.001 Cash"}}]

    # Explicit Arrow schema so the dict column becomes a map type
    schema = pa.schema([
        pa.field("id", pa.int32()),
        pa.field("account_id", pa.map_(pa.int32(), pa.string())),
    ])
    table = pa.Table.from_pylist(payload, schema)

    write_deltalake(
        table_path, table, mode="overwrite",
        predicate="id = '1'", engine="rust",
    )
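
    To sanity-check that the map type actually landed in the Delta schema, you can open the table with DeltaTable and print its schema (the exact rendering of the schema string varies across deltalake versions):

    from deltalake import DeltaTable

    dt = DeltaTable(table_path)
    # account_id should appear as a map of int -> string in the table schema
    print(dt.schema())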

    To read the Delta table back (map column included), e.g. with DuckDB's delta_scan:

    import duckdb
    table_path = "./tmp/my_table"
    con = duckdb.connect()
    df = con.sql(f"""SELECT * FROM delta_scan('{table_path}');""")
    print(df)
    con.close()
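
    If the query result is needed as a Python object rather than just printed, the relation can be materialized, for example via the relation's .arrow() (or .df()) method; a small usage sketch:

    import duckdb

    table_path = "./tmp/my_table"
    con = duckdb.connect()
    # Materialize the scan as a pyarrow Table instead of printing the relation;
    # the map column should come back as an Arrow map type
    arrow_table = con.sql(f"""SELECT * FROM delta_scan('{table_path}');""").arrow()
    print(arrow_table.schema)
    con.close()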