python dataframe geopandas python-polars pyarrow

GeoDataFrame conversion to polars with from_pandas fails with ArrowTypeError: Did not pass numpy.dtype object


I am trying to convert a GeoDataFrame to a polars DataFrame with from_pandas, so that I can continue working against the polars API. However, the conversion fails with an ArrowTypeError: Did not pass numpy.dtype object exception.

Expected outcome would be a polars DataFrame with the geometry column being typed as pl.Object.

I'm aware of https://github.com/geopolars/geopolars (alpha) and https://github.com/pola-rs/polars/issues/1830 and would be OK with the shapely objects just being represented as pl.Object for now.

Here is a minimal example to demonstrate the problem:

## Minimal example displaying the issue
# Print the version of each library involved so the report can be reproduced.
import geopandas as gpd
print("geopandas version: ", gpd.__version__)
import geodatasets
print("geodatasets version: ", geodatasets.__version__)
import polars as pl
print("polars version: ", pl.__version__)

# Load the "nybb" sample dataset shipped via geodatasets into a GeoDataFrame.
gdf = gpd.GeoDataFrame.from_file(geodatasets.get_path("nybb"))
print("\nOriginal GeoDataFrame")
print(gdf.dtypes)
print(gdf.head())

# Control case: with the geometry column dropped, the conversion succeeds.
print("\nGeoDataFrame to Polars without geometry")
print(pl.from_pandas(gdf.drop("geometry", axis=1)).head())

# Failing case 1: convert the full GeoDataFrame, geometry column included.
# The exception is caught and only its message is printed so the script continues.
try:
    print("\nGeoDataFrame to Polars naiive") 
    print(pl.from_pandas(gdf).head())
except Exception as e:
    print(e)

# Failing case 2: same conversion, but asking polars to type the geometry
# column as pl.Object via schema_overrides. It fails with the same message.
try:
    print("\nGeoDataFrame to Polars with schema override") 
    print(pl.from_pandas(gdf, schema_overrides={"geometry": pl.Object}).head())
except Exception as e:
    print(e)

# again to print stack trace
# (intentionally left unguarded so the full ArrowTypeError traceback is shown)
pl.from_pandas(gdf).head()

Output

geopandas version:  0.14.4
geodatasets version:  2023.12.0
polars version:  0.20.23

Original GeoDataFrame
BoroCode         int64
BoroName        object
Shape_Leng     float64
Shape_Area     float64
geometry      geometry
dtype: object
   BoroCode       BoroName     Shape_Leng    Shape_Area  \
0         5  Staten Island  330470.010332  1.623820e+09   
1         4         Queens  896344.047763  3.045213e+09   
2         3       Brooklyn  741080.523166  1.937479e+09   
3         1      Manhattan  359299.096471  6.364715e+08   
4         2          Bronx  464392.991824  1.186925e+09   

                                            geometry  
0  MULTIPOLYGON (((970217.022 145643.332, 970227....  
1  MULTIPOLYGON (((1029606.077 156073.814, 102957...  
2  MULTIPOLYGON (((1021176.479 151374.797, 102100...  
3  MULTIPOLYGON (((981219.056 188655.316, 980940....  
4  MULTIPOLYGON (((1012821.806 229228.265, 101278...  

GeoDataFrame to Polars without geometry
shape: (5, 4)
┌──────────┬───────────────┬───────────────┬────────────┐
│ BoroCode ┆ BoroName      ┆ Shape_Leng    ┆ Shape_Area │
│ ---      ┆ ---           ┆ ---           ┆ ---        │
│ i64      ┆ str           ┆ f64           ┆ f64        │
╞══════════╪═══════════════╪═══════════════╪════════════╡
│ 5        ┆ Staten Island ┆ 330470.010332 ┆ 1.6238e9   │
│ 4        ┆ Queens        ┆ 896344.047763 ┆ 3.0452e9   │
│ 3        ┆ Brooklyn      ┆ 741080.523166 ┆ 1.9375e9   │
│ 1        ┆ Manhattan     ┆ 359299.096471 ┆ 6.3647e8   │
│ 2        ┆ Bronx         ┆ 464392.991824 ┆ 1.1869e9   │
└──────────┴───────────────┴───────────────┴────────────┘

GeoDataFrame to Polars naiive
Did not pass numpy.dtype object

GeoDataFrame to Polars with schema override
Did not pass numpy.dtype object

Stack trace (is the same with and without schema_overrides)

---------------------------------------------------------------------------
ArrowTypeError                            Traceback (most recent call last)
Cell In[59], line 27
     24     print(e)
     26 # again to print stack trace
---> 27 pl.from_pandas(gdf).head()

File c:\Users\...\polars\convert.py:571, in from_pandas(data, schema_overrides, rechunk, nan_to_null, include_index)
    568     return wrap_s(pandas_to_pyseries("", data, nan_to_null=nan_to_null))
    569 elif isinstance(data, pd.DataFrame):
    570     return wrap_df(
--> 571         pandas_to_pydf(
    572             data,
    573             schema_overrides=schema_overrides,
    574             rechunk=rechunk,
    575             nan_to_null=nan_to_null,
    576             include_index=include_index,
    577         )
    578     )
    579 else:
    580     msg = f"expected pandas DataFrame or Series, got {type(data).__name__!r}"

File c:\Users\...\polars\_utils\construction\dataframe.py:1032, in pandas_to_pydf(data, schema, schema_overrides, strict, rechunk, nan_to_null, include_index)
   1025         arrow_dict[str(idxcol)] = plc.pandas_series_to_arrow(
   1026             data.index.get_level_values(idxcol),
   1027             nan_to_null=nan_to_null,
   1028             length=length,
   1029         )
   1031 for col in data.columns:
-> 1032     arrow_dict[str(col)] = plc.pandas_series_to_arrow(
   1033         data[col], nan_to_null=nan_to_null, length=length
   1034     )
   1036 arrow_table = pa.table(arrow_dict)
   1037 return arrow_to_pydf(
   1038     arrow_table,
   1039     schema=schema,
   (...)
   1042     rechunk=rechunk,
   1043 )

File c:\Users\...\polars\_utils\construction\other.py:97, in pandas_series_to_arrow(values, length, nan_to_null)
     95     return pa.array(values, from_pandas=nan_to_null)
     96 elif dtype:
---> 97     return pa.array(values, from_pandas=nan_to_null)
     98 else:
     99     # Pandas Series is actually a Pandas DataFrame when the original DataFrame
    100     # contains duplicated columns and a duplicated column is requested with df["a"].
    101     msg = "duplicate column names found: "

File c:\Users\...\pyarrow\array.pxi:323, in pyarrow.lib.array()

File c:\Users\...\pyarrow\array.pxi:79, in pyarrow.lib._ndarray_to_array()

File c:\Users\...\pyarrow\array.pxi:67, in pyarrow.lib._ndarray_to_type()

File c:\Users\...\pyarrow\error.pxi:123, in pyarrow.lib.check_status()

ArrowTypeError: Did not pass numpy.dtype object

Solution

  • You could drop the geometry column before creating the polars DataFrame with from_pandas, and then add it back afterwards as a new column:

    # Step 1: convert only the non-geometry columns with pl.from_pandas.
    out = pl.from_pandas(gdf.drop(columns=["geometry"]))
    # Step 2: re-attach the geometries as a polars Series built from a plain
    # Python list of the geometry values.
    out = out.with_columns(pl.Series("geometry", gdf["geometry"].tolist()))
    

    Output :

    ┌──────────┬───────────────┬───────────────┬────────────┬───────────────────────────────────┐
    │ BoroCode ┆ BoroName      ┆ Shape_Leng    ┆ Shape_Area ┆ geometry                          │
    │ ---      ┆ ---           ┆ ---           ┆ ---        ┆ ---                               │
    │ i64      ┆ str           ┆ f64           ┆ f64        ┆ object                            │
    ╞══════════╪═══════════════╪═══════════════╪════════════╪═══════════════════════════════════╡
    │ 5        ┆ Staten Island ┆ 330470.010332 ┆ 1.6238e9   ┆ MULTIPOLYGON (((970217.022399902… │
    │ 4        ┆ Queens        ┆ 896344.047763 ┆ 3.0452e9   ┆ MULTIPOLYGON (((1029606.07659912… │
    │ 3        ┆ Brooklyn      ┆ 741080.523166 ┆ 1.9375e9   ┆ MULTIPOLYGON (((1021176.47900390… │
    │ 1        ┆ Manhattan     ┆ 359299.096471 ┆ 6.3647e8   ┆ MULTIPOLYGON (((981219.055786132… │
    │ 2        ┆ Bronx         ┆ 464392.991824 ┆ 1.1869e9   ┆ MULTIPOLYGON (((1012821.80578613… │
    └──────────┴───────────────┴───────────────┴────────────┴───────────────────────────────────┘
    

    The geometries are preserved. For instance, out[0, 4] shows this:

    enter image description here