python pandas geopandas

Is there any way to optimize my function for district matching using Python's GeoPandas and Pandas?


I have a table with polygons and district names; I also have purchase data with exact longitude and latitude. I wrote a function that checks every coordinate pair against each polygon and, when the point falls inside one, assigns that polygon's district name to the purchase. The problem is that it runs very slowly because of the lack of vectorization and the nested for-loops (thanks, pandas). How can I optimize it so that it can process 10+ million rows in less time?

def get_district_name(geo_df: pd.DataFrame, ship_df: pd.DataFrame, col_name: str, frac: float = 0.65) -> pd.DataFrame:
    """Tag a random sample of purchases with the district polygon containing each one.

    This is the straightforward point-in-polygon scan: every sampled purchase
    is tested against the polygons one by one, so in the worst case the work
    grows with ``len(sample) * len(geo_df)``.

    Args:
        geo_df: Table with a ``geometry`` column of polygons and a column
            ``col_name`` holding the district names.
        ship_df: Purchases with ``address_longitude`` and
            ``address_latitude`` columns.
        col_name: Column of ``geo_df`` to take the district name from.
        frac: Fraction of ``ship_df`` rows to sample (without replacement,
            fixed seed 42).

    Returns:
        The sampled rows, re-indexed from 0, with an added
        ``municipal_district_name`` column. Each row gets the name of the
        first polygon (in ``geo_df`` order) that contains its point; rows
        whose point lies in no polygon keep the empty string.
    """
    sample_ship = ship_df.sample(frac=frac, replace=False, random_state=42).reset_index(drop=True)

    # Collect the names in a plain list and attach them in one assignment.
    # Writing through chained indexing (df[col][i] = ...) may land on a
    # temporary copy and does nothing under pandas Copy-on-Write.
    district_names = []
    coordinates = zip(sample_ship['address_longitude'], sample_ship['address_latitude'])
    for longitude, latitude in tqdm(coordinates, total=len(sample_ship)):
        point = shapely.geometry.Point(longitude, latitude)
        matched_name = ''
        # Iterate positionally so a non-default index on geo_df cannot break
        # the lookup.
        for polygon, district in zip(geo_df.geometry, geo_df[col_name]):
            if point.within(polygon):
                matched_name = district
                # Stop at the first hit; testing the remaining polygons is
                # wasted work.
                break
        district_names.append(matched_name)

    sample_ship['municipal_district_name'] = district_names
    return sample_ship

Solution

  • You could use a spatial join (`sjoin`) with the `within` predicate, but note that a spatial join on 10+ million rows is still not going to be fast.

    def get_district_name(
        geo_df: gpd.GeoDataFrame,
        ship_df: pd.DataFrame,
        col_name: str,
        frac: float = 0.65,
    ) -> pd.DataFrame:
        """Attach a district name to a random sample of purchases via a spatial join.

        Args:
            geo_df: District polygons; must contain the column ``col_name``.
            ship_df: Purchases with ``address_longitude`` and
                ``address_latitude`` columns.
            col_name: Column of ``geo_df`` to carry over to the result.
            frac: Fraction of ``ship_df`` rows to sample (without
                replacement, fixed seed 42).

        Returns:
            The sampled rows restricted to the columns of ``ship_df`` plus
            ``col_name``. It is a left join, so purchases that fall within
            no polygon are kept with a missing value in ``col_name``.
        """
        # Output layout: the caller's own columns followed by the district column.
        wanted_columns = [*ship_df.columns, col_name]

        purchases = ship_df.sample(frac=frac, replace=False, random_state=42)

        # Build one point per sampled purchase from its lon/lat pair.
        purchase_points = gpd.points_from_xy(
            purchases["address_longitude"],
            purchases["address_latitude"],
        )
        purchases_gdf = gpd.GeoDataFrame(purchases, geometry=purchase_points)

        joined = gpd.sjoin(purchases_gdf, geo_df, predicate="within", how="left")
        return joined[wanted_columns]
    

    Output :

    >>> get_district_name(gdf_districts, df_purchases, "municipal_district_name", 1)
    
       address_longitude  address_latitude municipal_district_name
    1               1.00              2.00              district_2
    4               1.50              1.60              district_2
    2               0.70              3.00                     NaN
    0               3.00              2.50                     NaN
    3               0.20              0.30              district_1
    
    [5 rows x 3 columns]
    

    enter image description here

    Used input