pythonimagemasktiffrasterio

How to make binary mask of large .tif file fast using .shp file with polygons (GPS coordinates)?


I have a large .tif file (~24 Gb, more, than RAM could store)
(Similar question: https://rasterio.readthedocs.io/en/latest/topics/masking-by-shapefile.html)
(Method from library: https://rasterio.readthedocs.io/en/latest/topics/masking-by-shapefile.html)

But I'm searching another faster solution. Maybe I should create numpy array with zeros and add polygon areas into that array (it shouldn't be better, than with rasterio)

Here is my code to process polygons with longitude and latitude.

import rasterio
import geopandas as gpd
from tqdm import tqdm
import os

def generate_mask(raster_path, shape_path, output_path):
    gdf = gpd.read_file(shape_path)
    with rasterio.open(raster_path) as src:
        for num in tqdm(sorted(gdf['class'].unique())):
            if os.path.isfile(f'{path}/masks/{num}.tif'):
                continue
            shapes = gdf.loc[(gdf['class'] == num)]['geometry']
            file_name = f'{num}.tif'

            for shp in shapes:
                bound = rasterio.features.bounds(shp)
                bbox = rasterio.windows.from_bounds(*bound, src.transform)
                window_transform = rasterio.windows.transform(window=bbox, transform=src.transform)

                img_crop = src.read([1,2,3], window=bbox)
                print(f'Image read')
                mask = rasterio.features.geometry_mask([shp],out_shape=(img_crop.shape[1], img_crop.shape[2]), 
                                       transform=window_transform, invert=True)

                out_image, out_transform = rasterio.mask.mask(src, shp, crop=True, filled=True)
                out_meta = src.meta
                print(f'Mask {fn} is done')

                with rasterio.open(f'{output_path}{file_name}', "w") as dest:
                    dest.write(out_image)
                print(f'Mask is written')`

The problem here, that it could process one polygon at a time, moreover if I process all shapes at a time it time more than 3 hours to make mask. (It's too long, server usually kill process.)


Solution

  • I solved problem using osgeo.gdal.Rasterize()
    (see documentation: https://gdal.org/api/python/osgeo.gdal.html)
    It handles large files way better than analogs (~3-5 min per one mask)

    Here is my code:

    from osgeo import gdal
    import os
    
    def gdal_generate_mask(raster_path, shape_path, output_path):
    # open the raster layer and get its relevant properties
    raster_ds = gdal.Open(raster_path, gdal.GA_ReadOnly)
    xSize = raster_ds.RasterXSize
    ySize = raster_ds.RasterYSize
    geotransform = raster_ds.GetGeoTransform()
    projection = raster_ds.GetProjection()
    
    # create the target layer (1 band binary)
    gdf = gpd.read_file(shape_path)
    for num in tqdm(sorted(gdf['class'].unique())):
        driver = gdal.GetDriverByName('GTiff')
        target_ds = driver.Create(f'{output_path}{num}.tif', xSize, ySize, bands=1, eType = gdal.GDT_Byte, options = ["COMPRESS=DEFLATE"])
        target_ds.SetGeoTransform(geotransform)
        target_ds.SetProjection(projection)
    
        # create temp file for vector layer gdal Rasterize
        vector_layer = f'/home/username/GeoProj/tmp_shp/{num}.shp'
        if not os.path.isfile(vector_layer):
            gdf[gdf['class'] == num].to_file(vector_layer)
    
        # rasterize the vector layer into the target one
        ds = gdal.Rasterize(target_ds, vector_layer, burnValues=[1])
    

    P.S. I should seek an answer here: https://gis.stackexchange.com/