pythonstac

Error writing stac catalog with pystac: datetime, tzinfo error


The notebook exploring this issue can be found here.

https://github.com/worldbank/GOSTrocks/blob/fathom/notebooks/FATHOM/PROCESSING_NOTEBOOKS/generate_fathom_vrts_and_STAC_catalog.ipynb

I am trying to create a STAC catalog for a gridded geotiff dataset (flood model predictions at various times, types, climate scenarios, etc.). However, whenever I attempt to save the STAC catalog, I get an erro about the item datetime object. I am posting the error below, and then the code where I create the STAC catalog below that; any suggestions would be appreciated!

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[7], line 6
      3         print(f"Found item: {item.id} in catalog {root.id}: {item.datetime}")
      5 catalog.validate()
----> 6 catalog.save(catalog_type=pystac.CatalogType.SELF_CONTAINED)

File c:\WBG\Anaconda3\envs\s2s_ingest\lib\site-packages\pystac\catalog.py:986, in Catalog.save(self, catalog_type, dest_href, stac_io)
    981             child.save(
    982                 dest_href=os.path.dirname(child_dest_href),
    983                 stac_io=stac_io,
    984             )
    985         else:
--> 986             child.save(stac_io=stac_io)
    988 for item_link in self.get_item_links():
    989     if item_link.is_resolved():

File c:\WBG\Anaconda3\envs\s2s_ingest\lib\site-packages\pystac\catalog.py:1022, in Catalog.save(self, catalog_type, dest_href, stac_io)
   1018     rel_href = make_relative_href(self.self_href, self.self_href)
   1019     catalog_dest_href = make_absolute_href(
   1020         rel_href, dest_href, start_is_dir=True
   1021     )
-> 1022 self.save_object(
   1023     include_self_link=include_self_link,
   1024     dest_href=catalog_dest_href,
...
--> 412     if dt.tzinfo is None:
    413         dt = dt.replace(tzinfo=timezone.utc)
    415     timestamp = dt.isoformat(timespec=timespec)

AttributeError: 'str' object has no attribute 'tzinfo'

The errors seems pretty straightforward: the time objects in the stac Items need to be datetime objects ... but they are; you can see that I set them as datetime objects and then validate them as datetime objects (by printing out the year).

catalog = pystac.Catalog(
    id="fathom-v31-catalog",
    description="STAC Catalog for FATHOM 3.1 global flood hazard data",    
)

cur_model = 'FLOOD_MAP-1ARCSEC-NW_OFFSET-1in100-FLUVIAL-UNDEFENDED-DEPTH-2020-PERCENTILE50-v3.1' #new_fathom_models[0]
model_path = f"FATHOM/v31/{cur_model}/"
all_tiles = get_tile_list(model_path) # this gets a list of all tile in our S3 bucket for the current model

deets = cur_model.split("-")
return_period = deets[3]
type = deets[4]
defended = deets[5]
date = deets[7]
scenario = deets[8] if deets[8] != "PERCENTILE50" else "Baseline"

spatial_extent = pystac.SpatialExtent(bboxes=[[-180.0, -90.0, 180.0, 90.0]])
temporal_extent = pystac.TemporalExtent(intervals=[[f"{date}-01-01T00:00:00Z", f"{date}-12-31T00:00:00Z"]])

c_collection = pystac.Collection(
    id=cur_model,
    description=f"FATHOM 3.1 Flood Hazard Model: {type} flood, {defended}, {return_period} return period, {scenario} scenario, {date} data",
    title=f"FATHOM 3.1 - {cur_model}",
    extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent),
    extra_fields={
        "model_type": type,
        "defended_status": defended,
        "return_period": return_period,
        "scenario": scenario,
        "data_year": date,
    }
)

for c_tile in tqdm(all_tiles[:10]): #Limiting to first 10 for testing
    raster_s3_uri = f"s3://{s3_bucket}/{model_path}{c_tile}"
    bbox, footprint = get_bbox_and_footprint_dumb(c_tile) # tiles are named by their lat/long so we can infer bbox and footprint from another function
    item = pystac.Item(
        id=c_tile.replace('.tif', ''),
        geometry=footprint,
        bbox=bbox,
        datetime=pystac.utils.str_to_datetime(f"{date}-01-01T00:00:00Z"),
        properties={},
    )

    asset = pystac.Asset(
        href=raster_s3_uri,
        media_type=pystac.MediaType.COG,
        roles=["data"],
        title=c_tile,
    )
    item.add_asset("raster", asset)

    c_collection.add_item(item)

catalog.add_child(c_collection)
catalog.normalize_hrefs("")

for root, catalogs, items in catalog.walk():
    for item in items:
        print(f"Found item: {item.id} in catalog {root.id}: {item.datetime.year}")

catalog.validate()
catalog.save(catalog_type=pystac.CatalogType.SELF_CONTAINED)

Solution

  • From the call stack:

    self.extent.to_dict()
    -> self.temporal.to_dict()
    -> datetime_to_str(i[0])
    

    The problem comes from extent. This variable comes from the constructor of Collection.

    def __init__(
            self,
            id: str,
            description: str,
            extent: Extent,     # HERE
            title: str | None = None,
            stac_extensions: list[str] | None = None,
            href: str | None = None,
            extra_fields: dict[str, Any] | None = None,
            catalog_type: CatalogType | None = None,
            license: str = "other",
            keywords: list[str] | None = None,
            providers: list[Provider] | None = None,
            summaries: Summaries | None = None,
            assets: dict[str, Asset] | None = None,
            strategy: HrefLayoutStrategy | None = None,
        )
    

    Your code pass it from here:

    c_collection = pystac.Collection(
        id=cur_model,
        description=f"FATHOM 3.1 Flood Hazard Model: {type} flood, {defended}, {return_period} return period, {scenario} scenario, {date} data",
        title=f"FATHOM 3.1 - {cur_model}",
        extent=pystac.Extent(spatial=spatial_extent, temporal=temporal_extent),  # HERE
        extra_fields={
            "model_type": type,
            "defended_status": defended,
            "return_period": return_period,
            "scenario": scenario,
            "data_year": date,
        }
    )
    

    So, it looks like the error comes from:

    temporal_extent = pystac.TemporalExtent(intervals=[[f"{date}-01-01T00:00:00Z", f"{date}-12-31T00:00:00Z"]])
    

    Are you sure the variable date only represents the year?

    There may be a formatting error here.