From 50b7f2b3039c815c3dba3efb602fbca96510df06 Mon Sep 17 00:00:00 2001 From: Karthick Date: Thu, 11 Dec 2025 10:25:48 +0000 Subject: [PATCH 1/3] Make XML metadata optional, extract from HDF if XML not found --- src/stactools/modis/builder.py | 8 +++++++- src/stactools/modis/metadata.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/stactools/modis/builder.py b/src/stactools/modis/builder.py index 8304f9b0..961f63f1 100644 --- a/src/stactools/modis/builder.py +++ b/src/stactools/modis/builder.py @@ -110,7 +110,13 @@ def add_hdf_or_xml_href( xml_href = f"{href}.xml" else: raise ValueError(f"Invalid HDF or XML href: {href}") - self.add_xml_asset(xml_href) + + # Add XML asset if it exists, otherwise extract metadata from HDF + if os.path.exists(xml_href): + self.add_xml_asset(xml_href) + else: + self.metadata = Metadata.from_hdf_href(hdf_href, self.read_href_modifier) + self.add_hdf_asset( hdf_href, cog_directory=cog_directory, create_cogs=create_cogs ) diff --git a/src/stactools/modis/metadata.py b/src/stactools/modis/metadata.py index 34032996..2761cc34 100644 --- a/src/stactools/modis/metadata.py +++ b/src/stactools/modis/metadata.py @@ -1,13 +1,16 @@ import datetime import os.path +import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Tuple import fsspec import numpy as np +import rasterio from lxml import etree from rasterio import Affine from rasterio.crs import CRS +from rasterio.errors import NotGeoreferencedWarning from shapely.geometry import shape from stactools.core.io import ReadHrefModifier from stactools.core.io.xml import XmlElement @@ -252,6 +255,32 @@ def from_cog_tags(cls, cog_tags: Dict[str, str]) -> "Metadata": collection=collection, ) + @classmethod + def from_hdf_href( + cls, href: str, read_href_modifier: Optional[ReadHrefModifier] = None + ) -> "Metadata": + """Reads metadata from an HDF file when XML is not available. + + Args: + href (str): The href of the HDF file + read_href_modifier (Optional[Callable[[str], str]]): Optional + function to modify the read href + + Returns: + Metadata: Information that will map to Item attributes. + """ + if read_href_modifier: + read_href = read_href_modifier(href) + else: + read_href = href + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", category=NotGeoreferencedWarning) + with rasterio.open(read_href) as dataset: + hdf_tags = dataset.tags() + + return cls.from_cog_tags(hdf_tags) + @property def datetime(self) -> Optional[datetime.datetime]: """Returns a single nominal datetime for this metadata file. From 3dc78127f002c2dfe2ec8fd89ffa851b21fc7fdc Mon Sep 17 00:00:00 2001 From: Karthick Date: Thu, 11 Dec 2025 21:52:03 +0000 Subject: [PATCH 2/3] Add test function --- CHANGELOG.md | 1 + src/stactools/modis/metadata.py | 6 ++++-- tests/test_stac.py | 29 +++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4c317488..f8866611 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), ### Changed - Item IDs no longer contain the production datetime ([#88](https://github.com/stactools-packages/modis/pull/88)) +- Make XML metadata optional - extract metadata from HDF file if XML is not available ([#XX](https://github.com/stactools-packages/modis/pull/XX)) ### Fixed diff --git a/src/stactools/modis/metadata.py b/src/stactools/modis/metadata.py index e958bd1c..c11ab386 100644 --- a/src/stactools/modis/metadata.py +++ b/src/stactools/modis/metadata.py @@ -235,6 +235,8 @@ def from_cog_tags(cls, cog_tags: Dict[str, str]) -> "Metadata": geometry, bbox = cls._geometry_and_bbox( collection, horizontal_tile, vertical_tile ) + qa_percent = cog_tags.get("QAPERCENTNOTPRODUCEDCLOUD") + qa_percent_not_produced_cloud = int(qa_percent) if qa_percent else None return Metadata( id=os.path.splitext(cog_tags["LOCALGRANULEID"])[0], product=product, @@ -245,11 +247,11 @@ def from_cog_tags(cls, cog_tags: Dict[str, str]) -> "Metadata": end_datetime=end_datetime, created=None, updated=None, - qa_percent_not_produced_cloud=int(cog_tags["QAPERCENTNOTPRODUCEDCLOUD"]), + qa_percent_not_produced_cloud=qa_percent_not_produced_cloud, qa_percent_cloud_cover=None, horizontal_tile=horizontal_tile, vertical_tile=vertical_tile, - tile_id=cog_tags["TileID"], + tile_id=cog_tags.get("TileID", ""), platforms=sorted(list(platforms)), instruments=sorted(list(instruments)), collection=collection, diff --git a/tests/test_stac.py b/tests/test_stac.py index da6e9658..462ebe7e 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -207,6 +207,35 @@ def test_raster_footprint_geometry() -> None: item.validate() +def test_create_item_from_hdf_without_xml() -> None: + """Test that an item can be created from an HDF file when XML is not available. + + This tests the fallback to extracting metadata directly from the HDF file + when the accompanying XML metadata file is not present. + """ + hdf_file = "MOD10A2.A2022033.h09v05.061.2022042050729.hdf" + source_hdf_path = test_data.get_path(f"data-files/{hdf_file}") + + with TemporaryDirectory() as temporary_directory: + # Copy only the HDF file (not the XML) to ensure XML is not available + temp_hdf_path = os.path.join(temporary_directory, hdf_file) + shutil.copyfile(source_hdf_path, temp_hdf_path) + + # Verify XML does not exist in temp directory + temp_xml_path = f"{temp_hdf_path}.xml" + assert not os.path.exists(temp_xml_path), "XML file should not exist" + + # Create item from HDF only - should extract metadata from HDF + item = stactools.modis.stac.create_item(temp_hdf_path) + + # Verify item was created with correct metadata + assert item is not None + assert item.id.startswith("MOD10A2.A2022033.h09v05") + assert "hdf" in item.assets + assert "metadata" not in item.assets # XML asset should not be present + item.validate() + + @pytest.mark.parametrize("file_name", PROJECTION_EDGE_FILES) def test_raster_footprint_at_projection_edge(file_name: str) -> None: path = test_data.get_path(file_name) From e7346858e8fabc2be2aa358367da90e92bbe64c7 Mon Sep 17 00:00:00 2001 From: Karthick Date: Mon, 15 Dec 2025 15:25:56 +0000 Subject: [PATCH 3/3] Address PR review feedback - Revert TileID to required (present in all products) - Remove NotGeoreferencedWarning suppression - Use pytest tmp_path fixture in test - Clean up test comments --- src/stactools/modis/metadata.py | 10 +++------ tests/test_stac.py | 38 ++++++++++++--------------------- 2 files changed, 17 insertions(+), 31 deletions(-) diff --git a/src/stactools/modis/metadata.py b/src/stactools/modis/metadata.py index c11ab386..e1f6b052 100644 --- a/src/stactools/modis/metadata.py +++ b/src/stactools/modis/metadata.py @@ -1,6 +1,5 @@ import datetime import os.path -import warnings from dataclasses import dataclass from typing import Any, Callable, Dict, List, Optional, Tuple @@ -10,7 +9,6 @@ from lxml import etree from rasterio import Affine from rasterio.crs import CRS -from rasterio.errors import NotGeoreferencedWarning from shapely.geometry import shape from stactools.core.io import ReadHrefModifier from stactools.core.io.xml import XmlElement @@ -251,7 +249,7 @@ def from_cog_tags(cls, cog_tags: Dict[str, str]) -> "Metadata": qa_percent_cloud_cover=None, horizontal_tile=horizontal_tile, vertical_tile=vertical_tile, - tile_id=cog_tags.get("TileID", ""), + tile_id=cog_tags["TileID"], platforms=sorted(list(platforms)), instruments=sorted(list(instruments)), collection=collection, @@ -276,10 +274,8 @@ def from_hdf_href( else: read_href = href - with warnings.catch_warnings(): - warnings.simplefilter("ignore", category=NotGeoreferencedWarning) - with rasterio.open(read_href) as dataset: - hdf_tags = dataset.tags() + with rasterio.open(read_href) as dataset: + hdf_tags = dataset.tags() return cls.from_cog_tags(hdf_tags) diff --git a/tests/test_stac.py b/tests/test_stac.py index 462ebe7e..77e90c1b 100644 --- a/tests/test_stac.py +++ b/tests/test_stac.py @@ -207,33 +207,23 @@ def test_raster_footprint_geometry() -> None: item.validate() -def test_create_item_from_hdf_without_xml() -> None: - """Test that an item can be created from an HDF file when XML is not available. - - This tests the fallback to extracting metadata directly from the HDF file - when the accompanying XML metadata file is not present. - """ +def test_create_item_from_hdf_without_xml(tmp_path: Path) -> None: hdf_file = "MOD10A2.A2022033.h09v05.061.2022042050729.hdf" source_hdf_path = test_data.get_path(f"data-files/{hdf_file}") - with TemporaryDirectory() as temporary_directory: - # Copy only the HDF file (not the XML) to ensure XML is not available - temp_hdf_path = os.path.join(temporary_directory, hdf_file) - shutil.copyfile(source_hdf_path, temp_hdf_path) - - # Verify XML does not exist in temp directory - temp_xml_path = f"{temp_hdf_path}.xml" - assert not os.path.exists(temp_xml_path), "XML file should not exist" - - # Create item from HDF only - should extract metadata from HDF - item = stactools.modis.stac.create_item(temp_hdf_path) - - # Verify item was created with correct metadata - assert item is not None - assert item.id.startswith("MOD10A2.A2022033.h09v05") - assert "hdf" in item.assets - assert "metadata" not in item.assets # XML asset should not be present - item.validate() + temp_hdf_path = tmp_path / hdf_file + shutil.copyfile(source_hdf_path, temp_hdf_path) + + temp_xml_path = tmp_path / f"{hdf_file}.xml" + assert not temp_xml_path.exists() + + item = stactools.modis.stac.create_item(str(temp_hdf_path)) + + assert item is not None + assert item.id.startswith("MOD10A2.A2022033.h09v05") + assert "hdf" in item.assets + assert "metadata" not in item.assets + item.validate() @pytest.mark.parametrize("file_name", PROJECTION_EDGE_FILES)