From 9f3a554f93e4f24d3bfd32e54a0989341a1774d0 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Thu, 22 Jan 2026 15:07:24 +0100 Subject: [PATCH 1/4] #47 Reduced runtime by 50% by reducing number of downloads when normalizing HREFs --- main.py | 2 ++ .../contracts/stac_io_service_interface.py | 8 +++++++ .../contracts/stac_service_interface.py | 3 ++- .../services/stac_io_service.py | 24 +++++++++++++++---- .../infrastructure/services/stac_service.py | 17 +++++++++---- .../entrypoints/release_pipeline.py | 15 +++++++++--- 6 files changed, 56 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index 9b8b5e0..a4ef5d3 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ from src.presentation.configuration import initialize_dependencies from src.presentation.entrypoints import run_pipeline +import cProfile def main() -> None: @@ -12,3 +13,4 @@ def main() -> None: if __name__ == "__main__": main() + # cProfile.run('main()') diff --git a/src/application/contracts/stac_io_service_interface.py b/src/application/contracts/stac_io_service_interface.py index 80ac5d1..1d78c2c 100644 --- a/src/application/contracts/stac_io_service_interface.py +++ b/src/application/contracts/stac_io_service_interface.py @@ -4,6 +4,14 @@ class IStacIOService(DefaultStacIO, ABC): + @property + def skip_file_download(self) -> bool: + raise NotImplementedError + + @skip_file_download.setter + def skip_file_download(self, value: bool) -> None: + raise NotImplementedError + def strip_path_stem(self, path: str) -> str: """ Removes the leading part of the path up to and including 'stac/'. Only intended for use with STAC HREFs with 'stac/' in them. diff --git a/src/application/contracts/stac_service_interface.py b/src/application/contracts/stac_service_interface.py index 4432759..71683c9 100644 --- a/src/application/contracts/stac_service_interface.py +++ b/src/application/contracts/stac_service_interface.py @@ -124,10 +124,11 @@ def create_release_catalog(self, root_catalog: Catalog, release: str) -> Catalog raise NotImplementedError @abstractmethod - def save_catalog(self, catalog: Catalog) -> None: + def save_catalog(self, catalog: Catalog, release: str) -> None: """ Saves catalog to its self href location :param catalog: + :param release: :return: """ raise NotImplementedError diff --git a/src/infra/infrastructure/services/stac_io_service.py b/src/infra/infrastructure/services/stac_io_service.py index 953c3b4..0db8a05 100644 --- a/src/infra/infrastructure/services/stac_io_service.py +++ b/src/infra/infrastructure/services/stac_io_service.py @@ -8,19 +8,33 @@ class StacIOService(IStacIOService): __blob_storage_service: IBlobStorageService + __skip_file_download: bool = False def __init__(self, blob_storage_service: IBlobStorageService): super().__init__() self.__blob_storage_service = blob_storage_service + @property + def skip_file_download(self) -> bool: + return self.__skip_file_download + + @skip_file_download.setter + def skip_file_download(self, value: bool) -> None: + self.__skip_file_download = value + def write_text(self, dest: HREF, txt: str, *args: Any, **kwargs: Any) -> None: data = txt.encode(encoding="utf-8") path = self.strip_path_stem(dest) - if path != "catalog.json" and self.__blob_storage_service.is_blob_in_storage_container( - container_name=StorageContainer.STAC, - blob_name=path - ): + is_file_uploaded = ( + path != "catalog.json" and + self.__blob_storage_service.is_blob_in_storage_container( + container_name=StorageContainer.STAC, + blob_name=path + ) + ) + + if is_file_uploaded: return self.__blob_storage_service.upload_file(container_name=StorageContainer.STAC, blob_name=path, data=data) @@ -28,11 +42,13 @@ def write_text(self, dest: HREF, txt: str, *args: Any, **kwargs: Any) -> None: def read_text(self, source: HREF, *args: Any, **kwargs: Any) -> str: path = self.strip_path_stem(source) path = path.lstrip(".") + data = self.__blob_storage_service.download_file(container_name=StorageContainer.STAC, blob_name=path) if data is None: raise FileNotFoundError(f"File not found in blob storage: {path}") + self.skip_file_download = False return data.decode(encoding="utf-8") def strip_path_stem(self, path: str) -> str: diff --git a/src/infra/infrastructure/services/stac_service.py b/src/infra/infrastructure/services/stac_service.py index 3ec2ed5..e807eac 100644 --- a/src/infra/infrastructure/services/stac_service.py +++ b/src/infra/infrastructure/services/stac_service.py @@ -126,8 +126,15 @@ def create_release_catalog(self, root_catalog: Catalog, release: str) -> Catalog return release_catalog - def save_catalog(self, catalog: Catalog) -> None: - catalog.normalize_and_save( - root_href=Config.STAC_STORAGE_CONTAINER, - catalog_type=CatalogType.ABSOLUTE_PUBLISHED - ) + def save_catalog(self, catalog: Catalog, release: str) -> None: + latest_release = catalog.get_child(id=f"release/{release}", recursive=True) + + catalog_copy = catalog.clone() + catalog_copy.clear_children() + catalog_copy.add_child(latest_release) + catalog_copy.normalize_hrefs(root_href=Config.STAC_STORAGE_CONTAINER) + + normalized_latest_release = catalog_copy.get_child(id=f"release/{release}", recursive=True) + catalog.remove_child(f"release/{release}") + catalog.add_child(normalized_latest_release) + catalog.save(catalog_type=CatalogType.ABSOLUTE_PUBLISHED) diff --git a/src/presentation/entrypoints/release_pipeline.py b/src/presentation/entrypoints/release_pipeline.py index 1ce60a4..b4a693b 100644 --- a/src/presentation/entrypoints/release_pipeline.py +++ b/src/presentation/entrypoints/release_pipeline.py @@ -4,6 +4,7 @@ from dependency_injector.wiring import inject, Provide from pystac import Catalog, Collection, Item +from application.contracts import IStacIOService from src.application.common import logger from src.application.contracts import ( IReleaseService, IStacService, IOpenStreetMapFileService, ICountyService, IOpenStreetMapService, IFKBService, @@ -93,13 +94,14 @@ def run_pipeline() -> None: add_assets_to_item(conflated_region_item, conflated_blob_paths) - save_catalog(catalog=root_catalog) + save_catalog(catalog=root_catalog, release=latest_release) @inject def create_release( release_service: IReleaseService = Provide[Containers.release_service], stac_service: IStacService = Provide[Containers.stac_service], + stac_io_service: IStacIOService = Provide[Containers.stac_io_service] ) -> tuple[str, Catalog, Catalog]: root_catalog = stac_service.get_catalog_root() current_release = release_service.create_release() @@ -246,5 +248,12 @@ def add_assets_to_item( @inject -def save_catalog(catalog: Catalog, stac_service: IStacService = Provide[Containers.stac_service]) -> None: - stac_service.save_catalog(catalog) +def save_catalog( + catalog: Catalog, + release: str, + stac_service: IStacService = Provide[Containers.stac_service], + stac_io_service: IStacIOService = Provide[Containers.stac_io_service] +) -> None: + stac_io_service.skip_file_download = True + stac_service.save_catalog(catalog, release) + stac_io_service.skip_file_download = False From 91971df9580f8ed84fbe7e202cf291d22bf224e8 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Fri, 23 Jan 2026 09:27:19 +0100 Subject: [PATCH 2/4] #47 Removed unused methods --- src/application/contracts/stac_io_service_interface.py | 8 -------- src/infra/infrastructure/services/stac_io_service.py | 8 -------- 2 files changed, 16 deletions(-) diff --git a/src/application/contracts/stac_io_service_interface.py b/src/application/contracts/stac_io_service_interface.py index 1d78c2c..80ac5d1 100644 --- a/src/application/contracts/stac_io_service_interface.py +++ b/src/application/contracts/stac_io_service_interface.py @@ -4,14 +4,6 @@ class IStacIOService(DefaultStacIO, ABC): - @property - def skip_file_download(self) -> bool: - raise NotImplementedError - - @skip_file_download.setter - def skip_file_download(self, value: bool) -> None: - raise NotImplementedError - def strip_path_stem(self, path: str) -> str: """ Removes the leading part of the path up to and including 'stac/'. Only intended for use with STAC HREFs with 'stac/' in them. diff --git a/src/infra/infrastructure/services/stac_io_service.py b/src/infra/infrastructure/services/stac_io_service.py index 0db8a05..b632333 100644 --- a/src/infra/infrastructure/services/stac_io_service.py +++ b/src/infra/infrastructure/services/stac_io_service.py @@ -8,19 +8,11 @@ class StacIOService(IStacIOService): __blob_storage_service: IBlobStorageService - __skip_file_download: bool = False def __init__(self, blob_storage_service: IBlobStorageService): super().__init__() self.__blob_storage_service = blob_storage_service - @property - def skip_file_download(self) -> bool: - return self.__skip_file_download - - @skip_file_download.setter - def skip_file_download(self, value: bool) -> None: - self.__skip_file_download = value def write_text(self, dest: HREF, txt: str, *args: Any, **kwargs: Any) -> None: data = txt.encode(encoding="utf-8") From a725f2e8a71201bf24592abf369571c3c2a09cab Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Fri, 23 Jan 2026 09:28:09 +0100 Subject: [PATCH 3/4] #47 Cleaned up main.py --- main.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/main.py b/main.py index a4ef5d3..f31d627 100644 --- a/main.py +++ b/main.py @@ -1,16 +1,11 @@ from src.presentation.configuration import initialize_dependencies from src.presentation.entrypoints import run_pipeline -import cProfile def main() -> None: - # SETUP initialize_dependencies() - - # OPEN STREET MAP run_pipeline() if __name__ == "__main__": main() - # cProfile.run('main()') From 42871e7d19c0b27f6faac616ee9aae67ad60032e Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Fri, 23 Jan 2026 09:30:49 +0100 Subject: [PATCH 4/4] #47 Code cleanup --- src/presentation/entrypoints/release_pipeline.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/presentation/entrypoints/release_pipeline.py b/src/presentation/entrypoints/release_pipeline.py index b4a693b..2adc3bc 100644 --- a/src/presentation/entrypoints/release_pipeline.py +++ b/src/presentation/entrypoints/release_pipeline.py @@ -4,11 +4,10 @@ from dependency_injector.wiring import inject, Provide from pystac import Catalog, Collection, Item -from application.contracts import IStacIOService from src.application.common import logger from src.application.contracts import ( IReleaseService, IStacService, IOpenStreetMapFileService, ICountyService, IOpenStreetMapService, IFKBService, - IVectorService, IBlobStorageService, IFilePathService, IConflationService + IVectorService, IBlobStorageService, IFilePathService, IConflationService, IStacIOService ) from src.domain.enums import EPSGCode, Theme, DataSource, StorageContainer from src.infra.infrastructure import Containers @@ -100,8 +99,7 @@ def run_pipeline() -> None: @inject def create_release( release_service: IReleaseService = Provide[Containers.release_service], - stac_service: IStacService = Provide[Containers.stac_service], - stac_io_service: IStacIOService = Provide[Containers.stac_io_service] + stac_service: IStacService = Provide[Containers.stac_service] ) -> tuple[str, Catalog, Catalog]: root_catalog = stac_service.get_catalog_root() current_release = release_service.create_release() @@ -254,6 +252,4 @@ def save_catalog( stac_service: IStacService = Provide[Containers.stac_service], stac_io_service: IStacIOService = Provide[Containers.stac_io_service] ) -> None: - stac_io_service.skip_file_download = True stac_service.save_catalog(catalog, release) - stac_io_service.skip_file_download = False