From 29c6d34daf15eb63417cc62763cc4b4247a7bff4 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Mon, 17 Nov 2025 19:36:15 +0100 Subject: [PATCH 1/4] #18 Created config variable for counties blob --- src/config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/config.py b/src/config.py index bb0e4d5..1f06e6d 100644 --- a/src/config.py +++ b/src/config.py @@ -50,7 +50,8 @@ class Config: GEONORGE_BASE_URL: str = "https://api.kartverket.no/kommuneinfo/v1/" # METADATA - RELEASE_FILE_NAME = "releases.parquet" + RELEASE_FILE_NAME: str = "releases.parquet" + COUNTY_FILE_NAME: str = "counties.parquet" # STAC STAC_LICENSE = "CC-BY-4.0" From 007a77af2755baec5a81716b428e5858d3fd3aeb Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Mon, 17 Nov 2025 19:37:12 +0100 Subject: [PATCH 2/4] #18 Created services for writing counties parquet file to blob storage --- .../contracts/county_service_interface.py | 2 +- src/infra/infrastructure/containers.py | 12 +-- .../infrastructure/services/county_service.py | 84 +++++++++++++++++-- 3 files changed, 84 insertions(+), 14 deletions(-) diff --git a/src/application/contracts/county_service_interface.py b/src/application/contracts/county_service_interface.py index 79020db..0841bd5 100644 --- a/src/application/contracts/county_service_interface.py +++ b/src/application/contracts/county_service_interface.py @@ -10,5 +10,5 @@ def get_county_ids(self) -> list[str]: raise NotImplementedError @abstractmethod - def get_county_wkb_by_id(self, county_id: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]: + def get_county_polygons_by_id(self, county_id: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]: raise NotImplementedError diff --git a/src/infra/infrastructure/containers.py b/src/infra/infrastructure/containers.py index 73645be..27bbe2e 100644 --- a/src/infra/infrastructure/containers.py +++ b/src/infra/infrastructure/containers.py @@ -13,11 +13,6 @@ class Containers(containers.DeclarativeContainer): db_context = providers.Singleton(create_duckdb_context) blob_storage_context = providers.Singleton(create_blob_storage_context) - county_service = providers.Singleton( - CountyService, - db_context=db_context - ) - file_path_service = providers.Singleton( FilePathService ) @@ -37,6 +32,13 @@ class Containers(containers.DeclarativeContainer): file_path_service=file_path_service ) + county_service = providers.Singleton( + CountyService, + db_context=db_context, + blob_storage_service=blob_storage_service, + bytes_service=bytes_service + ) + osm_file_service = providers.Singleton( OpenStreetMapFileService ) diff --git a/src/infra/infrastructure/services/county_service.py b/src/infra/infrastructure/services/county_service.py index f5cec3f..5f54e2e 100644 --- a/src/infra/infrastructure/services/county_service.py +++ b/src/infra/infrastructure/services/county_service.py @@ -1,20 +1,32 @@ -from typing import Any +import json +from io import BytesIO +from typing import Any, Dict +import pandas as pd import requests import shapely from duckdb import DuckDBPyConnection from shapely.geometry import shape from src import Config -from src.application.contracts import ICountyService -from src.domain.enums import EPSGCode +from src.application.contracts import ICountyService, IBlobStorageService, IBytesService +from src.domain.enums import EPSGCode, StorageContainer class CountyService(ICountyService): __db_context: DuckDBPyConnection + __blob_storage_service: IBlobStorageService + __bytes_service: IBytesService - def __init__(self, db_context: DuckDBPyConnection): + def __init__( + self, + db_context: DuckDBPyConnection, + blob_storage_service: IBlobStorageService, + bytes_service: IBytesService + ): self.__db_context = db_context + self.__blob_storage_service = blob_storage_service + self.__bytes_service = bytes_service def get_county_ids(self) -> list[str]: response = requests.get(f"{Config.GEONORGE_BASE_URL}/fylker?sorter=fylkesnummer") @@ -22,20 +34,76 @@ def get_county_ids(self) -> list[str]: data = response.json() return [item["fylkesnummer"] for item in data] - def get_county_wkb_by_id(self, county_id: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]: + def get_county_polygons_by_id(self, county_id: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]: + geometries = self.__get_county_polygons_from_blob_storage(region=county_id) + + if geometries is not None: + return geometries + + wkb, geo_json = self.__fetch_county_polygons_from_api(region=county_id, epsg_code=epsg_code) + self.__write_county_to_blob_storage(region=county_id, wkb=wkb, geo_json=geo_json) + + return wkb, geo_json + + @staticmethod + def __fetch_county_polygons_from_api(region: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]: response = requests.get( - f"{Config.GEONORGE_BASE_URL}/fylker/{county_id}/omrade?utkoordsys={epsg_code.value}" + f"{Config.GEONORGE_BASE_URL}/fylker/{region}/omrade?utkoordsys={epsg_code.value}" ) response.raise_for_status() data = response.json() geom_data = data["omrade"] geom = shape(geom_data) - wkb_data = shapely.to_wkb(geom) + wkb = shapely.to_wkb(geom) geo_json = { "type": geom_data["type"], "coordinates": geom_data["coordinates"] } - return wkb_data, geo_json + return wkb, geo_json + + def __get_county_polygons_from_blob_storage(self, region: str) -> tuple[bytes, dict[str, Any]] | None: + county_bytes = self.__blob_storage_service.download_file( + container_name=StorageContainer.METADATA, + blob_name=Config.COUNTY_FILE_NAME + ) + + if county_bytes is None: + return None + + county_df = self.__bytes_service.convert_parquet_bytes_to_df(county_bytes) + if not region in county_df["region"].values: + return None + + row = county_df.loc[county_df["region"] == region].iloc[0] + return row["wkb"], json.loads(row["json"]) + + def __write_county_to_blob_storage(self, region: str, wkb: bytes, geo_json: Dict[str, Any]) -> None: + county_bytes = self.__blob_storage_service.download_file( + container_name=StorageContainer.METADATA, + blob_name=Config.COUNTY_FILE_NAME + ) + + county_df = self.__bytes_service.convert_parquet_bytes_to_df(county_bytes) \ + if (county_bytes is not None and len(county_bytes) > 0) \ + else pd.DataFrame(columns=["region", "wkb", "json"]) + + json_string = json.dumps(geo_json) + if region in county_df["region"].values: + mask = county_df["region"] == region + county_df.loc[mask, "wkb"] = wkb + county_df.loc[mask, "json"] = json_string + else: + new_row = pd.DataFrame({"region": [region], "wkb": [wkb], "json": [json_string]}) + county_df = pd.concat([county_df, new_row], ignore_index=True) + + buffer = BytesIO() + county_df.to_parquet(buffer, index=False) + buffer.seek(0) + self.__blob_storage_service.upload_file( + container_name=StorageContainer.METADATA, + blob_name=Config.COUNTY_FILE_NAME, + data=buffer.read() + ) From e8f8fdcb4579c8b67d3b32fae7e6bc7a48ab5fa5 Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr Date: Mon, 17 Nov 2025 19:38:45 +0100 Subject: [PATCH 3/4] #18 Renamed method for fetching county polygons --- src/presentation/entrypoints/release_pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/presentation/entrypoints/release_pipeline.py b/src/presentation/entrypoints/release_pipeline.py index cb79fdf..39ba535 100644 --- a/src/presentation/entrypoints/release_pipeline.py +++ b/src/presentation/entrypoints/release_pipeline.py @@ -157,7 +157,7 @@ def clip_and_partition_dataset_to_region( county_service: ICountyService = Provide[Containers.county_service], vector_service: IVectorService = Provide[Containers.vector_service], ) -> tuple[list[gpd.GeoDataFrame], list[gpd.GeoDataFrame], dict[str, Any]]: - polygon_wkb, polygon_geojson = county_service.get_county_wkb_by_id(county_id=region, epsg_code=EPSGCode.WGS84) + polygon_wkb, polygon_geojson = county_service.get_county_polygons_by_id(county_id=region, epsg_code=EPSGCode.WGS84) osm_county_dataset = vector_service.clip_dataframes_to_wkb(osm_batches, polygon_wkb, epsg_code=EPSGCode.WGS84) fkb_county_dataset = vector_service.clip_dataframes_to_wkb(fkb_batches, polygon_wkb, epsg_code=EPSGCode.WGS84) From 6fa5ac5347d38a875a999c2d6ff8c0cc60c776bc Mon Sep 17 00:00:00 2001 From: Jathavaan Shankarr <82966828+jathavaan@users.noreply.github.com> Date: Mon, 17 Nov 2025 19:55:12 +0100 Subject: [PATCH 4/4] Update src/infra/infrastructure/services/county_service.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- src/infra/infrastructure/services/county_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/infra/infrastructure/services/county_service.py b/src/infra/infrastructure/services/county_service.py index 5f54e2e..572e7d2 100644 --- a/src/infra/infrastructure/services/county_service.py +++ b/src/infra/infrastructure/services/county_service.py @@ -74,7 +74,7 @@ def __get_county_polygons_from_blob_storage(self, region: str) -> tuple[bytes, d return None county_df = self.__bytes_service.convert_parquet_bytes_to_df(county_bytes) - if not region in county_df["region"].values: + if region not in county_df["region"].values: return None row = county_df.loc[county_df["region"] == region].iloc[0]