Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/application/contracts/county_service_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ def get_county_ids(self) -> list[str]:
raise NotImplementedError

@abstractmethod
def get_county_wkb_by_id(self, county_id: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]:
def get_county_polygons_by_id(self, county_id: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]:
raise NotImplementedError
3 changes: 2 additions & 1 deletion src/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ class Config:
GEONORGE_BASE_URL: str = "https://api.kartverket.no/kommuneinfo/v1/"

# METADATA
RELEASE_FILE_NAME = "releases.parquet"
RELEASE_FILE_NAME: str = "releases.parquet"
COUNTY_FILE_NAME: str = "counties.parquet"

# STAC
STAC_LICENSE = "CC-BY-4.0"
Expand Down
12 changes: 7 additions & 5 deletions src/infra/infrastructure/containers.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,6 @@ class Containers(containers.DeclarativeContainer):
db_context = providers.Singleton(create_duckdb_context)
blob_storage_context = providers.Singleton(create_blob_storage_context)

county_service = providers.Singleton(
CountyService,
db_context=db_context
)

file_path_service = providers.Singleton(
FilePathService
)
Expand All @@ -37,6 +32,13 @@ class Containers(containers.DeclarativeContainer):
file_path_service=file_path_service
)

county_service = providers.Singleton(
CountyService,
db_context=db_context,
blob_storage_service=blob_storage_service,
bytes_service=bytes_service
)

osm_file_service = providers.Singleton(
OpenStreetMapFileService
)
Expand Down
84 changes: 76 additions & 8 deletions src/infra/infrastructure/services/county_service.py
Original file line number Diff line number Diff line change
@@ -1,41 +1,109 @@
from typing import Any
import json
from io import BytesIO
from typing import Any, Dict

import pandas as pd
import requests
import shapely
from duckdb import DuckDBPyConnection
from shapely.geometry import shape

from src import Config
from src.application.contracts import ICountyService
from src.domain.enums import EPSGCode
from src.application.contracts import ICountyService, IBlobStorageService, IBytesService
from src.domain.enums import EPSGCode, StorageContainer


class CountyService(ICountyService):
    """Provides county ("fylke") polygons.

    Polygons are fetched from the Geonorge API on a cache miss and persisted
    as a parquet file in blob storage for subsequent lookups.
    """

    # DuckDB connection (injected; not used by the methods visible in this diff — TODO confirm)
    __db_context: DuckDBPyConnection
    # Blob storage access used for the parquet polygon cache
    __blob_storage_service: IBlobStorageService
    # Helper converting parquet bytes <-> pandas DataFrame
    __bytes_service: IBytesService

def __init__(
    self,
    db_context: DuckDBPyConnection,
    blob_storage_service: IBlobStorageService,
    bytes_service: IBytesService
):
    """Wire in the DuckDB connection plus the storage and serialization services."""
    self.__bytes_service = bytes_service
    self.__blob_storage_service = blob_storage_service
    self.__db_context = db_context

def get_county_ids(self) -> list[str]:
    """Return all county numbers ("fylkesnummer"), sorted, from the Geonorge API."""
    url = f"{Config.GEONORGE_BASE_URL}/fylker?sorter=fylkesnummer"
    response = requests.get(url)
    response.raise_for_status()
    return [county["fylkesnummer"] for county in response.json()]

def get_county_wkb_by_id(self, county_id: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]:
def get_county_polygons_by_id(self, county_id: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]:
    """Return (WKB bytes, GeoJSON dict) for a county, cached in blob storage.

    :param county_id: county number ("fylkesnummer"), e.g. "03".
    :param epsg_code: coordinate system the polygons must be expressed in.
    :raises requests.HTTPError: if the Geonorge API request fails on a cache miss.
    """
    # Key the cache on both county and CRS: a plain county_id key would
    # silently return geometry cached in a different coordinate system.
    cache_key = f"{county_id}:{epsg_code.value}"

    cached = self.__get_county_polygons_from_blob_storage(region=cache_key)
    if cached is not None:
        return cached

    wkb, geo_json = self.__fetch_county_polygons_from_api(region=county_id, epsg_code=epsg_code)
    self.__write_county_to_blob_storage(region=cache_key, wkb=wkb, geo_json=geo_json)

    return wkb, geo_json
Comment on lines +37 to +46
Copy link

Copilot AI Nov 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The caching implementation doesn't account for the epsg_code parameter. When cached data is returned from blob storage (line 40-41), it may be in a different coordinate system than requested. The cache should either:

  1. Store the EPSG code along with the polygon data and check it matches before returning cached data, or
  2. Document that only WGS84 is supported and validate the parameter

This could lead to incorrect spatial operations if the method is ever called with a different EPSG code.

Copilot uses AI. Check for mistakes.

@staticmethod
def __fetch_county_polygons_from_api(region: str, epsg_code: EPSGCode) -> tuple[bytes, dict[str, Any]]:
    """Fetch a county outline from the Geonorge API as (WKB bytes, GeoJSON dict).

    :raises requests.HTTPError: if the API responds with an error status.
    """
    url = f"{Config.GEONORGE_BASE_URL}/fylker/{region}/omrade?utkoordsys={epsg_code.value}"
    response = requests.get(url)
    response.raise_for_status()

    geometry = response.json()["omrade"]
    return (
        shapely.to_wkb(shape(geometry)),
        {"type": geometry["type"], "coordinates": geometry["coordinates"]},
    )

def __get_county_polygons_from_blob_storage(self, region: str) -> tuple[bytes, dict[str, Any]] | None:
    """Look up cached polygons for *region*; return (wkb, geojson) or None on miss.

    Returns None — so the caller falls back to the API — when the cache blob
    is absent, the region is not cached, or the cached file cannot be parsed.
    """
    county_bytes = self.__blob_storage_service.download_file(
        container_name=StorageContainer.METADATA,
        blob_name=Config.COUNTY_FILE_NAME
    )

    if county_bytes is None:
        return None

    try:
        county_df = self.__bytes_service.convert_parquet_bytes_to_df(county_bytes)
        matches = county_df.loc[county_df["region"] == region]
        if matches.empty:
            return None
        row = matches.iloc[0]
        return row["wkb"], json.loads(row["json"])
    except Exception:
        # A corrupt or schema-mismatched cache file must not break the
        # pipeline: treat it as a cache miss and refetch from the API.
        return None

Comment on lines +76 to +82
Copy link

Copilot AI Nov 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This method lacks error handling for potential failures when parsing the parquet file (line 76) or deserializing JSON (line 81). If the cached file is corrupted or in an unexpected format, the method will raise an unhandled exception instead of falling back to the API. Consider wrapping these operations in a try-except block and returning None on error to allow fallback to the API fetch.

Suggested change
county_df = self.__bytes_service.convert_parquet_bytes_to_df(county_bytes)
if region not in county_df["region"].values:
return None
row = county_df.loc[county_df["region"] == region].iloc[0]
return row["wkb"], json.loads(row["json"])
try:
county_df = self.__bytes_service.convert_parquet_bytes_to_df(county_bytes)
if region not in county_df["region"].values:
return None
row = county_df.loc[county_df["region"] == region].iloc[0]
return row["wkb"], json.loads(row["json"])
except Exception:
return None

Copilot uses AI. Check for mistakes.
def __write_county_to_blob_storage(self, region: str, wkb: bytes, geo_json: dict[str, Any]) -> None:
    """Upsert one county's polygons into the parquet cache in blob storage.

    :param region: cache key (county id, possibly suffixed with an EPSG code).
    :param wkb: polygon geometry as WKB bytes.
    :param geo_json: polygon geometry as a GeoJSON mapping.
    """
    county_bytes = self.__blob_storage_service.download_file(
        container_name=StorageContainer.METADATA,
        blob_name=Config.COUNTY_FILE_NAME
    )

    county_df = self.__bytes_service.convert_parquet_bytes_to_df(county_bytes) \
        if (county_bytes is not None and len(county_bytes) > 0) \
        else pd.DataFrame(columns=["region", "wkb", "json"])

    # Force object dtype so raw bytes are stored as-is rather than coerced
    # or broadcast element-wise by pandas.
    county_df["wkb"] = county_df["wkb"].astype(object)

    json_string = json.dumps(geo_json)
    mask = county_df["region"] == region
    if mask.any():
        # Scalar .at assignment stores the bytes object in each matching
        # cell; .loc with a mask may try to treat bytes as a sequence.
        for idx in county_df.index[mask]:
            county_df.at[idx, "wkb"] = wkb
            county_df.at[idx, "json"] = json_string
    else:
        new_row = pd.DataFrame(
            {"region": [region], "wkb": [wkb], "json": [json_string]}
        ).astype({"wkb": object})
        county_df = pd.concat([county_df, new_row], ignore_index=True)

    buffer = BytesIO()
    county_df.to_parquet(buffer, index=False)
    buffer.seek(0)
    self.__blob_storage_service.upload_file(
        container_name=StorageContainer.METADATA,
        blob_name=Config.COUNTY_FILE_NAME,
        data=buffer.read()
    )
2 changes: 1 addition & 1 deletion src/presentation/entrypoints/release_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ def clip_and_partition_dataset_to_region(
county_service: ICountyService = Provide[Containers.county_service],
vector_service: IVectorService = Provide[Containers.vector_service],
) -> tuple[list[gpd.GeoDataFrame], list[gpd.GeoDataFrame], dict[str, Any]]:
polygon_wkb, polygon_geojson = county_service.get_county_wkb_by_id(county_id=region, epsg_code=EPSGCode.WGS84)
polygon_wkb, polygon_geojson = county_service.get_county_polygons_by_id(county_id=region, epsg_code=EPSGCode.WGS84)

osm_county_dataset = vector_service.clip_dataframes_to_wkb(osm_batches, polygon_wkb, epsg_code=EPSGCode.WGS84)
fkb_county_dataset = vector_service.clip_dataframes_to_wkb(fkb_batches, polygon_wkb, epsg_code=EPSGCode.WGS84)
Expand Down
Loading