diff --git a/.gitignore b/.gitignore index 80dc3a63..756f9f7e 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,12 @@ dist/ build/ tests/catboost_info/ tests/out/ -tests/data/ +tests/data/* +!tests/data/settings.template.json +!tests/data/nc-guilford/ +!tests/data/nc-guilford/** +!tests/data/fr-02-saint_quentin/ +!tests/data/fr-02-saint_quentin/** notebooks/pipeline/data/ notebooks/pipeline/out/ notebooks/pipeline/cache/ diff --git a/scripts/prepare_saint_quentin.py b/scripts/prepare_saint_quentin.py new file mode 100644 index 00000000..8a148fc5 --- /dev/null +++ b/scripts/prepare_saint_quentin.py @@ -0,0 +1,653 @@ +#!/usr/bin/env python3 +"""Prepare a local Saint-Quentin OpenAVMKit smoke-test dataset. + +The generated data lives under notebooks/pipeline/data/fr-02-saint_quentin, +which is ignored by git. The script writes normalized parquet inputs and a +minimal settings.json, then can run the OpenAVMKit load/process path. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import re +import sys +import types +import zipfile +from pathlib import Path + +import geopandas as gpd +import pandas as pd +import requests + + +ROOT = Path(__file__).resolve().parents[1] +if str(ROOT) not in sys.path: + sys.path.insert(0, str(ROOT)) +LOCALITY = "fr-02-saint_quentin" +DATA_DIR = ROOT / "notebooks" / "pipeline" / "data" / LOCALITY +RAW_DIR = DATA_DIR / "raw" +IN_DIR = DATA_DIR / "in" + +CADASTRE_URL = ( + "https://services1.arcgis.com/5nIW6mZeb2YNJ7np/ArcGIS/rest/services/" + "SIG_CADASTRE/FeatureServer" +) +DVF_API_URL = "https://www.data.gouv.fr/api/1/datasets/demandes-de-valeurs-foncieres/" +SQM_TO_SQFT = 10.763910416709722 +ARCGIS_PAGE_SIZE = 1000 + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument( + "--years", + nargs="+", + type=int, + default=[2024], + help="DVF years to download and filter. 
Default: 2024.", + ) + parser.add_argument("--force", action="store_true", help="Re-download/rebuild raw files.") + parser.add_argument("--skip-smoke", action="store_true", help="Only prepare files.") + return parser.parse_args() + + +def get_json(url: str, params: dict | None = None) -> dict: + response = requests.get(url, params=params, timeout=120) + response.raise_for_status() + return response.json() + + +def arcgis_count(layer_id: int) -> int: + payload = get_json( + f"{CADASTRE_URL}/{layer_id}/query", + {"where": "1=1", "returnCountOnly": "true", "f": "json"}, + ) + return int(payload["count"]) + + +def fetch_arcgis_layer(layer_id: int, name: str, force: bool) -> gpd.GeoDataFrame: + out_path = RAW_DIR / f"{name}.geojson" + if out_path.exists() and not force: + return gpd.read_file(out_path) + + count = arcgis_count(layer_id) + features: list[dict] = [] + for offset in range(0, count, ARCGIS_PAGE_SIZE): + payload = get_json( + f"{CADASTRE_URL}/{layer_id}/query", + { + "where": "1=1", + "outFields": "*", + "returnGeometry": "true", + "f": "geojson", + "outSR": "4326", + "resultOffset": offset, + "resultRecordCount": ARCGIS_PAGE_SIZE, + "orderByFields": "objectid", + }, + ) + batch = payload.get("features", []) + features.extend(batch) + print(f"{name}: fetched {len(features):,}/{count:,}") + + collection = {"type": "FeatureCollection", "features": features} + out_path.write_text(json.dumps(collection), encoding="utf-8") + return gpd.GeoDataFrame.from_features(collection, crs="EPSG:4326") + + +def clean_cell(value: object) -> str: + if pd.isna(value): + return "" + return str(value).strip() + + +def norm_key(value: object) -> str: + return re.sub(r"\s+", "", clean_cell(value)).upper() + + +def parcel_key_parts(codeident: object) -> tuple[str, str, str]: + raw = clean_cell(codeident) + if len(raw) < 6: + return "", "", "" + dep = raw[:2] + commune = raw[3:6] + return dep, commune, dep + commune + + +def normalize_parcels(parcels: gpd.GeoDataFrame, buildings: 
gpd.GeoDataFrame) -> gpd.GeoDataFrame: + parcels = parcels.copy() + parcels["key"] = parcels["codeident"].map(norm_key) + parts = parcels["codeident"].map(parcel_key_parts) + parcels["dvf_departement"] = parts.map(lambda x: x[0]) + parcels["dvf_commune"] = parts.map(lambda x: x[1]) + parcels["commune_code"] = parts.map(lambda x: x[2]) + parcels["codcomm_arcgis"] = parcels["codeident"].astype(str).str[:6] + parcels["cadastral_section"] = parcels["sect_cad"].astype(str).str.strip() + parcels["parcel_number"] = pd.to_numeric(parcels["parcelle"], errors="coerce").astype("Int64") + parcels["neighborhood"] = parcels["cadastral_section"].replace("", pd.NA).fillna("unknown") + + parcels_m = parcels.to_crs("EPSG:3949") + parcels_m["land_area_sqm"] = parcels_m.geometry.area + parcels_m["land_area_sqft"] = parcels_m["land_area_sqm"] * SQM_TO_SQFT + + buildings = buildings.copy() + buildings["building_area_footprint_sqm"] = pd.to_numeric( + buildings.get("surface"), errors="coerce" + ) + buildings_m = buildings.to_crs("EPSG:3949") + missing_area = buildings_m["building_area_footprint_sqm"].isna() + buildings_m.loc[missing_area, "building_area_footprint_sqm"] = buildings_m.loc[ + missing_area, "geometry" + ].area + buildings_m["building_area_shon_sqm"] = pd.to_numeric(buildings_m.get("shon"), errors="coerce") + buildings_pts = buildings_m.copy() + buildings_pts["geometry"] = buildings_pts.geometry.representative_point() + + joined = gpd.sjoin( + buildings_pts[["type", "building_area_footprint_sqm", "building_area_shon_sqm", "geometry"]], + parcels_m[["key", "geometry"]], + how="left", + predicate="within", + ).dropna(subset=["key"]) + + def first_mode(values: pd.Series) -> object: + modes = values.dropna().mode() + return modes.iloc[0] if len(modes) else pd.NA + + bldg_agg = joined.groupby("key").agg( + bldg_count=("key", "size"), + bldg_area_footprint_sqm=("building_area_footprint_sqm", "sum"), + bldg_area_shon_sqm=("building_area_shon_sqm", "sum"), + bldg_type=("type", 
first_mode), + ) + parcels_m = parcels_m.merge(bldg_agg, on="key", how="left") + parcels_m["bldg_count"] = parcels_m["bldg_count"].fillna(0).astype("Int64") + for col in ["bldg_area_footprint_sqm", "bldg_area_shon_sqm"]: + parcels_m[col] = parcels_m[col].fillna(0.0) + parcels_m["bldg_area_finished_sqm"] = parcels_m["bldg_area_shon_sqm"] + use_footprint = parcels_m["bldg_area_finished_sqm"].le(0) + parcels_m.loc[use_footprint, "bldg_area_finished_sqm"] = parcels_m.loc[ + use_footprint, "bldg_area_footprint_sqm" + ] + parcels_m["bldg_area_finished_sqft"] = parcels_m["bldg_area_finished_sqm"] * SQM_TO_SQFT + parcels_m["bldg_area_footprint_sqft"] = parcels_m["bldg_area_footprint_sqm"] * SQM_TO_SQFT + parcels_m["is_vacant"] = parcels_m["bldg_area_finished_sqm"].le(0) + + cols = [ + "key", + "codeident", + "codcomm_arcgis", + "dvf_departement", + "dvf_commune", + "commune_code", + "cadastral_section", + "parcel_number", + "neighborhood", + "land_area_sqm", + "land_area_sqft", + "bldg_count", + "bldg_area_footprint_sqm", + "bldg_area_footprint_sqft", + "bldg_area_finished_sqm", + "bldg_area_finished_sqft", + "bldg_type", + "is_vacant", + "geometry", + ] + return parcels_m[cols].to_crs("EPSG:4326") + + +def dvf_resources() -> dict[int, str]: + dataset = get_json(DVF_API_URL) + resources = {} + for resource in dataset.get("resources", []): + title = resource.get("title", "") + match = re.search(r"(\d{4})", title) + if resource.get("type") == "main" and match: + resources[int(match.group(1))] = resource["url"] + return resources + + +def download(url: str, path: Path, force: bool) -> None: + if path.exists() and path.stat().st_size > 0 and not force: + return + with requests.get(url, stream=True, timeout=120) as response: + response.raise_for_status() + with path.open("wb") as handle: + for chunk in response.iter_content(chunk_size=1024 * 1024): + if chunk: + handle.write(chunk) + + +def dvf_codeident(row: pd.Series) -> str: + dep = clean_cell(row.get("Code departement", 
"")).zfill(2) + commune = clean_cell(row.get("Code commune", "")).zfill(3) + prefix = clean_cell(row.get("Prefixe de section", "")) + prefix = " " if prefix in {"", "0", "00", "000", "nan", ""} else prefix.zfill(3) + section = clean_cell(row.get("Section", "")).upper().rjust(2) + plan_raw = clean_cell(row.get("No plan", "")) + try: + plan = str(int(float(plan_raw))).zfill(4) + except ValueError: + plan = plan_raw.zfill(4) + return norm_key(f"{dep}0{commune}{prefix}{section}{plan}") + + +def parse_fr_number(series: pd.Series) -> pd.Series: + cleaned = ( + series.astype("string") + .str.replace("\u00a0", "", regex=False) + .str.replace(" ", "", regex=False) + .str.replace(",", ".", regex=False) + ) + return pd.to_numeric(cleaned, errors="coerce") + + +def stable_sale_suffix(row: pd.Series) -> str: + payload = "|".join(str(row.get(col, "")) for col in row.index) + return hashlib.sha1(payload.encode("utf-8")).hexdigest()[:10] + + +def prepare_sales(parcels: gpd.GeoDataFrame, years: list[int], force: bool) -> pd.DataFrame: + out_path = IN_DIR / "sales.parquet" + required_cols = { + "valid_for_ratio_study", + "valid_for_land_ratio_study", + } + if out_path.exists() and not force: + existing = pd.read_parquet(out_path) + existing_years = set(pd.to_datetime(existing["sale_date"]).dt.year.dropna().astype(int)) + if existing_years == set(years) and required_cols.issubset(existing.columns): + return existing + + resources: dict[int, str] = {} + parcel_keys = set(parcels["key"]) + departments = set(parcels["dvf_departement"]) + communes = set(parcels["dvf_commune"]) + chunks: list[pd.DataFrame] = [] + + for year in years: + zip_path = RAW_DIR / f"valeursfoncieres-{year}.txt.zip" + print(f"DVF {year}: downloading/filtering") + if not zip_path.exists() or force: + if not resources: + resources = dvf_resources() + if year not in resources: + raise ValueError(f"No DVF resource found for {year}") + download(resources[year], zip_path, force) + with zipfile.ZipFile(zip_path) as 
archive: + names = [name for name in archive.namelist() if name.lower().endswith(".txt")] + if not names: + raise ValueError(f"No .txt file found in {zip_path}") + with archive.open(names[0]) as handle: + reader = pd.read_csv(handle, sep="|", dtype="string", chunksize=200_000) + for chunk in reader: + chunk["Code departement"] = chunk["Code departement"].str.strip().str.zfill(2) + chunk["Code commune"] = chunk["Code commune"].str.strip().str.zfill(3) + chunk = chunk[ + chunk["Code departement"].isin(departments) + & chunk["Code commune"].isin(communes) + ].copy() + if chunk.empty: + continue + chunk["key"] = chunk.apply(dvf_codeident, axis=1) + chunk = chunk[chunk["key"].isin(parcel_keys)].copy() + if chunk.empty: + continue + chunks.append(chunk) + + if not chunks: + raise ValueError("No DVF sales matched the Saint-Quentin parcel layer") + + raw = pd.concat(chunks, ignore_index=True) + raw["sale_date"] = pd.to_datetime(raw["Date mutation"], format="%d/%m/%Y", errors="coerce") + raw["sale_price"] = parse_fr_number(raw["Valeur fonciere"]) + raw["surface_reelle_bati_sqm"] = parse_fr_number(raw["Surface reelle bati"]) + raw["sale_land_area_sqm"] = parse_fr_number(raw["Surface terrain"]) + raw["rooms"] = parse_fr_number(raw["Nombre pieces principales"]) + raw["sale_nature"] = raw["Nature mutation"].astype("string") + raw["property_type"] = raw["Type local"].astype("string") + raw = raw[raw["sale_date"].notna() & raw["sale_price"].gt(0)].copy() + raw = raw[raw["sale_nature"].str.contains("Vente", case=False, na=False)].copy() + + grouped = raw.groupby( + ["key", "sale_date", "sale_price", "sale_nature"], + dropna=False, + as_index=False, + ).agg( + property_type=("property_type", lambda s: s.dropna().iloc[0] if s.dropna().size else pd.NA), + bldg_area_finished_sqm=("surface_reelle_bati_sqm", "max"), + sale_land_area_sqm=("sale_land_area_sqm", "max"), + rooms=("rooms", "max"), + ) + grouped["sale_date"] = grouped["sale_date"].dt.strftime("%Y-%m-%d") + 
grouped["bldg_area_finished_sqm"] = grouped["bldg_area_finished_sqm"].fillna(0.0) + grouped["bldg_area_finished_sqft"] = grouped["bldg_area_finished_sqm"] * SQM_TO_SQFT + grouped["valid_sale"] = True + grouped = grouped.merge(parcels[["key", "is_vacant"]], on="key", how="left") + current_is_vacant = grouped["is_vacant"].fillna(False) + sale_looks_vacant = grouped["bldg_area_finished_sqm"].le(0) + grouped["vacant_sale"] = sale_looks_vacant + grouped["valid_for_ratio_study"] = grouped["valid_sale"] & sale_looks_vacant.eq( + current_is_vacant + ) + grouped["valid_for_land_ratio_study"] = grouped["valid_sale"] & grouped["vacant_sale"] + grouped = grouped.drop(columns=["is_vacant"]) + grouped["sale_hash"] = grouped.apply(stable_sale_suffix, axis=1) + grouped["key_sale"] = ( + grouped["key"] + + "---" + + grouped["sale_date"] + + "---" + + grouped["sale_hash"] + ) + + cols = [ + "key_sale", + "key", + "sale_date", + "sale_price", + "sale_nature", + "property_type", + "bldg_area_finished_sqm", + "bldg_area_finished_sqft", + "sale_land_area_sqm", + "rooms", + "valid_sale", + "vacant_sale", + "valid_for_ratio_study", + "valid_for_land_ratio_study", + ] + sales = grouped[cols].sort_values(["sale_date", "key_sale"]).reset_index(drop=True) + sales.to_parquet(out_path, index=False) + return sales + + +def write_settings(years: list[int]) -> None: + main_model_features = [ + "land_area_sqm", + "bldg_area_finished_sqm", + "bldg_area_footprint_sqm", + "bldg_count", + "neighborhood", + "commune_code", + "cadastral_section", + ] + vacant_model_features = [ + "land_area_sqm", + "neighborhood", + "commune_code", + "cadastral_section", + ] + naive_area_features = [ + "land_area_sqm", + "bldg_area_finished_sqm", + ] + main_area_models = { + "default": { + "ind_vars": main_model_features, + }, + "naive_area": { + "model": "naive_area", + "ind_vars": naive_area_features, + }, + "local_area": { + "model": "local_area", + "ind_vars": main_model_features, + "locations": ["neighborhood", 
"commune_code", "cadastral_section"], + }, + "lightgbm": { + "engine": "lightgbm", + "model": "lightgbm", + "ind_vars": main_model_features, + "n_trials": 10, + }, + } + vacant_area_models = { + "default": { + "ind_vars": vacant_model_features, + }, + "naive_area": { + "model": "naive_area", + "ind_vars": naive_area_features, + }, + "local_area": { + "model": "local_area", + "ind_vars": vacant_model_features, + "locations": ["neighborhood", "commune_code", "cadastral_section"], + }, + "lightgbm": { + "engine": "lightgbm", + "model": "lightgbm", + "ind_vars": vacant_model_features, + "n_trials": 10, + }, + } + settings = { + "locality": { + "name": "Saint-Quentin", + "country": "FR", + "state": "02", + "slug": LOCALITY, + "units": "metric", + }, + "data": { + "load": { + "geo_parcels": { + "key": "geo_parcels", + "filename": "parcels.parquet", + "dupes": { + "subset": ["key"], + "sort_by": ["key", "asc"], + "drop": True, + }, + "load": { + "key": ["key", "string"], + "codeident": ["codeident", "string"], + "commune_code": ["commune_code", "string"], + "cadastral_section": ["cadastral_section", "string"], + "neighborhood": ["neighborhood", "string"], + "land_area_sqm": ["land_area_sqm", "float"], + "land_area_sqft": ["land_area_sqft", "float"], + "bldg_count": ["bldg_count", "float"], + "bldg_area_footprint_sqm": ["bldg_area_footprint_sqm", "float"], + "bldg_area_footprint_sqft": ["bldg_area_footprint_sqft", "float"], + "bldg_area_finished_sqm": ["bldg_area_finished_sqm", "float"], + "bldg_area_finished_sqft": ["bldg_area_finished_sqft", "float"], + "bldg_type": ["bldg_type", "string"], + "is_vacant": ["is_vacant", "boolean", "na_false"], + }, + }, + "sales": { + "key": "sales", + "filename": "sales.parquet", + "geometry": False, + "dupes": { + "subset": ["key_sale"], + "sort_by": ["key_sale", "asc"], + "drop": True, + }, + "load": { + "key_sale": ["key_sale", "string"], + "key": ["key", "string"], + "sale_date": ["sale_date", "datetime", "%Y-%m-%d"], + "sale_price": 
["sale_price", "float"], + "sale_nature": ["sale_nature", "string"], + "property_type": ["property_type", "string"], + "bldg_area_finished_sqm": ["bldg_area_finished_sqm", "float"], + "bldg_area_finished_sqft": ["bldg_area_finished_sqft", "float"], + "sale_land_area_sqm": ["sale_land_area_sqm", "float"], + "rooms": ["rooms", "float"], + "valid_sale": ["valid_sale", "boolean", "na_false"], + "vacant_sale": ["vacant_sale", "boolean", "na_false"], + "valid_for_ratio_study": [ + "valid_for_ratio_study", + "boolean", + "na_false", + ], + "valid_for_land_ratio_study": [ + "valid_for_land_ratio_study", + "boolean", + "na_false", + ], + }, + }, + }, + "process": { + "merge": { + "universe": ["geo_parcels"], + "sales": ["sales"], + }, + "enrich": {}, + }, + }, + "modeling": { + "metadata": { + "valuation_date": f"{max(years) + 1}-01-01", + "use_sales_from": min(years), + "test_sales_from": max(years), + "modeler": "Saint-Quentin metric test", + }, + "model_groups": { + "all": { + "name": "All parcels", + "filter": [">=", "land_area_sqm", 0], + }, + }, + "instructions": { + "dep_var": "sale_price_time_adj", + "dep_var_test": "sale_price_time_adj", + "time_adjustment": { + "period": "M", + }, + "main": { + "run": ["naive_area", "local_area", "lightgbm"], + }, + "vacant": { + "run": ["naive_area", "local_area", "lightgbm"], + }, + "hedonic": { + "skip": {"all": ["all"]}, + }, + }, + "models": { + "main": main_area_models, + "vacant": vacant_area_models, + "default": { + "ind_vars": main_model_features, + } + }, + }, + "field_classification": { + "important": { + "fields": { + "loc_neighborhood": "neighborhood", + "land_category": "cadastral_section", + "impr_category": "property_type", + }, + "locations": ["neighborhood", "commune_code", "cadastral_section"], + "report_locations": ["neighborhood", "commune_code"], + }, + "land": { + "+numeric": ["land_area_sqm", "land_area_sqft"], + "+categorical": ["commune_code", "cadastral_section", "neighborhood"], + }, + "impr": { + 
"+numeric": [ + "bldg_count", + "bldg_area_footprint_sqm", + "bldg_area_footprint_sqft", + "bldg_area_finished_sqm", + "bldg_area_finished_sqft", + "rooms", + ], + "+categorical": ["bldg_type", "property_type"], + }, + "other": { + "+categorical": ["codeident", "sale_nature"], + "+numeric": ["sale_land_area_sqm"], + }, + }, + } + settings_path = IN_DIR / "settings.json" + settings_path.write_text(json.dumps(settings, indent=2, ensure_ascii=False) + "\n", encoding="utf-8") + + +def install_optional_import_stubs() -> None: + if "census" not in sys.modules: + census_mod = types.ModuleType("census") + + class Census: # pragma: no cover - smoke-test import shim + pass + + census_mod.Census = Census + sys.modules["census"] = census_mod + for name in ["lightgbm", "xgboost"]: + sys.modules.setdefault(name, types.ModuleType(name)) + + +def smoke_test() -> None: + install_optional_import_stubs() + from openavmkit.data import load_dataframe, process_data + from openavmkit.utilities.settings import ( + get_fields_boolean, + get_fields_categorical, + get_fields_numeric, + load_settings, + ) + + old_cwd = Path.cwd() + try: + import os + + os.chdir(DATA_DIR) + settings = load_settings("in/settings.json") + fields_cat = get_fields_categorical(settings, include_boolean=False) + fields_bool = get_fields_boolean(settings) + fields_num = get_fields_numeric(settings, include_boolean=False) + dataframes = { + key: load_dataframe( + entry, + settings, + verbose=True, + fields_cat=fields_cat, + fields_bool=fields_bool, + fields_num=fields_num, + ) + for key, entry in settings["data"]["load"].items() + } + sup = process_data(dataframes, settings, verbose=True) + print( + "SMOKE OK: " + f"universe={len(sup.universe):,}, " + f"sales={len(sup.sales):,}, " + f"vacant_universe={int(sup.universe['is_vacant'].sum()):,}, " + f"vacant_sales={int(sup.sales['vacant_sale'].sum()):,}" + ) + finally: + import os + + os.chdir(old_cwd) + + +def main() -> None: + args = parse_args() + 
RAW_DIR.mkdir(parents=True, exist_ok=True) + IN_DIR.mkdir(parents=True, exist_ok=True) + + parcels_raw = fetch_arcgis_layer(1, "cadastre_parcels", args.force) + buildings_raw = fetch_arcgis_layer(0, "cadastre_buildings", args.force) + parcels = normalize_parcels(parcels_raw, buildings_raw) + parcels.to_parquet(IN_DIR / "parcels.parquet", index=False) + sales = prepare_sales(parcels, args.years, args.force) + write_settings(args.years) + + print(f"Prepared {len(parcels):,} parcels and {len(sales):,} sales for {LOCALITY}") + if not args.skip_smoke: + smoke_test() + + +if __name__ == "__main__": + main() diff --git a/tests/data/fr-02-saint_quentin/README.md b/tests/data/fr-02-saint_quentin/README.md new file mode 100644 index 00000000..2ff076e7 --- /dev/null +++ b/tests/data/fr-02-saint_quentin/README.md @@ -0,0 +1,27 @@ +# Saint-Quentin fixture + +This directory contains the lightweight OpenAVMKit settings fixture for the +Saint-Quentin, Aisne, France metric jurisdiction. + +The raw and generated data are intentionally not committed. Regeneration writes +to the ignored working directory: + +```text +notebooks/pipeline/data/fr-02-saint_quentin/ +``` + +Public data sources: + +- Cadastre parcels and buildings: `https://services1.arcgis.com/5nIW6mZeb2YNJ7np/ArcGIS/rest/services/SIG_CADASTRE/FeatureServer` +- DVF sales data API: `https://www.data.gouv.fr/api/1/datasets/demandes-de-valeurs-foncieres/` + +Local full preparation command: + +```bash +python scripts/prepare_saint_quentin.py --years 2021 2022 2023 2024 2025 +``` + +The fixture uses metric units, monthly time adjustment, a single `all` model +group, and main/vacant `naive_area`, `local_area`, and `lightgbm` runs. It +exercises the European metric data path without requiring large raw files in +git. 
diff --git a/tests/data/fr-02-saint_quentin/settings.json b/tests/data/fr-02-saint_quentin/settings.json new file mode 100644 index 00000000..88bb2a1b --- /dev/null +++ b/tests/data/fr-02-saint_quentin/settings.json @@ -0,0 +1,379 @@ +{ + "locality": { + "name": "Saint-Quentin", + "country": "FR", + "state": "02", + "slug": "fr-02-saint_quentin", + "units": "metric" + }, + "data": { + "load": { + "geo_parcels": { + "key": "geo_parcels", + "filename": "parcels.parquet", + "dupes": { + "subset": [ + "key" + ], + "sort_by": [ + "key", + "asc" + ], + "drop": true + }, + "load": { + "key": [ + "key", + "string" + ], + "codeident": [ + "codeident", + "string" + ], + "commune_code": [ + "commune_code", + "string" + ], + "cadastral_section": [ + "cadastral_section", + "string" + ], + "neighborhood": [ + "neighborhood", + "string" + ], + "land_area_sqm": [ + "land_area_sqm", + "float" + ], + "land_area_sqft": [ + "land_area_sqft", + "float" + ], + "bldg_count": [ + "bldg_count", + "float" + ], + "bldg_area_footprint_sqm": [ + "bldg_area_footprint_sqm", + "float" + ], + "bldg_area_footprint_sqft": [ + "bldg_area_footprint_sqft", + "float" + ], + "bldg_area_finished_sqm": [ + "bldg_area_finished_sqm", + "float" + ], + "bldg_area_finished_sqft": [ + "bldg_area_finished_sqft", + "float" + ], + "bldg_type": [ + "bldg_type", + "string" + ], + "is_vacant": [ + "is_vacant", + "boolean", + "na_false" + ] + } + }, + "sales": { + "key": "sales", + "filename": "sales.parquet", + "geometry": false, + "dupes": { + "subset": [ + "key_sale" + ], + "sort_by": [ + "key_sale", + "asc" + ], + "drop": true + }, + "load": { + "key_sale": [ + "key_sale", + "string" + ], + "key": [ + "key", + "string" + ], + "sale_date": [ + "sale_date", + "datetime", + "%Y-%m-%d" + ], + "sale_price": [ + "sale_price", + "float" + ], + "sale_nature": [ + "sale_nature", + "string" + ], + "property_type": [ + "property_type", + "string" + ], + "bldg_area_finished_sqm": [ + "bldg_area_finished_sqm", + "float" + ], + 
"bldg_area_finished_sqft": [ + "bldg_area_finished_sqft", + "float" + ], + "sale_land_area_sqm": [ + "sale_land_area_sqm", + "float" + ], + "rooms": [ + "rooms", + "float" + ], + "valid_sale": [ + "valid_sale", + "boolean", + "na_false" + ], + "vacant_sale": [ + "vacant_sale", + "boolean", + "na_false" + ], + "valid_for_ratio_study": [ + "valid_for_ratio_study", + "boolean", + "na_false" + ], + "valid_for_land_ratio_study": [ + "valid_for_land_ratio_study", + "boolean", + "na_false" + ] + } + } + }, + "process": { + "merge": { + "universe": [ + "geo_parcels" + ], + "sales": [ + "sales" + ] + }, + "enrich": {} + } + }, + "modeling": { + "metadata": { + "valuation_date": "2026-01-01", + "use_sales_from": 2021, + "test_sales_from": 2025, + "modeler": "Saint-Quentin metric test" + }, + "model_groups": { + "all": { + "name": "All parcels", + "filter": [ + ">=", + "land_area_sqm", + 0 + ] + } + }, + "instructions": { + "dep_var": "sale_price_time_adj", + "dep_var_test": "sale_price_time_adj", + "time_adjustment": { + "period": "M" + }, + "main": { + "run": [ + "naive_area", + "local_area", + "lightgbm" + ] + }, + "vacant": { + "run": [ + "naive_area", + "local_area", + "lightgbm" + ] + }, + "hedonic": { + "skip": { + "all": [ + "all" + ] + } + } + }, + "models": { + "main": { + "default": { + "ind_vars": [ + "land_area_sqm", + "bldg_area_finished_sqm", + "bldg_area_footprint_sqm", + "bldg_count", + "neighborhood", + "commune_code", + "cadastral_section" + ] + }, + "naive_area": { + "model": "naive_area", + "ind_vars": [ + "land_area_sqm", + "bldg_area_finished_sqm" + ] + }, + "local_area": { + "model": "local_area", + "ind_vars": [ + "land_area_sqm", + "bldg_area_finished_sqm", + "bldg_area_footprint_sqm", + "bldg_count", + "neighborhood", + "commune_code", + "cadastral_section" + ], + "locations": [ + "neighborhood", + "commune_code", + "cadastral_section" + ] + }, + "lightgbm": { + "engine": "lightgbm", + "model": "lightgbm", + "ind_vars": [ + "land_area_sqm", + 
"bldg_area_finished_sqm", + "bldg_area_footprint_sqm", + "bldg_count", + "neighborhood", + "commune_code", + "cadastral_section" + ], + "n_trials": 10 + } + }, + "vacant": { + "default": { + "ind_vars": [ + "land_area_sqm", + "neighborhood", + "commune_code", + "cadastral_section" + ] + }, + "naive_area": { + "model": "naive_area", + "ind_vars": [ + "land_area_sqm", + "bldg_area_finished_sqm" + ] + }, + "local_area": { + "model": "local_area", + "ind_vars": [ + "land_area_sqm", + "neighborhood", + "commune_code", + "cadastral_section" + ], + "locations": [ + "neighborhood", + "commune_code", + "cadastral_section" + ] + }, + "lightgbm": { + "engine": "lightgbm", + "model": "lightgbm", + "ind_vars": [ + "land_area_sqm", + "neighborhood", + "commune_code", + "cadastral_section" + ], + "n_trials": 10 + } + }, + "default": { + "ind_vars": [ + "land_area_sqm", + "bldg_area_finished_sqm", + "bldg_area_footprint_sqm", + "bldg_count", + "neighborhood", + "commune_code", + "cadastral_section" + ] + } + } + }, + "field_classification": { + "important": { + "fields": { + "loc_neighborhood": "neighborhood", + "land_category": "cadastral_section", + "impr_category": "property_type" + }, + "locations": [ + "neighborhood", + "commune_code", + "cadastral_section" + ], + "report_locations": [ + "neighborhood", + "commune_code" + ] + }, + "land": { + "+numeric": [ + "land_area_sqm", + "land_area_sqft" + ], + "+categorical": [ + "commune_code", + "cadastral_section", + "neighborhood" + ] + }, + "impr": { + "+numeric": [ + "bldg_count", + "bldg_area_footprint_sqm", + "bldg_area_footprint_sqft", + "bldg_area_finished_sqm", + "bldg_area_finished_sqft", + "rooms" + ], + "+categorical": [ + "bldg_type", + "property_type" + ] + }, + "other": { + "+categorical": [ + "codeident", + "sale_nature" + ], + "+numeric": [ + "sale_land_area_sqm" + ] + } + } +} diff --git a/tests/test_saint_quentin_fixture.py b/tests/test_saint_quentin_fixture.py new file mode 100644 index 00000000..c3fe80a8 --- 
/dev/null
+++ b/tests/test_saint_quentin_fixture.py
@@ -0,0 +1,51 @@
+from pathlib import Path
+
+from openavmkit.utilities.settings import load_settings
+
+
+def test_saint_quentin_fixture_loads_with_metric_model_config():
+    settings_path = (
+        Path(__file__).parent / "data" / "fr-02-saint_quentin" / "settings.json"
+    )
+
+    settings = load_settings(settings_file=str(settings_path))
+
+    assert settings["locality"]["slug"] == "fr-02-saint_quentin"
+    assert settings["locality"]["units"] == "metric"
+
+    model_groups = settings["modeling"]["model_groups"]
+    assert model_groups["all"]["filter"] == [">=", "land_area_sqm", 0]
+
+    instructions = settings["modeling"]["instructions"]
+    assert instructions["dep_var"] == "sale_price_time_adj"
+    assert instructions["dep_var_test"] == "sale_price_time_adj"
+    assert instructions["time_adjustment"]["period"] == "M"
+    assert instructions["main"]["run"] == ["naive_area", "local_area", "lightgbm"]
+    assert instructions["vacant"]["run"] == ["naive_area", "local_area", "lightgbm"]
+
+    naive_area = settings["modeling"]["models"]["main"]["naive_area"]
+    assert naive_area["model"] == "naive_area"
+    assert "land_area_sqm" in naive_area["ind_vars"]
+    assert "bldg_area_finished_sqm" in naive_area["ind_vars"]
+
+    lightgbm = settings["modeling"]["models"]["main"]["lightgbm"]
+    assert lightgbm["model"] == "lightgbm"
+    assert lightgbm["engine"] == "lightgbm"
+    assert lightgbm["n_trials"] == 10
+    assert "neighborhood" in lightgbm["ind_vars"]
+
+    vacant_lightgbm = settings["modeling"]["models"]["vacant"]["lightgbm"]
+    assert "land_area_sqm" in vacant_lightgbm["ind_vars"]
+    assert "bldg_area_finished_sqm" not in vacant_lightgbm["ind_vars"]
+
+    sales_load = settings["data"]["load"]["sales"]["load"]
+    assert sales_load["valid_for_ratio_study"] == [
+        "valid_for_ratio_study",
+        "boolean",
+        "na_false",
+    ]
+    assert sales_load["valid_for_land_ratio_study"] == [
+        "valid_for_land_ratio_study",
+        "boolean",
+        "na_false",
+    ]