diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29..76d27ba 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,6 @@
+- bump: minor
+  changes:
+    added:
+    - Simulation.ensure, load_datasets and ensure_datasets for reusing saved outputs and datasets.
+    changed:
+    - Datasets constructed with a filepath and no data no longer ignore a missing file.
diff --git a/src/policyengine/core/simulation.py b/src/policyengine/core/simulation.py
index f7c214e..0a5c106 100644
--- a/src/policyengine/core/simulation.py
+++ b/src/policyengine/core/simulation.py
@@ -24,6 +24,29 @@ class Simulation(BaseModel):
     def run(self):
         self.tax_benefit_model_version.run(self)
 
+    def ensure(self):
+        """Load the saved output dataset, or run and save if loading fails.
+
+        A missing output file is the expected cache miss. Any other load
+        failure also falls back to a fresh run, but is logged first so that
+        genuine errors are not silently hidden.
+        """
+        try:
+            self.tax_benefit_model_version.load(self)
+            return
+        except FileNotFoundError:
+            pass  # Nothing saved yet: fall through and compute the output.
+        except Exception:
+            import logging
+
+            logging.getLogger(__name__).warning(
+                "Could not load saved output for simulation %s; re-running",
+                self.id,
+                exc_info=True,
+            )
+        self.run()
+        self.save()
+
     def save(self):
         """Save the simulation's output dataset."""
         self.tax_benefit_model_version.save(self)
diff --git a/src/policyengine/tax_benefit_models/uk.py b/src/policyengine/tax_benefit_models/uk.py
index a9fb102..d6c1ad3 100644
--- a/src/policyengine/tax_benefit_models/uk.py
+++ b/src/policyengine/tax_benefit_models/uk.py
@@ -1,33 +1,38 @@
 """PolicyEngine UK tax-benefit model - imports from uk/ module."""
 
-from .uk import (
-    PolicyEngineUK,
-    PolicyEngineUKDataset,
-    PolicyEngineUKLatest,
-    ProgrammeStatistics,
-    UKYearData,
-    create_datasets,
-    general_policy_reform_analysis,
-    uk_latest,
-    uk_model,
-)
+from importlib.util import find_spec
 
-__all__ = [
-    "UKYearData",
-    "PolicyEngineUKDataset",
-    "create_datasets",
-    "PolicyEngineUK",
-    "PolicyEngineUKLatest",
-    "uk_model",
-    "uk_latest",
-    "general_policy_reform_analysis",
-    "ProgrammeStatistics",
-]
+if find_spec("policyengine_uk") is not None:
+    from .uk import (
+        PolicyEngineUK,
+        PolicyEngineUKDataset,
+        PolicyEngineUKLatest,
+        ProgrammeStatistics,
+        UKYearData,
+        create_datasets,
+        ensure_datasets,
+        general_policy_reform_analysis,
+        load_datasets,
+        uk_latest,
+        uk_model,
+    )
 
-# Rebuild models to resolve forward references
-from policyengine.core import Dataset
+    __all__ = [
+        "UKYearData",
+        "PolicyEngineUKDataset",
+        "create_datasets",
+        "load_datasets",
+        "ensure_datasets",
+        "PolicyEngineUK",
+        "PolicyEngineUKLatest",
+        "uk_model",
+        "uk_latest",
+        "general_policy_reform_analysis",
+        "ProgrammeStatistics",
+    ]
 
-Dataset.model_rebuild()
-UKYearData.model_rebuild()
-PolicyEngineUKDataset.model_rebuild()
-PolicyEngineUKLatest.model_rebuild()
+    # Rebuild models to resolve forward references
+    PolicyEngineUKDataset.model_rebuild()
+    PolicyEngineUKLatest.model_rebuild()
+else:
+    __all__ = []
diff --git a/src/policyengine/tax_benefit_models/uk/__init__.py b/src/policyengine/tax_benefit_models/uk/__init__.py
index ade6e53..d933589 100644
--- a/src/policyengine/tax_benefit_models/uk/__init__.py
+++ b/src/policyengine/tax_benefit_models/uk/__init__.py
@@ -1,26 +1,44 @@
 """PolicyEngine UK tax-benefit model."""
 
-from .analysis import general_policy_reform_analysis
-from .datasets import PolicyEngineUKDataset, UKYearData, create_datasets
-from .model import PolicyEngineUK, PolicyEngineUKLatest, uk_latest, uk_model
-from .outputs import ProgrammeStatistics
+from importlib.util import find_spec
 
-__all__ = [
-    "UKYearData",
-    "PolicyEngineUKDataset",
-    "create_datasets",
-    "PolicyEngineUK",
-    "PolicyEngineUKLatest",
-    "uk_model",
-    "uk_latest",
-    "general_policy_reform_analysis",
-    "ProgrammeStatistics",
-]
+if find_spec("policyengine_uk") is not None:
+    from policyengine.core import Dataset
 
-# Rebuild models to resolve forward references
-from policyengine.core import Dataset
+    from .analysis import general_policy_reform_analysis
+    from .datasets import (
+        PolicyEngineUKDataset,
+        UKYearData,
+        create_datasets,
+        ensure_datasets,
+        load_datasets,
+    )
+    from .model import (
+        PolicyEngineUK,
+        PolicyEngineUKLatest,
+        uk_latest,
+        uk_model,
+    )
+    from .outputs import ProgrammeStatistics
 
-Dataset.model_rebuild()
-UKYearData.model_rebuild()
-PolicyEngineUKDataset.model_rebuild()
-PolicyEngineUKLatest.model_rebuild()
+    # Rebuild Pydantic models to resolve forward references
+    Dataset.model_rebuild()
+    UKYearData.model_rebuild()
+    PolicyEngineUKDataset.model_rebuild()
+    PolicyEngineUKLatest.model_rebuild()
+
+    __all__ = [
+        "UKYearData",
+        "PolicyEngineUKDataset",
+        "create_datasets",
+        "load_datasets",
+        "ensure_datasets",
+        "PolicyEngineUK",
+        "PolicyEngineUKLatest",
+        "uk_model",
+        "uk_latest",
+        "general_policy_reform_analysis",
+        "ProgrammeStatistics",
+    ]
+else:
+    __all__ = []
diff --git a/src/policyengine/tax_benefit_models/uk/datasets.py b/src/policyengine/tax_benefit_models/uk/datasets.py
index bdf89d9..138ee78 100644
--- a/src/policyengine/tax_benefit_models/uk/datasets.py
+++ b/src/policyengine/tax_benefit_models/uk/datasets.py
@@ -37,11 +37,7 @@ def model_post_init(self, __context):
         if self.data is not None:
             self.save()
         elif self.filepath and not self.data:
-            try:
-                self.load()
-            except FileNotFoundError:
-                # File doesn't exist yet, that's OK
-                pass
+            self.load()
 
     def save(self) -> None:
         """Save dataset to HDF5 file."""
@@ -85,7 +81,9 @@ def create_datasets(
         "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5",
     ],
     years: list[int] = [2026, 2027, 2028, 2029, 2030],
-) -> None:
+    data_folder: str = "./data",
+) -> dict[str, PolicyEngineUKDataset]:
+    result = {}
     for dataset in datasets:
         from policyengine_uk import Microsimulation
 
@@ -139,9 +137,10 @@ def create_datasets(
             )
 
             uk_dataset = PolicyEngineUKDataset(
+                id=f"{Path(dataset).stem}_year_{year}",
                 name=f"{dataset}-year-{year}",
                 description=f"UK Dataset for year {year} based on {dataset}",
-                filepath=f"./data/{Path(dataset).stem}_year_{year}.h5",
+                filepath=f"{data_folder}/{Path(dataset).stem}_year_{year}.h5",
                 year=year,
                 data=UKYearData(
                     person=MicroDataFrame(person_df, weights="person_weight"),
@@ -154,3 +153,71 @@ def create_datasets(
                 ),
             )
             uk_dataset.save()
+
+            dataset_key = f"{Path(dataset).stem}_{year}"
+            result[dataset_key] = uk_dataset
+
+    return result
+
+
+def load_datasets(
+    datasets: list[str] = [
+        "hf://policyengine/policyengine-uk-data/frs_2023_24.h5",
+        "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5",
+    ],
+    years: list[int] = [2026, 2027, 2028, 2029, 2030],
+    data_folder: str = "./data",
+) -> dict[str, PolicyEngineUKDataset]:
+    """Load PolicyEngineUKDataset instances from saved HDF5 files.
+
+    Args:
+        datasets: List of HuggingFace dataset paths (used to derive file names)
+        years: List of years to load data for
+        data_folder: Directory containing the dataset files
+
+    Returns:
+        Dictionary mapping dataset keys (e.g., "enhanced_frs_2023_24_2026") to PolicyEngineUKDataset objects
+    """
+    result = {}
+    for dataset in datasets:
+        stem = Path(dataset).stem
+        for year in years:
+            # Constructing the dataset with a filepath and no data already
+            # loads the file (see model_post_init), so no load() call here.
+            result[f"{stem}_{year}"] = PolicyEngineUKDataset(
+                id=f"{stem}_year_{year}",
+                name=f"{dataset}-year-{year}",
+                description=f"UK Dataset for year {year} based on {dataset}",
+                filepath=f"{data_folder}/{stem}_year_{year}.h5",
+                year=year,
+            )
+
+    return result
+
+
+def ensure_datasets(
+    datasets: list[str] = [
+        "hf://policyengine/policyengine-uk-data/frs_2023_24.h5",
+        "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5",
+    ],
+    years: list[int] = [2026, 2027, 2028, 2029, 2030],
+    data_folder: str = "./data",
+) -> dict[str, PolicyEngineUKDataset]:
+    """Ensure datasets exist, loading if available or creating if not.
+
+    Args:
+        datasets: List of HuggingFace dataset paths
+        years: List of years to load/create data for
+        data_folder: Directory containing or to save the dataset files
+
+    Returns:
+        Dictionary mapping dataset keys to PolicyEngineUKDataset objects
+    """
+    # All-or-nothing: if any file is missing, rebuild the whole set.
+    all_exist = all(
+        Path(f"{data_folder}/{Path(dataset).stem}_year_{year}.h5").exists()
+        for dataset in datasets
+        for year in years
+    )
+    build = load_datasets if all_exist else create_datasets
+    return build(datasets=datasets, years=years, data_folder=data_folder)
diff --git a/src/policyengine/tax_benefit_models/uk/model.py b/src/policyengine/tax_benefit_models/uk/model.py
index 571ceb1..abeee8f 100644
--- a/src/policyengine/tax_benefit_models/uk/model.py
+++ b/src/policyengine/tax_benefit_models/uk/model.py
@@ -265,17 +265,29 @@ def save(self, simulation: "Simulation"):
 
     def load(self, simulation: "Simulation"):
         """Load the simulation's output dataset."""
+        import os
+
+        filepath = str(
+            Path(simulation.dataset.filepath).parent / (simulation.id + ".h5")
+        )
+
         simulation.output_dataset = PolicyEngineUKDataset(
             id=simulation.id,
             name=simulation.dataset.name,
             description=simulation.dataset.description,
-            filepath=str(
-                Path(simulation.dataset.filepath).parent
-                / (simulation.id + ".h5")
-            ),
+            filepath=filepath,
             year=simulation.dataset.year,
             is_output_dataset=True,
         )
 
+        # Load timestamps from file system metadata
+        if os.path.exists(filepath):
+            simulation.created_at = datetime.datetime.fromtimestamp(
+                os.path.getctime(filepath)
+            )
+            simulation.updated_at = datetime.datetime.fromtimestamp(
+                os.path.getmtime(filepath)
+            )
+
 
 uk_latest = PolicyEngineUKLatest()
diff --git a/src/policyengine/tax_benefit_models/us.py b/src/policyengine/tax_benefit_models/us.py
index c915a3b..3cf6264 100644
--- a/src/policyengine/tax_benefit_models/us.py
+++ b/src/policyengine/tax_benefit_models/us.py
@@ -9,7 +9,10 @@
         PolicyEngineUSLatest,
         ProgramStatistics,
         USYearData,
+        create_datasets,
+        ensure_datasets,
         general_policy_reform_analysis,
+        load_datasets,
         us_latest,
         us_model,
     )
@@ -17,6 +20,9 @@
     __all__ = [
         "USYearData",
         "PolicyEngineUSDataset",
+        "create_datasets",
+        "load_datasets",
+        "ensure_datasets",
         "PolicyEngineUS",
         "PolicyEngineUSLatest",
         "us_model",
diff --git a/src/policyengine/tax_benefit_models/us/__init__.py b/src/policyengine/tax_benefit_models/us/__init__.py
index 6336178..26d9da9 100644
--- a/src/policyengine/tax_benefit_models/us/__init__.py
+++ b/src/policyengine/tax_benefit_models/us/__init__.py
@@ -6,7 +6,13 @@
     from policyengine.core import Dataset
 
     from .analysis import general_policy_reform_analysis
-    from .datasets import PolicyEngineUSDataset, USYearData, create_datasets
+    from .datasets import (
+        PolicyEngineUSDataset,
+        USYearData,
+        create_datasets,
+        ensure_datasets,
+        load_datasets,
+    )
     from .model import (
         PolicyEngineUS,
         PolicyEngineUSLatest,
@@ -25,6 +31,8 @@
         "USYearData",
         "PolicyEngineUSDataset",
         "create_datasets",
+        "load_datasets",
+        "ensure_datasets",
         "PolicyEngineUS",
         "PolicyEngineUSLatest",
         "us_model",
diff --git a/src/policyengine/tax_benefit_models/us/datasets.py b/src/policyengine/tax_benefit_models/us/datasets.py
index 53643cc..f6f64db 100644
--- a/src/policyengine/tax_benefit_models/us/datasets.py
+++ b/src/policyengine/tax_benefit_models/us/datasets.py
@@ -44,11 +44,7 @@ def model_post_init(self, __context) -> None:
         if self.data is not None:
             self.save()
         elif self.filepath and not self.data:
-            try:
-                self.load()
-            except FileNotFoundError:
-                # File doesn't exist yet, that's OK
-                pass
+            self.load()
 
     def save(self) -> None:
         """Save dataset to HDF5 file."""
@@ -112,15 +108,21 @@ def create_datasets(
         "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
     ],
     years: list[int] = [2024, 2025, 2026, 2027, 2028],
-) -> None:
+    data_folder: str = "./data",
+) -> dict[str, PolicyEngineUSDataset]:
     """Create PolicyEngineUSDataset instances from HuggingFace dataset paths.
 
     Args:
         datasets: List of HuggingFace dataset paths (e.g., "hf://policyengine/policyengine-us-data/cps_2024.h5")
         years: List of years to extract data for
+        data_folder: Directory to save the dataset files
+
+    Returns:
+        Dictionary mapping dataset keys (e.g., "enhanced_cps_2024_2026") to PolicyEngineUSDataset objects
     """
     from policyengine_us import Microsimulation
 
+    result = {}
     for dataset in datasets:
         sim = Microsimulation(dataset=dataset)
 
@@ -263,9 +265,10 @@ def create_datasets(
                 tax_unit_df = entity_df
 
             us_dataset = PolicyEngineUSDataset(
+                id=f"{Path(dataset).stem}_year_{year}",
                 name=f"{dataset}-year-{year}",
                 description=f"US Dataset for year {year} based on {dataset}",
-                filepath=f"./data/{Path(dataset).stem}_year_{year}.h5",
+                filepath=f"{data_folder}/{Path(dataset).stem}_year_{year}.h5",
                 year=year,
                 data=USYearData(
                     person=MicroDataFrame(person_df, weights="person_weight"),
@@ -285,3 +288,69 @@ def create_datasets(
                 ),
             )
             us_dataset.save()
+
+            dataset_key = f"{Path(dataset).stem}_{year}"
+            result[dataset_key] = us_dataset
+
+    return result
+
+
+def load_datasets(
+    datasets: list[str] = [
+        "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+    ],
+    years: list[int] = [2024, 2025, 2026, 2027, 2028],
+    data_folder: str = "./data",
+) -> dict[str, PolicyEngineUSDataset]:
+    """Load PolicyEngineUSDataset instances from saved HDF5 files.
+
+    Args:
+        datasets: List of HuggingFace dataset paths (used to derive file names)
+        years: List of years to load data for
+        data_folder: Directory containing the dataset files
+
+    Returns:
+        Dictionary mapping dataset keys (e.g., "enhanced_cps_2024_2026") to PolicyEngineUSDataset objects
+    """
+    result = {}
+    for dataset in datasets:
+        stem = Path(dataset).stem
+        for year in years:
+            # Constructing the dataset with a filepath and no data already
+            # loads the file (see model_post_init), so no load() call here.
+            result[f"{stem}_{year}"] = PolicyEngineUSDataset(
+                id=f"{stem}_year_{year}",
+                name=f"{dataset}-year-{year}",
+                description=f"US Dataset for year {year} based on {dataset}",
+                filepath=f"{data_folder}/{stem}_year_{year}.h5",
+                year=year,
+            )
+
+    return result
+
+
+def ensure_datasets(
+    datasets: list[str] = [
+        "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
+    ],
+    years: list[int] = [2024, 2025, 2026, 2027, 2028],
+    data_folder: str = "./data",
+) -> dict[str, PolicyEngineUSDataset]:
+    """Ensure datasets exist, loading if available or creating if not.
+
+    Args:
+        datasets: List of HuggingFace dataset paths
+        years: List of years to load/create data for
+        data_folder: Directory containing or to save the dataset files
+
+    Returns:
+        Dictionary mapping dataset keys to PolicyEngineUSDataset objects
+    """
+    # All-or-nothing: if any file is missing, rebuild the whole set.
+    all_exist = all(
+        Path(f"{data_folder}/{Path(dataset).stem}_year_{year}.h5").exists()
+        for dataset in datasets
+        for year in years
+    )
+    build = load_datasets if all_exist else create_datasets
+    return build(datasets=datasets, years=years, data_folder=data_folder)
diff --git a/src/policyengine/tax_benefit_models/us/model.py b/src/policyengine/tax_benefit_models/us/model.py
index 9fd5f05..d65feac 100644
--- a/src/policyengine/tax_benefit_models/us/model.py
+++ b/src/policyengine/tax_benefit_models/us/model.py
@@ -311,18 +311,30 @@ def save(self, simulation: "Simulation"):
 
     def load(self, simulation: "Simulation"):
         """Load the simulation's output dataset."""
+        import os
+
+        filepath = str(
+            Path(simulation.dataset.filepath).parent / (simulation.id + ".h5")
+        )
+
         simulation.output_dataset = PolicyEngineUSDataset(
             id=simulation.id,
             name=simulation.dataset.name,
             description=simulation.dataset.description,
-            filepath=str(
-                Path(simulation.dataset.filepath).parent
-                / (simulation.id + ".h5")
-            ),
+            filepath=filepath,
             year=simulation.dataset.year,
             is_output_dataset=True,
         )
 
+        # Load timestamps from file system metadata
+        if os.path.exists(filepath):
+            simulation.created_at = datetime.datetime.fromtimestamp(
+                os.path.getctime(filepath)
+            )
+            simulation.updated_at = datetime.datetime.fromtimestamp(
+                os.path.getmtime(filepath)
+            )
+
     def _build_simulation_from_dataset(self, microsim, dataset, system):
         """Build a PolicyEngine Core simulation from dataset entity IDs.
 