@@ -37,11 +37,7 @@ def model_post_init(self, __context):
3737 if self .data is not None :
3838 self .save ()
3939 elif self .filepath and not self .data :
40- try :
41- self .load ()
42- except FileNotFoundError :
43- # File doesn't exist yet, that's OK
44- pass
40+ self .load ()
4541
4642 def save (self ) -> None :
4743 """Save dataset to HDF5 file."""
@@ -85,7 +81,9 @@ def create_datasets(
8581 "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5" ,
8682 ],
8783 years : list [int ] = [2026 , 2027 , 2028 , 2029 , 2030 ],
88- ) -> None :
84+ data_folder : str = "./data" ,
85+ ) -> dict [str , PolicyEngineUKDataset ]:
86+ result = {}
8987 for dataset in datasets :
9088 from policyengine_uk import Microsimulation
9189
@@ -139,9 +137,10 @@ def create_datasets(
139137 )
140138
141139 uk_dataset = PolicyEngineUKDataset (
140+ id = f"{ Path (dataset ).stem } _year_{ year } " ,
142141 name = f"{ dataset } -year-{ year } " ,
143142 description = f"UK Dataset for year { year } based on { dataset } " ,
144- filepath = f"./data /{ Path (dataset ).stem } _year_{ year } .h5" ,
143+ filepath = f"{ data_folder } /{ Path (dataset ).stem } _year_{ year } .h5" ,
145144 year = year ,
146145 data = UKYearData (
147146 person = MicroDataFrame (person_df , weights = "person_weight" ),
@@ -154,3 +153,75 @@ def create_datasets(
154153 ),
155154 )
156155 uk_dataset .save ()
156+
157+ dataset_key = f"{ Path (dataset ).stem } _{ year } "
158+ result [dataset_key ] = uk_dataset
159+
160+ return result
161+
162+
def load_datasets(
    datasets: list[str] = [
        "hf://policyengine/policyengine-uk-data/frs_2023_24.h5",
        "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5",
    ],
    years: list[int] = [2026, 2027, 2028, 2029, 2030],
    data_folder: str = "./data",
) -> dict[str, PolicyEngineUKDataset]:
    """Load previously saved per-year UK datasets from ``data_folder``.

    For every (dataset, year) pair a ``PolicyEngineUKDataset`` is built
    pointing at ``{data_folder}/{stem}_year_{year}.h5`` (where ``stem`` is
    ``Path(dataset).stem``) and its ``load()`` method is called.

    Args:
        datasets: Source dataset paths; only the file stem is used to
            derive the local file name, id and result key.
        years: Years to load data for.
        data_folder: Directory containing the saved ``.h5`` files.

    Returns:
        Dictionary mapping ``"{stem}_{year}"`` to the loaded dataset
        object. Empty if ``datasets`` or ``years`` is empty.

    Raises:
        Whatever ``PolicyEngineUKDataset`` construction or ``load()``
        raises when a file is missing or unreadable (presumably
        ``FileNotFoundError`` -- confirm against the dataset class).
    """
    # The default lists are only iterated, never mutated, so sharing them
    # across calls is safe here.
    result = {}
    for dataset in datasets:
        # Hoisted: the stem does not depend on the year.
        stem = Path(dataset).stem
        for year in years:
            # Same identifier create_datasets assigns, so a dataset has the
            # same id whether it was freshly created or loaded from disk.
            dataset_id = f"{stem}_year_{year}"
            uk_dataset = PolicyEngineUKDataset(
                id=dataset_id,
                name=f"{dataset}-year-{year}",
                description=f"UK Dataset for year {year} based on {dataset}",
                filepath=f"{data_folder}/{dataset_id}.h5",
                year=year,
            )
            # NOTE(review): the model's post-init hook appears to call
            # load() itself when a filepath is given without data, so this
            # may be a second read of the same file -- confirm before
            # removing.
            uk_dataset.load()

            result[f"{stem}_{year}"] = uk_dataset

    return result
187+
188+
def ensure_datasets(
    datasets: list[str] = [
        "hf://policyengine/policyengine-uk-data/frs_2023_24.h5",
        "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5",
    ],
    years: list[int] = [2026, 2027, 2028, 2029, 2030],
    data_folder: str = "./data",
) -> dict[str, PolicyEngineUKDataset]:
    """Return the requested datasets, loading them if all are on disk.

    The check is all-or-nothing: if every expected
    ``{data_folder}/{stem}_year_{year}.h5`` file exists the datasets are
    loaded via ``load_datasets``; if any single file is missing, the whole
    set is (re)built via ``create_datasets``.

    Args:
        datasets: List of HuggingFace dataset paths
        years: List of years to load/create data for
        data_folder: Directory containing or to save the dataset files

    Returns:
        Dictionary mapping dataset keys to PolicyEngineUKDataset objects
    """
    # Lazily enumerate every file the (dataset, year) grid should produce.
    expected_files = (
        Path(f"{data_folder}/{Path(dataset).stem}_year_{year}.h5")
        for dataset in datasets
        for year in years
    )

    # all() short-circuits on the first missing file, and is True for an
    # empty grid (in which case we fall through to load_datasets).
    if not all(candidate.exists() for candidate in expected_files):
        return create_datasets(
            datasets=datasets, years=years, data_folder=data_folder
        )

    return load_datasets(
        datasets=datasets, years=years, data_folder=data_folder
    )
0 commit comments