From 7d55c8d3ce1ba6fe80db753bb71903420d52b38b Mon Sep 17 00:00:00 2001 From: kobebryant432 Date: Thu, 12 Feb 2026 12:12:58 +0100 Subject: [PATCH 1/2] Avoid Path like object in esmcatalog to accommodate pyarrow issue --- pyproject.toml | 1 - src/valenspy/input/esm_catalog_builder.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 28d10270..50f102b2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,6 @@ xesmf = "^0.8.8" xclim = ">0.55.0" #Previous versions not compatible with xarray >2025.03.0 as they rely on xarray.core.merge which was moved docstring-parser = "^0.16" intake-esm = ">v2023.10.27" -pyarrow = "19.0.1" #pyarrow newer versions incompatible with intake-esm as they use polars (which can't handle Path objects) [tool.poetry.group.dev] # Development group diff --git a/src/valenspy/input/esm_catalog_builder.py b/src/valenspy/input/esm_catalog_builder.py index a68fd823..ced37fca 100644 --- a/src/valenspy/input/esm_catalog_builder.py +++ b/src/valenspy/input/esm_catalog_builder.py @@ -198,7 +198,7 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info): continue # Add the file path to the metadata - file_metadata["path"] = Path(file_path) + file_metadata["path"] = Path(file_path).as_posix() # Add dataset level metadata file_metadata = {**dataset_meta_data, **file_metadata} @@ -245,4 +245,4 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info): if not files_with_metadata: warnings.warn(f"No valid files found for dataset {dataset_name}; \n Please check the dataset root {dataset_root} and pattern {dataset_info.get('pattern', None)}") - return files_with_metadata \ No newline at end of file + return files_with_metadata From 3e40bb373bb8639eb9fcce459b7042735692cd45 Mon Sep 17 00:00:00 2001 From: kobebryant432 Date: Thu, 12 Feb 2026 13:43:05 +0100 Subject: [PATCH 2/2] update manager bugfix load_regrid script --- src/valenspy/input/manager.py | 50 
++++++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 12 deletions(-) diff --git a/src/valenspy/input/manager.py b/src/valenspy/input/manager.py index 4f699c3a..8c8bbbfb 100644 --- a/src/valenspy/input/manager.py +++ b/src/valenspy/input/manager.py @@ -239,21 +239,47 @@ def update_catalog_from_dataset_info(self, dataset_name, dataset_root_dir, datas self._update_catalog(dataset_name, dataset_info) self.catalog_builder._validate_dataset_info() self.esm_datastore.esmcat._df = self.catalog_builder.df - + @property def preprocess(self): """ - A preprocessor function to convert the input dataset to ValEnsPy compliant data. - - This function applys the input convertor to the dataset if an input convertor exists (i.e. source_id is in this managers input convertors). + Preprocessor to convert datasets to ValEnsPy-compliant format. + + Applies the appropriate input converter based on the dataset's source_id. """ + + # --- Build lookup once (critical for performance) --- + df = self.esm_datastore.df + + # Ensure paths are POSIX strings + path_to_source = dict(zip(df["path"], df["source_id"])) + + IC_dict = self.input_convertors - def process_IC(ds, IC_dict, df): - file_name = ds.encoding["source"] - source_id = df[df["path"] == Path(file_name)]["source_id"].values[0] - if source_id in IC_dict: - return IC_dict[source_id](ds) - else: + def process_IC(ds, path_to_source, IC_dict): + # Intake provides the file path here + file_name = ds.encoding.get("source") + if file_name is None: return ds - - return partial(process_IC, IC_dict=self.input_convertors, df=self.esm_datastore.df) \ No newline at end of file + + # Normalize exactly like catalog + file_name = Path(file_name).as_posix() + + source_id = path_to_source.get(file_name) + if source_id is None: + raise ValueError( + f"Dataset path not found in catalog: {file_name}" + ) + + convertor = IC_dict.get(source_id) + if convertor is None: + return ds + + return convertor(ds) + + return partial( + process_IC, + 
path_to_source=path_to_source, + IC_dict=IC_dict, + ) +