Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ xesmf = "^0.8.8"
xclim = ">0.55.0" #Previous versions not compatible with xarray >2025.03.0 as they rely on xarray.core.merge which was moved
docstring-parser = "^0.16"
intake-esm = ">v2023.10.27"
pyarrow = "19.0.1" #pyarrow newer versions incompatible with intake-esm as they use polars (which can't handle Path objects)

[tool.poetry.group.dev]
# Development group
Expand Down
4 changes: 2 additions & 2 deletions src/valenspy/input/esm_catalog_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info):
continue

# Add the file path to the metadata
file_metadata["path"] = Path(file_path)
file_metadata["path"] = Path(file_path).as_posix()

# Add dataset level metadata
file_metadata = {**dataset_meta_data, **file_metadata}
Expand Down Expand Up @@ -245,4 +245,4 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info):
if not files_with_metadata:
warnings.warn(f"No valid files found for dataset {dataset_name}; \n Please check the dataset root {dataset_root} and pattern {dataset_info.get('pattern', None)}")

return files_with_metadata
return files_with_metadata
50 changes: 38 additions & 12 deletions src/valenspy/input/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,21 +239,47 @@ def update_catalog_from_dataset_info(self, dataset_name, dataset_root_dir, datas
self._update_catalog(dataset_name, dataset_info)
self.catalog_builder._validate_dataset_info()
self.esm_datastore.esmcat._df = self.catalog_builder.df

@property
def preprocess(self):
    """
    Preprocessor that converts input datasets to ValEnsPy-compliant data.

    Returns a callable suitable for use as an intake/xarray ``preprocess``
    hook: it receives an opened dataset, looks up the dataset's
    ``source_id`` in the catalog by file path, and applies the matching
    input convertor (if one is registered in ``self.input_convertors``).

    Returns
    -------
    callable
        ``f(ds) -> ds`` which returns the converted dataset, or the
        dataset unchanged when no convertor applies.

    Raises
    ------
    ValueError
        (from the returned callable) when the opened file's path is not
        present in the catalog dataframe.
    """
    # Build the path -> source_id lookup ONCE, outside the per-file hook.
    # The previous implementation scanned the whole dataframe for every
    # opened file, which was quadratic over the number of files.
    df = self.esm_datastore.df

    # Catalog paths are stored as POSIX strings (see the catalog builder,
    # which writes Path(...).as_posix()), so keys here are plain strings.
    path_to_source = dict(zip(df["path"], df["source_id"]))

    IC_dict = self.input_convertors

    def process_IC(ds, path_to_source, IC_dict):
        # Intake/xarray records the originating file path here.
        file_name = ds.encoding.get("source")
        if file_name is None:
            # No source path available (e.g. in-memory dataset): nothing
            # to look up, pass the dataset through unchanged.
            return ds

        # Normalize exactly like the catalog does so lookups match on
        # Windows and POSIX alike.
        file_name = Path(file_name).as_posix()

        source_id = path_to_source.get(file_name)
        if source_id is None:
            raise ValueError(
                f"Dataset path not found in catalog: {file_name}"
            )

        convertor = IC_dict.get(source_id)
        if convertor is None:
            # No input convertor registered for this source: best-effort
            # pass-through rather than an error.
            return ds

        return convertor(ds)

    # Bind the lookup structures now; the partial is what intake calls
    # later, once per opened file.
    return partial(
        process_IC,
        path_to_source=path_to_source,
        IC_dict=IC_dict,
    )

Loading