Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ xesmf = "^0.8.8"
xclim = ">0.55.0" #Previous versions not compatible with xarray >2025.03.0 as they rely on xarray.core.merge which was moved
docstring-parser = "^0.16"
intake-esm = ">v2023.10.27"
pyarrow = "19.0.1" #pyarrow newer versions incompatible with intake-esm as they use polars (which can't handle Path objects)

[tool.poetry.group.dev]
# Development group
Expand Down
4 changes: 2 additions & 2 deletions src/valenspy/input/esm_catalog_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info):
continue

# Add the file path to the metadata
file_metadata["path"] = Path(file_path)
file_metadata["path"] = Path(file_path).as_posix()

# Add dataset level metadata
file_metadata = {**dataset_meta_data, **file_metadata}
Expand Down Expand Up @@ -245,4 +245,4 @@ def _process_dataset_for_catalog(self, dataset_name, dataset_info):
if not files_with_metadata:
warnings.warn(f"No valid files found for dataset {dataset_name}; \n Please check the dataset root {dataset_root} and pattern {dataset_info.get('pattern', None)}")

return files_with_metadata
return files_with_metadata
50 changes: 38 additions & 12 deletions src/valenspy/input/manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,21 +239,47 @@ def update_catalog_from_dataset_info(self, dataset_name, dataset_root_dir, datas
self._update_catalog(dataset_name, dataset_info)
self.catalog_builder._validate_dataset_info()
self.esm_datastore.esmcat._df = self.catalog_builder.df

@property
def preprocess(self):
    """
    Preprocessor that converts input datasets to ValEnsPy-compliant data.

    Returns a callable suitable for use as an intake/xarray ``preprocess``
    hook: it receives an opened dataset, looks up the dataset's
    ``source_id`` in the catalog by file path, and applies the matching
    input convertor (if one is registered in ``self.input_convertors``).

    Returns
    -------
    callable
        ``f(ds) -> ds`` which returns the converted dataset, or the
        dataset unchanged when no convertor applies.

    Raises
    ------
    ValueError
        (from the returned callable) when the opened file's path is not
        present in the catalog dataframe.
    """
    # Build the path -> source_id lookup ONCE, outside the per-file hook.
    # The previous implementation scanned the whole dataframe for every
    # opened file, which was quadratic over the number of files.
    df = self.esm_datastore.df

    # Catalog paths are stored as POSIX strings (see the catalog builder,
    # which writes Path(...).as_posix()), so keys here are plain strings.
    path_to_source = dict(zip(df["path"], df["source_id"]))

    IC_dict = self.input_convertors

    def process_IC(ds, path_to_source, IC_dict):
        # Intake/xarray records the originating file path here.
        file_name = ds.encoding.get("source")
        if file_name is None:
            # No source path available (e.g. in-memory dataset): nothing
            # to look up, pass the dataset through unchanged.
            return ds

        # Normalize exactly like the catalog does so lookups match on
        # Windows and POSIX alike.
        file_name = Path(file_name).as_posix()

        source_id = path_to_source.get(file_name)
        if source_id is None:
            raise ValueError(
                f"Dataset path not found in catalog: {file_name}"
            )

        convertor = IC_dict.get(source_id)
        if convertor is None:
            # No input convertor registered for this source: best-effort
            # pass-through rather than an error.
            return ds

        return convertor(ds)

    # Bind the lookup structures now; the partial is what intake calls
    # later, once per opened file.
    return partial(
        process_IC,
        path_to_source=path_to_source,
        IC_dict=IC_dict,
    )

Loading