From 245e14ffe5ae8e494c9b4535d7b6910bb7d33d48 Mon Sep 17 00:00:00 2001 From: "Kyle R. Wodzicki" Date: Wed, 22 Oct 2025 14:05:36 -0400 Subject: [PATCH 1/2] Performance improvements via transposing In Python/NumPy, iteration is 'fastest' over the last dimension of an array because data in this dimension are stored sequentially in RAM. In previous versions of this package, data were expected to be ordered such that dimension were [time, y, x], or some variant with time being the left-most, and slowest, dimension. This update flips how data are processed and assumes that the right-most dimension is time. This enables faster compute as the time series of data for a single location are now sequential in RAM, leading to speedups in both serial and parallel performance. I have also reduced the number of processes that run in the parallel case by only iterating over the last dimension. In the previous version, iteration occured over two dimension, leading to lots of 'small' chunks of data being passed back and forth to the processing Pool. This communication can be slow and incurs some overhead. To reduce this, we now pass 'large' chunks of data to a single process and use the map_over_location() static method to iterate down to the time dimension. Testing indicated a 50% reduction in processing time, with only a slight increase in read time (10-20s) when transposing the data on read via Xarray. --- ibicus/__meta__.py | 2 +- ibicus/debias/_debiaser.py | 196 +++++++++--------- ibicus/debias/_delta_change.py | 46 ---- ...ng_and_trend_preservation_all_debiasers.py | 14 +- 4 files changed, 109 insertions(+), 149 deletions(-) diff --git a/ibicus/__meta__.py b/ibicus/__meta__.py index 90eac87..c4f7142 100644 --- a/ibicus/__meta__.py +++ b/ibicus/__meta__.py @@ -9,7 +9,7 @@ """Container for ibicus metadata.""" __name__ = "ibicus" -__version__ = "1.2.0" +__version__ = "2.0.0" __author__ = "Fiona Spuler, Jakob Wessel & European Centre for Medium-Range Weather Forecasts (ECMWF)" __author_email__ = "ibicus.py@gmail.com" __license__ = "Apache License Version 2.0" diff --git a/ibicus/debias/_debiaser.py b/ibicus/debias/_debiaser.py index 6d0511d..4aae173 100644 --- a/ibicus/debias/_debiaser.py +++ b/ibicus/debias/_debiaser.py @@ -184,22 +184,56 @@ def _has_correct_shape(df): @staticmethod def _have_same_shape(obs, cm_hist, cm_future): return ( - obs.shape[1:] == cm_hist.shape[1:] and obs.shape[1:] == cm_future.shape[1:] + obs.shape[:-1] == cm_hist.shape[:-1] and obs.shape[:-1] == cm_future.shape[:-1] ) @staticmethod def _contains_inf_nan(x): return np.any(np.logical_or(np.isnan(x), np.isinf(x))) - def _not_if_or_nan_vals_outside_reasonable_physical_range(self, x): - if self.reasonable_physical_range is not None: - return not np.all( - (x >= self.reasonable_physical_range[0]) - & (x <= self.reasonable_physical_range[1]) - | np.isinf(x) - | np.isnan(x) + def _not_if_or_nan_vals_outside_reasonable_physical_range(self, x, vtype): + """ + Combine methods to improver performance + + In previous version, had a _contains_inf_nan static method and a + method to check for data within range with secondary check of out + of range data was nan/inf. + + Functionality of these methods have been combined to improve + performance so don't have to compute nan/inf masks twice + + Arguments: + x: Data values to check + vtype: Label/type of data; e.g., obs, cm_hist, cm_future, etc. + Used for warning labels + + """ + + nan_inf = np.isinf(x) | np.isnan(x) + if nan_inf.any(): + warnings.warn( + f"{vtype} contains inf or nan values. Not all debiasers support missing values and their " + "presence might lead to infs or nans inside of the debiased values. Consider infilling the missing values.", + stacklevel=2, ) - return False + + if self.reasonable_physical_range is None: + return False + + if np.all( + (x >= self.reasonable_physical_range[0]) + & (x <= self.reasonable_physical_range[1]) + | nan_inf + ): + return False + + warnings.warn( + "%s contains values outside the reasonable physical range of %s for the variable: %s. " + "This might be due to different units of to data problems. It is recommended to check the input." + % (vtype, self.reasonable_physical_range, self.variable), + stacklevel=2, + ) + return True @staticmethod def _has_float_dtype(x): @@ -229,8 +263,17 @@ def _fill_masked_array_with_nan(x): return x.filled(np.nan) # ----- Input checks ----- # - def _check_inputs_and_convert_if_possible(self, obs, cm_hist, cm_future): + """ + Check inputs are correct + + Updates: + Uses the updated + _not_if_or_nan_vals_outside_reasonable_physical_range() method for + speedup + + """ + # correct type if not Debiaser._is_correct_type(obs): raise TypeError("Wrong type for obs. Needs to be np.ndarray") @@ -260,54 +303,21 @@ def _check_inputs_and_convert_if_possible(self, obs, cm_hist, cm_future): # correct shape if not Debiaser._has_correct_shape(obs): - raise ValueError("obs needs to have 3 dimensions: time, x, y") + raise ValueError("obs needs to have 3 dimensions: x, y, time") if not Debiaser._has_correct_shape(cm_hist): - raise ValueError("cm_hist needs to have 3 dimensions: time, x, y") + raise ValueError("cm_hist needs to have 3 dimensions: x, y, time") if not Debiaser._has_correct_shape(cm_future): - raise ValueError("cm_future needs to have 3 dimensions: time, x, y") + raise ValueError("cm_future needs to have 3 dimensions: x, y, time") # have shame shape if not Debiaser._have_same_shape(obs, cm_hist, cm_future): raise ValueError( - "obs, cm_hist, cm_future need to have same (number of) spatial dimensions. The arrays of obs, cm_hist and cm_future are assumed to have the following structure: [t, x, y] where t is the time dimension and x, y are spatial ones." - ) - - # contains inf or nan - if Debiaser._contains_inf_nan(obs): - warnings.warn( - "obs contains inf or nan values. Not all debiasers support missing values and their presence might lead to infs or nans inside of the debiased values. Consider infilling the missing values.", - stacklevel=2, - ) - if Debiaser._contains_inf_nan(cm_hist): - warnings.warn( - "cm_hist contains inf or nan values. Not all debiasers support missing values and their presence might lead to infs or nans inside of the debiased values. Consider infilling the missing values.", - stacklevel=2, - ) - if Debiaser._contains_inf_nan(cm_future): - warnings.warn( - "cm_future contains inf or nan values. Not all debiasers support missing values and their presence might lead to infs or nans inside of the debiased values. Consider infilling the missing values.", - stacklevel=2, + "obs, cm_hist, cm_future need to have same (number of) spatial dimensions. The arrays of obs, cm_hist and cm_future are assumed to have the following structure: [x, y, t] where t is the time dimension and x, y are spatial ones." ) - # in reasonable physical range: - if self._not_if_or_nan_vals_outside_reasonable_physical_range(obs): - warnings.warn( - "obs contains values outside the reasonable physical range of %s for the variable: %s. This might be due to different units of to data problems. It is recommended to check the input." - % (self.reasonable_physical_range, self.variable), - stacklevel=2, - ) - if self._not_if_or_nan_vals_outside_reasonable_physical_range(cm_hist): - warnings.warn( - "cm_hist contains values outside the reasonable physical range of %s for the variable: %s. This might be due to different units of to data problems. It is recommended to check the input." - % (self.reasonable_physical_range, self.variable), - stacklevel=2, - ) - if self._not_if_or_nan_vals_outside_reasonable_physical_range(cm_future): - warnings.warn( - "cm_future contains values outside the reasonable physical range of %s for the variable: %s. This might be due to different units of to data problems. It is recommended to check the input." - % (self.reasonable_physical_range, self.variable), - stacklevel=2, - ) + self._not_if_or_nan_vals_outside_reasonable_physical_range(obs, 'obs') + self._not_if_or_nan_vals_outside_reasonable_physical_range(cm_hist, 'cm_hist') + self._not_if_or_nan_vals_outside_reasonable_physical_range(cm_future, 'cm_future') # masked arrays if Debiaser._is_masked_array(obs): @@ -345,24 +355,12 @@ def _check_inputs_and_convert_if_possible(self, obs, cm_hist, cm_future): "cm_future is a masked array, but contains no invalid data. It is converted to a normal numpy array.", stacklevel=2, ) - cm_future = Debiaser._fill_masked_array_with_nan(cm_future) return obs, cm_hist, cm_future def _check_output(self, output): - if Debiaser._contains_inf_nan(output): - warnings.warn( - "The debiaser output contains inf or nan values. This might be due to inf or nan values inside the input, or to a problem of the debiaser for the given dataset at hand. It is recommended to check the output carefully", - stacklevel=2, - ) - # in reasonable physical range: - if self._not_if_or_nan_vals_outside_reasonable_physical_range(output): - warnings.warn( - "The debiaser output contains values outside the reasonable physical range of %s for the variable: %s. This might be due to values outside the range in the input, or to a problem of the debiaser for the given dataset at hand. It is recommended to check the output carefully." - % (self.reasonable_physical_range, self.variable), - stacklevel=2, - ) + self._not_if_or_nan_vals_outside_reasonable_physical_range(output, "The debiaser output") # ----- Helpers ----- # @@ -402,7 +400,6 @@ def _run_func_on_location_and_catch_error( @staticmethod def map_over_locations( func, - output_size, obs, cm_hist, cm_future, @@ -410,17 +407,20 @@ def map_over_locations( failsafe=False, **kwargs, ): - output = np.empty(output_size, dtype=cm_future.dtype) + output = np.empty_like(cm_future) - indices = np.ndindex(obs.shape[1:]) + # Generator object of indices of all but the time (last) dimension + indices = np.ndindex(obs.shape[:-1]) if progressbar: - indices = tqdm(indices, total=np.prod(obs.shape[1:])) - - for i, j in indices: - output[:, i, j] = Debiaser._run_func_on_location_and_catch_error( - obs[:, i, j], - cm_hist[:, i, j], - cm_future[:, i, j], + indices = tqdm(indices, total=np.prod(obs.shape[:-1])) + + # Iterate over all indices; applies function to timeseries at each + # location sequentially + for idx in indices: + output[*idx, :] = Debiaser._run_func_on_location_and_catch_error( + obs[*idx, :], + cm_hist[*idx, :], + cm_future[*idx, :], func, failsafe=failsafe, **kwargs, @@ -430,7 +430,6 @@ def map_over_locations( @staticmethod def parallel_map_over_locations( func, - output_size, obs, cm_hist, cm_future, @@ -438,29 +437,36 @@ def parallel_map_over_locations( failsafe=False, **kwargs, ): + # compute results - indices = [(i, j) for i in range(obs.shape[1]) for j in range(obs.shape[2])] - with Pool(processes=nr_processes) as pool: - result = pool.starmap( - partial( - Debiaser._run_func_on_location_and_catch_error, - func=func, - failsafe=failsafe, - **kwargs, - ), - [ - (obs[:, i, j], cm_hist[:, i, j], cm_future[:, i, j]) - for (i, j) in indices - ], + func = partial( + Debiaser._run_func_on_location_and_catch_error, + func=func, + failsafe=failsafe, + progressbar=False, # Disable progress bar + **kwargs, + ) + + # Open pool an pass chunks to it for processing. By passing chunks + # we reduce communcation (slow) with each process and allow them to + # work for a while + pool = Pool(processes=nr_processes) + objs = [ + pool.apply_async( + func, + (obs[i, ...], cm_hist[i, ...], cm_future[i, ...]), ) + for i in range(obs.shape[0]) + ] + pool.close() + pool.join() # fill output - output = np.empty(output_size, dtype=cm_future.dtype) - for k, index in enumerate(indices): - output[:, index[0], index[1]] = result[k] - - # result = np.array(result) - # result = np.moveaxis(result.reshape(obs.shape[1], obs.shape[2], obs.shape[0]), -1, 0) + i = -1 + output = np.empty_like(cm_future) + while len(objs) > 0: + output[i, ...] = objs.pop().get() + i -= 1 return output @@ -538,8 +544,7 @@ def apply( warnings.warn("progressbar argument is ignored when parallel = True.") output = Debiaser.parallel_map_over_locations( - self.apply_location, - output_size=cm_future.shape, + partial(self.map_over_locations, self.apply_location), obs=obs, cm_hist=cm_hist, cm_future=cm_future, @@ -550,7 +555,6 @@ def apply( else: output = Debiaser.map_over_locations( self.apply_location, - output_size=cm_future.shape, obs=obs, cm_hist=cm_hist, cm_future=cm_future, diff --git a/ibicus/debias/_delta_change.py b/ibicus/debias/_delta_change.py index ae6c642..5b1f7c3 100644 --- a/ibicus/debias/_delta_change.py +++ b/ibicus/debias/_delta_change.py @@ -16,7 +16,6 @@ RunningWindowOverDaysOfYear, check_time_information_and_raise_error, day_of_year, - get_library_logger, infer_and_create_time_arrays_if_not_given, ) from ..variables import ( @@ -221,48 +220,3 @@ def apply_location( return debiased_cm_future else: return self._apply_on_within_year_window(obs, cm_hist, cm_future) - - def apply( - self, - obs, - cm_hist, - cm_future, - progressbar=True, - parallel=False, - nr_processes=4, - failsafe=False, - **kwargs - ): - logger = get_library_logger() - logger.info("----- Running debiasing for variable: %s -----" % self.variable) - - obs, cm_hist, cm_future = self._check_inputs_and_convert_if_possible( - obs, cm_hist, cm_future - ) - - if parallel: - output = Debiaser.parallel_map_over_locations( - self.apply_location, - output_size=obs.shape, - obs=obs, - cm_hist=cm_hist, - cm_future=cm_future, - nr_processes=nr_processes, - failsafe=failsafe, - **kwargs, - ) - else: - output = Debiaser.map_over_locations( - self.apply_location, - output_size=obs.shape, - obs=obs, - cm_hist=cm_hist, - cm_future=cm_future, - progressbar=progressbar, - failsafe=failsafe, - **kwargs, - ) - - self._check_output(output) - - return output diff --git a/tests/test_handling_and_trend_preservation_all_debiasers.py b/tests/test_handling_and_trend_preservation_all_debiasers.py index 09f69e6..a63f481 100644 --- a/tests/test_handling_and_trend_preservation_all_debiasers.py +++ b/tests/test_handling_and_trend_preservation_all_debiasers.py @@ -51,9 +51,11 @@ def test_from_variable_and_apply(self): debiaser = debiaser_class.from_variable("tas") assert debiaser.variable == "Daily mean near-surface air temperature" - obs = np.random.normal(size=(10000, 2, 2)) + 270 - cm_hist = np.random.normal(size=(10000, 2, 2)) + 270 - cm_future = np.random.normal(size=(10000, 2, 2)) + 270 + 2 + shape = (2, 2, 10000) + # shape = (10000, 2, 2) + obs = np.random.normal(size=shape) + 270 + cm_hist = np.random.normal(size=shape) + 270 + cm_future = np.random.normal(size=shape) + 270 + 2 debiased_cm_future = debiaser.apply(obs, cm_hist, cm_future) assert debiased_cm_future.shape == cm_future.shape @@ -64,9 +66,9 @@ def test_from_variable_and_apply(self): debiaser = debiaser_class.from_variable("pr") assert debiaser.variable == "Daily mean precipitation flux" - obs = np.exp(np.random.normal(size=(10000, 2, 2))) - cm_hist = np.exp(np.random.normal(size=(10000, 2, 2))) - cm_future = np.exp(np.random.normal(size=(10000, 2, 2))) * 2 + obs = np.exp(np.random.normal(size=shape)) + cm_hist = np.exp(np.random.normal(size=shape)) + cm_future = np.exp(np.random.normal(size=shape)) * 2 debiased_cm_future = debiaser.apply(obs, cm_hist, cm_future) assert debiased_cm_future.shape == cm_future.shape From c54f42b2f071bf9b0f485b940ce1f8dc4ddfe8fe Mon Sep 17 00:00:00 2001 From: "Kyle R. Wodzicki" Date: Thu, 23 Oct 2025 10:17:18 -0400 Subject: [PATCH 2/2] Updates to handle 1D/Single location Modified the dimensionality requirment to check that data are at least 1-D (i.e., array.ndims > 0). Updates to code for speed improvements made things more flexible with dimensionality, so do not have to worry as long as time is last/right most dimension and all other dimensions match across the arrays, everything should work as expected. Closes https://github.com/ecmwf-projects/ibicus/issues/22 --- ibicus/__meta__.py | 2 +- ibicus/debias/_debiaser.py | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ibicus/__meta__.py b/ibicus/__meta__.py index c4f7142..d55b000 100644 --- a/ibicus/__meta__.py +++ b/ibicus/__meta__.py @@ -9,7 +9,7 @@ """Container for ibicus metadata.""" __name__ = "ibicus" -__version__ = "2.0.0" +__version__ = "2.0.1" __author__ = "Fiona Spuler, Jakob Wessel & European Centre for Medium-Range Weather Forecasts (ECMWF)" __author_email__ = "ibicus.py@gmail.com" __license__ = "Apache License Version 2.0" diff --git a/ibicus/debias/_debiaser.py b/ibicus/debias/_debiaser.py index 4aae173..7b6c991 100644 --- a/ibicus/debias/_debiaser.py +++ b/ibicus/debias/_debiaser.py @@ -179,7 +179,7 @@ def _is_correct_type(df): @staticmethod def _has_correct_shape(df): - return df.ndim == 3 + return df.ndim > 0 @staticmethod def _have_same_shape(obs, cm_hist, cm_future): @@ -539,6 +539,10 @@ def apply( obs, cm_hist, cm_future ) + if obs.shape[:-1] == () and parallel: + logger.info("Running on single location, disabling parallel") + parallel = False + if parallel: if progressbar: warnings.warn("progressbar argument is ignored when parallel = True.")