diff --git a/.github/workflows/python-test-latest.yml b/.github/workflows/python-test-latest.yml index d119a7a..f2447c5 100644 --- a/.github/workflows/python-test-latest.yml +++ b/.github/workflows/python-test-latest.yml @@ -36,7 +36,7 @@ jobs: fail-fast: false matrix: # Choose the latest stable python version - python-version: ['3.13'] + python-version: ['3.14'] steps: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 4f4458a..edc47dd 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -59,13 +59,11 @@ jobs: run: pytest . - name: Install optional dependencies - id: install-optional # uncomment the if statement below to allow skipping versions - if: matrix.python-version != '3.14t' + #if: matrix.python-version != '3.14t' run: python -m pip install .[full] - name: Test with optional dependencies - if: steps.install-optional.outcome == 'success' run: pytest . diff --git a/LICENSES_bundled.txt b/LICENSES_bundled.txt index ba0d6b2..7231b95 100644 --- a/LICENSES_bundled.txt +++ b/LICENSES_bundled.txt @@ -4,7 +4,7 @@ licensed sources, which are listed below. 
Source: ported MATLAB code from https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction (last accessed March 18, 2021) -Function: pybaselines.polynomial.penalized_poly +File: pybaselines._nd.polynomial.py License: 2-clause BSD Copyright (c) 2012, Vincent Mazet diff --git a/docs/examples/general/plot_masked_data.py b/docs/examples/general/plot_masked_data.py index 046dd4f..aad4f0a 100644 --- a/docs/examples/general/plot_masked_data.py +++ b/docs/examples/general/plot_masked_data.py @@ -181,9 +181,7 @@ def masked_arpls(y, mask=None, lam=1e5, diff_order=2, tol=1e-3, max_iter=50, wei weights[~mask] = 0 whittaker_system = PenalizedSystem(len(y), lam=lam, diff_order=diff_order) for _ in range(max_iter): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weights), weights * y_fit, - ) + baseline = whittaker_system.solve(y_fit, weights) # need to ignore the problem regions in y since they would otherwise affect # the arpls weighting; could alternatively do: # _arpls(np.interp(x, x[mask], y[mask]), baseline) to approximate diff --git a/pybaselines/_algorithm_setup.py b/pybaselines/_algorithm_setup.py index e2c4390..5e06429 100644 --- a/pybaselines/_algorithm_setup.py +++ b/pybaselines/_algorithm_setup.py @@ -29,6 +29,7 @@ ParameterWarning, SortingWarning, _determine_sorts, _inverted_sort, _sort_array, estimate_window, pad_edges ) +from .results import PSplineResult, WhittakerResult class _Algorithm: @@ -269,10 +270,10 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), skip_sorting=Fa return baseline, params @classmethod - def _register(cls, func=None, *, sort_keys=(), ensure_1d=True, skip_sorting=False, - require_unique_x=False): + def _handle_io(cls, func=None, *, sort_keys=(), ensure_dims=True, skip_sorting=False, + require_unique=False, reshape_keys=None): """ - Wraps a baseline function to validate inputs and correct outputs. + Wraps a baseline method to validate inputs and correct outputs. 
The input data is converted to a numpy array, validated to ensure the length is consistent, and ordered to match the input x ordering. The outputs are corrected @@ -281,19 +282,22 @@ def _register(cls, func=None, *, sort_keys=(), ensure_1d=True, skip_sorting=Fals Parameters ---------- func : Callable, optional - The function that is being decorated. Default is None, which returns a partial function. + The method that is being decorated. Default is None, which returns a partial function. sort_keys : tuple, optional The keys within the output parameter dictionary that will need sorting to match the - sort order of :attr:`.x`. Default is (). - ensure_1d : bool, optional + sort order of ``self.x``. Default is (). + ensure_dims : bool, optional If True (default), will raise an error if the shape of `array` is not a one dimensional array with shape (N,) or a two dimensional array with shape (N, 1) or (1, N). skip_sorting : bool, optional If True, will skip sorting the inputs and outputs, which is useful for algorithms that use other algorithms so that sorting is already internally done. Default is False. - require_unique_x : bool, optional - If True, will check ``self.x`` to ensure all values are unique and will raise an error + require_unique : bool, optional + If True, will check `self.x` to ensure all values are unique and will raise an error if non-unique values are present. Default is False, which skips the check. + reshape_keys : None, optional + Not used within this method, simply added to have the same call signature + as `_Algorithm2D._handle_io`. 
Returns ------- @@ -305,8 +309,8 @@ def _register(cls, func=None, *, sort_keys=(), ensure_1d=True, skip_sorting=Fals """ if func is None: return partial( - cls._register, sort_keys=sort_keys, ensure_1d=ensure_1d, skip_sorting=skip_sorting, - require_unique_x=require_unique_x + cls._handle_io, sort_keys=sort_keys, ensure_dims=ensure_dims, + skip_sorting=skip_sorting, require_unique=require_unique ) @wraps(func) @@ -316,11 +320,11 @@ def inner(self, data=None, *args, **kwargs): raise TypeError('"data" and "x_data" cannot both be None') input_y = True y, self.x = _yx_arrays( - data, check_finite=self._check_finite, ensure_1d=ensure_1d + data, check_finite=self._check_finite, ensure_1d=ensure_dims ) self._size = y.shape[-1] else: - if require_unique_x and not self._validated_x: + if require_unique and not self._validated_x: if np.any(self.x[1:] == self.x[:-1]): raise ValueError('x-values must be unique for the selected method') else: @@ -328,7 +332,7 @@ def inner(self, data=None, *args, **kwargs): if data is not None: input_y = True y = _check_sized_array( - data, self._size, check_finite=self._check_finite, ensure_1d=ensure_1d, + data, self._size, check_finite=self._check_finite, ensure_1d=ensure_dims, name='data' ) else: @@ -391,7 +395,7 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm._register`. + array by :meth:`~._Algorithm._handle_io`. lam : float, optional The smoothing parameter, lambda. 
Typical values are between 10 and 1e8, but it strongly depends on the penalized least square method @@ -445,8 +449,8 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa self._size, weights, copy_input=copy_weights, check_finite=self._check_finite, dtype=float ) - if self._sort_order is not None and weights is not None: - weight_array = weight_array[self._sort_order] + if weights is not None: + weight_array = _sort_array(weight_array, self._sort_order) allow_lower = allow_lower and self.banded_solver < 4 allow_penta = self.banded_solver < 3 @@ -459,7 +463,7 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa return y, weight_array, whittaker_system def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=True, - calc_pinv=False, copy_weights=False): + calc_pinv=False, copy_weights=False, max_cross=None): """ Sets the starting parameters for doing polynomial fitting. @@ -467,7 +471,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=True, ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm._register`. + array by :meth:`~._Algorithm._handle_io`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -481,6 +485,9 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=True, copy_weights : boolean, optional If True, will copy the array of input weights. Only needed if the algorithm changes the weights in-place. Default is False. + max_cross : None, optional + Not used within this method, simply added to have the same call signature + as `_Algorithm2D._setup_polynomial`. 
Returns ------- @@ -509,8 +516,8 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=True, self._size, weights, copy_input=copy_weights, check_finite=self._check_finite, dtype=float ) - if self._sort_order is not None and weights is not None: - weight_array = weight_array[self._sort_order] + if weights is not None: + weight_array = _sort_array(weight_array, self._sort_order) if calc_vander: if self._polynomial is None: @@ -542,7 +549,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm._register`. + array by :meth:`~._Algorithm._handle_io`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -597,8 +604,8 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, self._size, weights, dtype=float, order='C', copy_input=copy_weights, check_finite=self._check_finite ) - if self._sort_order is not None and weights is not None: - weight_array = weight_array[self._sort_order] + if weights is not None: + weight_array = _sort_array(weight_array, self._sort_order) if not make_basis: return y, weight_array @@ -623,6 +630,79 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, return y, weight_array, pspline + def _setup_pls(self, y, weights=None, spline_degree=None, num_knots=10, + diff_order=2, lam=1, allow_lower=True, reverse_diags=False, + copy_weights=False, num_eigens=None): + """ + Sets the starting parameters for methods using penalized least squares. + + Depending on the input of `spline_degree`, will dispatch to either + `_setup_whittaker` or `_setup_spline`. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`~._Algorithm._handle_io`. 
+ weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. + spline_degree : int or None, optional + If None (default), denotes that the system is using Whittaker smoothing. + Otherwise, the system is a penalized spline with a spline degree of `spline_degree`. + num_knots : int, optional + The number of interior knots for the splines. Only used if `spline_degree` is + not None. Default is 10. + diff_order : int, optional + The integer differential order for the penalty; must be greater than 0. + Default is 2. + lam : float, optional + The smoothing parameter, lambda. Typical values are between 10 and + 1e8, but it strongly depends on `diff_order` and the data size. + Default is 1. + allow_lower : boolean, optional + If True (default), will include only the lower non-zero diagonals of + the squared difference matrix. If False, will include all non-zero diagonals. + reverse_diags : boolean, optional + If True, will reverse the order of the diagonals of the penalty matrix. + Default is False. + copy_weights : boolean, optional + If True, will copy the array of input weights. Only needed if the + algorithm changes the weights in-place. Default is False. + num_eigens : None, optional + Not used within this method, simply added to have the same call signature + as `_Algorithm2D._setup_pls`. + + Returns + ------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, converted to a numpy array. + weight_array : numpy.ndarray, shape (N,) + The weight array for fitting the spline to the data. + penalized_system : PenalizedSystem or PSpline + The object for solving the penalized least squared system. If `spline_degree` + is None, returns a PenalizedSystem object;, otherwise, returns a PSpline. + result_class : WhittakerResult or PSplineResult + The result class for defining the solution. 
If `spline_degree` + is None, returns WhittakerResult; otherwise, returns PSplineResult. + + """ + if spline_degree is None: + y, weight_array, penalized_system = self._setup_whittaker( + y, lam=lam, diff_order=diff_order, weights=weights, copy_weights=copy_weights, + allow_lower=allow_lower, reverse_diags=reverse_diags + ) + result_class = WhittakerResult + else: + y, weight_array, penalized_system = self._setup_spline( + y, lam=lam, diff_order=diff_order, weights=weights, copy_weights=copy_weights, + allow_lower=allow_lower, reverse_diags=reverse_diags, + spline_degree=spline_degree, num_knots=num_knots, penalized=True, make_basis=True + ) + result_class = PSplineResult + + return y, weight_array, penalized_system, result_class + def _setup_morphology(self, y, half_window=None, window_kwargs=None, **kwargs): """ Sets the starting parameters for morphology-based methods. @@ -631,7 +711,7 @@ def _setup_morphology(self, y, half_window=None, window_kwargs=None, **kwargs): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm._register`. + array by :meth:`~._Algorithm._handle_io`. half_window : int, optional The half-window used for the morphology functions. If a value is input, then that value will be used. Default is None, which will optimize the @@ -686,7 +766,7 @@ def _setup_smooth(self, y, half_window=None, pad_type='half', window_multiplier= ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm._register`. + array by :meth:`~._Algorithm._handle_io`. half_window : int, optional The half-window used for the smoothing functions. 
Used to pad the left and right edges of the data to reduce edge @@ -753,7 +833,7 @@ def _setup_classification(self, y, weights=None, **kwargs): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm._register`. + array by :meth:`~._Algorithm._handle_io`. weights : array-like, shape (N,), optional The weighting array. If None (default), then will be an array with size equal to N and all values set to 1. @@ -772,8 +852,8 @@ def _setup_classification(self, y, weights=None, **kwargs): weight_array = _check_optional_array( self._size, weights, dtype=bool, check_finite=self._check_finite ) - if self._sort_order is not None and weights is not None: - weight_array = weight_array[self._sort_order] + if weights is not None: + weight_array = _sort_array(weight_array, self._sort_order) return y, weight_array @@ -853,7 +933,7 @@ def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=T ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm._register`. + array by :meth:`~._Algorithm._handle_io`. method : str The string name of the desired function, like 'asls'. Case does not matter. modules : Sequence[module, ...] @@ -911,7 +991,7 @@ def _setup_misc(self, y): ---------- y : numpy.ndarray, shape (N,) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm._register`. + array by :meth:`~._Algorithm._handle_io`. Returns ------- diff --git a/pybaselines/_banded_utils.py b/pybaselines/_banded_utils.py index e9d8946..1ec06a9 100644 --- a/pybaselines/_banded_utils.py +++ b/pybaselines/_banded_utils.py @@ -849,6 +849,32 @@ def __init__(self, data_size, lam=1, diff_order=2, allow_lower=True, lam, diff_order, allow_lower, reverse_diags, allow_penta, padding=padding ) + @property + def tot_bases(self): + """ + The total number of basis functions for the system. 
+ + Returns + ------- + int + The total number of basis functions for the system. + + """ + return self._num_bases + + @property + def shape(self): + """ + The shape of the data being fit by the penalized system. + + Returns + ------- + tuple[int] + The shape of the data that the system corresponds to. + + """ + return (self._num_bases,) + def add_penalty(self, penalty): """ Updates `self.penalty` with an additional penalty and updates the bands. @@ -962,8 +988,62 @@ def reset_diagonals(self, lam=1, diff_order=2, allow_lower=True, reverse_diags=F self.penalty = self.lam * _pad_diagonals(self.original_diagonals, padding, self.lower) self._update_bands() - def solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, - check_finite=False, l_and_u=None): + def solve(self, y, weights, penalty=None, rhs_extra=None): + """ + Solves the penalized linear system. + + Solves the equation ``(W + P) x = W @ y + rhs_extra`` for `x`, given the weights + (diagonal of `W`), the penalty `P`, `y`, and an additional array `rhs_extra`. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values for fitting. + weights : numpy.ndarray, shape (N,) + The weights for each y-value. + penalty : numpy.ndarray, shape (M, N), optional + The finite difference penalty matrix, in LAPACK's lower banded format (see + :func:`scipy.linalg.solveh_banded`) if `self.lower` is True or the full banded + format (see :func:`scipy.linalg.solve_banded`) if `self.lower` is False. Default + is None, which uses the object's penalty. + rhs_extra : float or numpy.ndarray, shape (N,), optional + If supplied, `rhs_extra` will be added to the right hand side (``weights * y``) + of the equation before solving. Default is None, which adds nothing. + + Returns + ------- + numpy.ndarray, shape (N,) + The solution to the linear system, `x`. + + Raises + ------ + ValueError + Raised if the input `penalty` does not match the system's penalty shape, such + that it's diagonal index is unknown. 
+ + """ + if penalty is None: + lhs = self.add_diagonal(weights) + else: + if penalty.shape != self.penalty.shape: + raise ValueError(( + f"shape mismatch between given penalty, {penalty.shape}, and system's " + f"penalty, {self.penalty.shape}, so cannot add diagonal; use " + "`direct_solve` instead" + )) + penalty[self.main_diagonal_index] += weights + lhs = penalty + + rhs = weights * y + if rhs_extra is not None: + rhs = rhs + rhs_extra + + return self.direct_solve( + lhs, rhs, overwrite_b=True, l_and_u=(self.num_bands, self.num_bands) + ) + + def direct_solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, + check_finite=False, l_and_u=None): """ Solves the equation ``A @ x = rhs``, given `A` in banded format as `lhs`. @@ -973,7 +1053,7 @@ def solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, The left-hand side of the equation, in banded format. `lhs` is assumed to be some slight modification of `self.penalty` in the same format (reversed, lower, number of bands, etc. are all the same). - rhs : array-like, shape (N,) + rhs : array-like, shape (N,) or (M, N) The right-hand side of the equation. overwrite_ab : bool, optional Whether to overwrite `lhs` when using any of the solvers. Default is False. @@ -983,14 +1063,14 @@ def solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, Whether to check if the inputs are finite when using :func:`scipy.linalg.solveh_banded` or :func:`scipy.linalg.solve_banded`. Default is False. - l_and_u : Container[int, int], optional + l_and_u : tuple[int, int], optional The number of lower and upper bands in `lhs` when using :func:`scipy.linalg.solve_banded`. Default is None, which uses (``len(lhs) // 2``, ``len(lhs) // 2``). Returns ------- - output : numpy.ndarray, shape (N,) + output : numpy.ndarray, shape (N,) or (M, N) The solution to the linear system, `x`. 
""" @@ -1088,7 +1168,7 @@ def factorized_solve(self, factorization, rhs, overwrite_b=False, check_finite=F Parameters ---------- factorization : numpy.ndarray or Callable - The factorization of ``A``, output by :meth:`PenalizedSystem.factorize`. + The factorization of ``A``, output by :meth:`~.PenalizedSystem.factorize`. rhs : array-like, shape (N,) or (N, M) The right-hand side of the equation. overwrite_b : bool, optional diff --git a/pybaselines/_nd/__init__.py b/pybaselines/_nd/__init__.py new file mode 100644 index 0000000..b9815a5 --- /dev/null +++ b/pybaselines/_nd/__init__.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- +""" +=========================================== +Baseline Correction for N-Dimensional Data. +=========================================== + +The `_nd` module provides implementations that are written to be mostly +dimension agnostic such that they can be used for both the 1D `Baseline` +and 2D `Baseline2D` objects. + +@author: Donald Erb +Created on March 11, 2026 + +""" + +from . import morphological, pls, polynomial diff --git a/pybaselines/_nd/_algorithm_setup.py b/pybaselines/_nd/_algorithm_setup.py new file mode 100644 index 0000000..973ff05 --- /dev/null +++ b/pybaselines/_nd/_algorithm_setup.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +"""Setup code for the ND Mixin classes. + +Created on March 23, 2026 +@author: Donald Erb + +""" + +from functools import partial, wraps + + +def _handle_io(func=None, *, sort_keys=(), ensure_dims=True, reshape_keys=(), + skip_sorting=False, require_unique=False): + """ + Wraps a baseline method to validate inputs and correct outputs. + + Allows passing through keywords to the underlying fitting object's `_handle_io` method. + + Parameters + ---------- + func : callable, optional + The method that is being decorated. Default is None, which returns a partial function. 
+ sort_keys : tuple, optional + The keys within the output parameter dictionary that will need sorting to match the + sort order of the object's `x` and potentially `z` attributes. Default is (). + ensure_dims : bool, optional + If True (default), will raise an error if the shape of `array` is not a one + dimensional array with shape (N,) or a two dimensional array with shape (N, 1) or + (1, N) if `self` is an `_Algorithm`, or if the shape of `array` + is not a two dimensional array with shape (M, N) or a three dimensional array with + shape (M, N, 1), (M, 1, N), or (1, M, N) if `self` is an `_Algorithm2D`. + reshape_keys : tuple, optional + If `self` is `_Algorithm2D`, the keys within the output parameter dictionary that + will need reshaped to match the shape of the data. Ignored for `_Algorithm` `self`. + Default is (). + skip_sorting : bool, optional + If True, will skip sorting the output baseline. The keys in `sort_keys` will + still be sorted. Default is False. + require_unique : bool, optional + If True, will check ``self.x`` and potentially ``self.z`` to ensure all values are + unique and will raise an error if non-unique values are present. Default is False, + which skips the check. + + Returns + ------- + callable + The wrapped method. + + Notes + ----- + Within the inner function, `self` can be either `pybaselines._algorithm_setup._Algorithm` + or `pybaselines.two_d._algorithm_setup._Algorithm2D`. 
+ + """ + if func is None: + return partial( + _handle_io, sort_keys=sort_keys, ensure_dims=ensure_dims, reshape_keys=reshape_keys, + skip_sorting=skip_sorting, require_unique=require_unique + ) + + @wraps(func) + def inner(self, *args, **kwargs): + return self._handle_io( + func, sort_keys=sort_keys, ensure_dims=ensure_dims, reshape_keys=reshape_keys, + skip_sorting=skip_sorting, require_unique=require_unique + )(self, *args, **kwargs) + return inner diff --git a/pybaselines/_nd/morphological.py b/pybaselines/_nd/morphological.py new file mode 100644 index 0000000..5591d7d --- /dev/null +++ b/pybaselines/_nd/morphological.py @@ -0,0 +1,176 @@ +# -*- coding: utf-8 -*- +"""Morphological techniques for fitting baselines to experimental data. + +Created on March 25, 2026 +@author: Donald Erb + +""" + +import numpy as np +from scipy.ndimage import grey_opening + +from ..utils import _avg_opening, _make_window, relative_difference + + +from ._algorithm_setup import _handle_io + + +class _MorphologicalNDMixin: + """A mixin class for providing morphological methods for 1D and 2D.""" + + @_handle_io + def mor(self, data, half_window=None, window_kwargs=None, **kwargs): + """ + A Morphological based (Mor) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.estimate_window` and `window_kwargs`. + window_kwargs : dict, optional + A dictionary of keyword arguments to pass to :func:`.estimate_window` for + estimating the half window if `half_window` is None. Default is None. + **kwargs + + .. deprecated:: 1.2.0 + Passing additional keyword arguments is deprecated and will be removed in version + 1.4.0. 
Pass keyword arguments using `window_kwargs`. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + dict + A dictionary with the following items: + + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. + + """ + y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) + opening = grey_opening(y, _make_window(y, half_wind)) + baseline = np.minimum(opening, _avg_opening(y, half_wind, opening)) + + return baseline, {'half_window': half_wind} + + @_handle_io + def imor(self, data, half_window=None, tol=1e-3, max_iter=200, window_kwargs=None, **kwargs): + """ + An Improved Morphological based (IMor) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.estimate_window` and `window_kwargs`. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations. Default is 200. + window_kwargs : dict, optional + A dictionary of keyword arguments to pass to :func:`.estimate_window` for + estimating the half window if `half_window` is None. Default is None. + **kwargs + + .. deprecated:: 1.2.0 + Passing additional keyword arguments is deprecated and will be removed in version + 1.4.0. Pass keyword arguments using `window_kwargs`. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. 
+ params : dict + A dictionary with the following items: + + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + + References + ---------- + Dai, L., et al. An Automated Baseline Correction Method Based on Iterative + Morphological Operations. Applied Spectroscopy, 2018, 72(5), 731-739. + + """ + y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) + baseline = y + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline_new = np.minimum(y, _avg_opening(baseline, half_wind)) + calc_difference = relative_difference(baseline, baseline_new) + tol_history[i] = calc_difference + if calc_difference < tol: + break + baseline = baseline_new + + params = {'half_window': half_wind, 'tol_history': tol_history[:i + 1]} + return baseline, params + + @_handle_io + def tophat(self, data, half_window=None, window_kwargs=None, **kwargs): + """ + Estimates the baseline using a top-hat transformation (morphological opening). + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + half_window : int or Sequence[int, int], optional + The half-window used for the rows and columns, respectively, for the morphology + functions. If a single value is given, rows and columns will use the same value. + Default is None, which will optimize the half-window size using + :func:`.estimate_window` and `window_kwargs`. + window_kwargs : dict, optional + A dictionary of keyword arguments to pass to :func:`.estimate_window` for + estimating the half window if `half_window` is None. Default is None. + **kwargs + + .. 
deprecated:: 1.2.0 + Passing additional keyword arguments is deprecated and will be removed in version + 1.4.0. Pass keyword arguments using `window_kwargs`. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + dict + A dictionary with the following items: + + * 'half_window': np.ndarray[int, int] + The half windows used for the morphological calculations. + + Notes + ----- + The actual top-hat transformation is defined as `data - opening(data)`, where + `opening` is the morphological opening operation. This function, however, returns + `opening(data)`, since that is technically the baseline defined by the operation. + + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. + + """ + y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) + baseline = grey_opening(y, _make_window(y, half_wind)) + + return baseline, {'half_window': half_wind} diff --git a/pybaselines/_nd/pls.py b/pybaselines/_nd/pls.py new file mode 100644 index 0000000..2f0a306 --- /dev/null +++ b/pybaselines/_nd/pls.py @@ -0,0 +1,1279 @@ +# -*- coding: utf-8 -*- +"""Penalized Least Squares (PLS) methods for solving baselines. + +Generalized methods that cover both Whittaker smoothing and penalized spline (P-Spline) algorithms. + +Created on March 27, 2026 +@author: Donald Erb + +""" + +import warnings + +import numpy as np + +from .. 
import _weighting +from ..utils import ( + ParameterWarning, _mollifier_kernel, gaussian, pad_edges, padded_convolve, relative_difference +) +from .._validation import _check_scalar_variable +from ._algorithm_setup import _handle_io + + +class _PLSNDMixin: + """A mixin class for providing penalized least squares methods for 1D and 2D.""" + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None, + spline_degree=None, num_knots=25, num_eigens=(10, 10), return_dof=False): + """ + Fits the baseline using the asymmetric least squares (AsLS) algorithm. + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e6. + p : float, optional + The penalizing weighting factor. Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `1 - p` weight. Default is 1e-2. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 2 (second order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. 
Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + num_eigens : int or Sequence[int, int] or None, optional + The number of eigenvalues for eigendecomposition of the penalty matrices. Can be a + single value or a sequence of ints with length equal to the dimensions of `data`. + Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. Must be greater + than `diff_order`. Default is (10, 10). Only used if `data` is two dimensional + and `spline_degree` is not None. + return_dof : bool, optional + If True and `num_eigens` is not None, then the effective degrees of freedom for + each eigenvector will be calculated and returned in the parameter dictionary. + Default is False since the calculation takes time. Only used if `data` is + two dimensional. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. 
The type depends on the dimensions of `data` and if + `spline_degree` was None. + * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) + Only if `return_dof` is True. The effective degrees of freedom associated + with each eigenvector. Lower values signify that the eigenvector was + less important for the fit. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. + + References + ---------- + Eilers, P. A Perfect Smoother. Analytical Chemistry, 2003, 75(14), 3631-3636. + + Eilers, P., et al. Baseline correction with asymmetric least squares smoothing. + Leiden University Medical Centre Report, 2005, 1(1). + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots, num_eigens=num_eigens + ) + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = penalized_system.solve(y, weight_array) + new_weights = _weighting._asls(y, baseline, p) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i + 1], + 'result': result_class(penalized_system, weight_array) + } + if return_dof: + params['dof'] = params['result'].relative_dof() + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None, + spline_degree=None, num_knots=25, num_eigens=(10, 10), return_dof=False, + normalize_weights=False): + """ + Adaptive iteratively reweighted penalized least squares (airPLS) baseline. + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. 
+ lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e6. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 2 (second order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + num_eigens : int or Sequence[int, int] or None, optional + The number of eigenvalues for eigendecomposition of the penalty matrices. Can be a + single value or a sequence of ints with length equal to the dimensions of `data`. + Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. Must be greater + than `diff_order`. Default is (10, 10). Only used if `data` is two dimensional + and `spline_degree` is not None. 
+ return_dof : bool, optional + If True and `num_eigens` is not None, then the effective degrees of freedom for + each eigenvector will be calculated and returned in the parameter dictionary. + Default is False since the calculation takes time. Only used if `data` is + two dimensional. + normalize_weights : bool, optional + If True, will normalize the computed weights between 0 and 1 to potentially + improve the numerical stability. Set to False (default) to use the original + implementation, which sets weights for all negative residuals to be greater than 1. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. + * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) + Only if `return_dof` is True. The effective degrees of freedom associated + with each eigenvector. Lower values signify that the eigenvector was + less important for the fit. + + References + ---------- + Zhang, Z.M., et al. Baseline correction using adaptive iteratively + reweighted penalized least squares. Analyst, 2010, 135(5), 1138-1146. 
+ + """ + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots, num_eigens=num_eigens + ) + y_l1_norm = np.abs(y).sum() + tol_history = np.empty(max_iter + 1) + for i in range(1, max_iter + 2): + baseline = penalized_system.solve(y, weight_array) + new_weights, residual_l1_norm, exit_early = _weighting._airpls( + y, baseline, i, normalize_weights + ) + if exit_early: + i -= 1 # reduce i so that output tol_history indexing is correct + break + calc_difference = residual_l1_norm / y_l1_norm + tol_history[i - 1] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i], + 'result': result_class(penalized_system, weight_array) + } + if return_dof: + params['dof'] = params['result'].relative_dof() + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _arpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None, + spline_degree=None, num_knots=25, num_eigens=(10, 10), return_dof=False): + """ + Asymmetrically reweighted penalized least squares smoothing (arPLS). + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e6. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 2 (second order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. 
+ tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + num_eigens : int or Sequence[int, int] or None, optional + The number of eigenvalues for eigendecomposition of the penalty matrices. Can be a + single value or a sequence of ints with length equal to the dimensions of `data`. + Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. Must be greater + than `diff_order`. Default is (10, 10). Only used if `data` is two dimensional + and `spline_degree` is not None. + return_dof : bool, optional + If True and `num_eigens` is not None, then the effective degrees of freedom for + each eigenvector will be calculated and returned in the parameter dictionary. + Default is False since the calculation takes time. Only used if `data` is + two dimensional. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. 
The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. + * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) + Only if `return_dof` is True. The effective degrees of freedom associated + with each eigenvector. Lower values signify that the eigenvector was + less important for the fit. + + References + ---------- + Baek, S.J., et al. Baseline correction using asymmetrically reweighted + penalized least squares smoothing. Analyst, 2015, 140, 250-257. + + """ + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots, num_eigens=num_eigens + ) + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = penalized_system.solve(y, weight_array) + new_weights, exit_early = _weighting._arpls(y, baseline) + if exit_early: + i -= 1 # reduce i so that output tol_history indexing is correct + break + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i + 1], + 'result': result_class(penalized_system, weight_array) + } + if return_dof: + params['dof'] = params['result'].relative_dof() + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None, + spline_degree=None, num_knots=25, num_eigens=(10, 10), return_dof=False): + """ + Improved asymmetrically reweighted 
penalized least squares smoothing (IarPLS). + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e5. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 2 (second order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + num_eigens : int or Sequence[int, int] or None, optional + The number of eigenvalues for eigendecomposition of the penalty matrices. Can be a + single value or a sequence of ints with length equal to the dimensions of `data`. + Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. Must be greater + than `diff_order`. Default is (10, 10). 
Only used if `data` is two dimensional + and `spline_degree` is not None. + return_dof : bool, optional + If True and `num_eigens` is not None, then the effective degrees of freedom for + each eigenvector will be calculated and returned in the parameter dictionary. + Default is False since the calculation takes time. Only used if `data` is + two dimensional. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. + * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) + Only if `return_dof` is True. The effective degrees of freedom associated + with each eigenvector. Lower values signify that the eigenvector was + less important for the fit. + + References + ---------- + Ye, J., et al. Baseline correction method based on improved asymmetrically + reweighted penalized least squares for Raman spectrum. Applied Optics, 2020, + 59, 10933-10943. 
+ + """ + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots, num_eigens=num_eigens + ) + tol_history = np.empty(max_iter + 1) + for i in range(1, max_iter + 2): + baseline = penalized_system.solve(y, weight_array) + new_weights, exit_early = _weighting._iarpls(y, baseline, i) + if exit_early: + i -= 1 # reduce i so that output tol_history indexing is correct + break + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i - 1] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i], + 'result': result_class(penalized_system, weight_array) + } + if return_dof: + params['dof'] = params['result'].relative_dof() + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, + weights=None, spline_degree=None, num_knots=25, num_eigens=(10, 10), + return_dof=False): + """ + Peaked Signal's Asymmetric Least Squares Algorithm (psalsa). + + Similar to the asymmetric least squares (AsLS) algorithm, but applies an + exponential decay weighting to values greater than the baseline to allow + using a higher `p` value to better fit noisy data. + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e5. + p : float, optional + The penalizing weighting factor. Must be between 0 and 1. 
Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `1 - p` weight. Default is 0.5. + k : float, optional + A factor that controls the exponential decay of the weights for baseline + values greater than the data. Should be approximately the height at which + a value could be considered a peak. Default is None, which sets `k` to + one-tenth of the standard deviation of the input data. A large k value + will produce similar results to :meth:`~.Baseline.asls`. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 2 (second order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + num_eigens : int or Sequence[int, int] or None, optional + The number of eigenvalues for eigendecomposition of the penalty matrices. Can be a + single value or a sequence of ints with length equal to the dimensions of `data`. + Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. 
If None, will solve the linear system + using the full analytical solution, which is typically much slower. Must be greater + than `diff_order`. Default is (10, 10). Only used if `data` is two dimensional + and `spline_degree` is not None. + return_dof : bool, optional + If True and `num_eigens` is not None, then the effective degrees of freedom for + each eigenvector will be calculated and returned in the parameter dictionary. + Default is False since the calculation takes time. Only used if `data` is + two dimensional. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. + * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) + Only if `return_dof` is True. The effective degrees of freedom associated + with each eigenvector. Lower values signify that the eigenvector was + less important for the fit. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. Also raised if `k` is not greater + than 0. + + Notes + ----- + The exit criteria for the original algorithm was to check whether the signs + of the residuals do not change between two iterations, but the comparison of + the l2 norms of the weight arrays between iterations is used instead to be + more comparable to other Whittaker-smoothing-based algorithms. 
+ + References + ---------- + Oller-Moreno, S., et al. Adaptive Asymmetric Least Squares baseline estimation + for analytical instruments. 2014 IEEE 11th International Multi-Conference on + Systems, Signals, and Devices, 2014, 1-5. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots, num_eigens=num_eigens + ) + if k is None: + k = np.std(y) / 10 + else: + k = _check_scalar_variable(k, variable_name='k') + + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = penalized_system.solve(y, weight_array) + new_weights = _weighting._psalsa(y, baseline, p, k) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i + 1], + 'result': result_class(penalized_system, weight_array) + } + if return_dof: + params['dof'] = params['result'].relative_dof() + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _derpsalsa(self, data, lam=1e6, p=1e-2, k=None, diff_order=2, max_iter=50, tol=1e-3, + weights=None, spline_degree=None, num_knots=10, smooth_half_window=None, + num_smooths=16, pad_kwargs=None, num_eigens=(10, 10), **kwargs): + """ + Derivative Peak-Screening Asymmetric Least Squares Algorithm (derpsalsa). + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e5. 
+ p : float, optional + The penalizing weighting factor. Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `1 - p` weight. Default is 1e-2. + k : float, optional + A factor that controls the exponential decay of the weights for baseline + values greater than the data. Should be approximately the height at which + a value could be considered a peak. Default is None, which sets `k` to + one-tenth of the standard deviation of the input data. A large k value + will produce similar results to :meth:`~.Baseline.asls`. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 2 (second order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + smooth_half_window : int, optional + The half-window to use for smoothing the data before computing the first + and second derivatives. Default is None, which will use ``len(data) / 200``. + num_smooths : int, optional + The number of times to smooth the data before computing the first + and second derivatives. 
Default is 16. + pad_kwargs : dict, optional + A dictionary of keyword arguments to pass to :func:`.pad_edges` for padding + the edges of the data to prevent edge effects from smoothing. Default is None. + num_eigens : int or Sequence[int, int] or None, optional + The number of eigenvalues for eigendecomposition of the penalty matrices. Can be a + single value or a sequence of ints with length equal to the dimensions of `data`. + Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. Must be greater + than `diff_order`. Default is (10, 10). Only used if `data` is two dimensional + and `spline_degree` is not None. + **kwargs + + .. deprecated:: 1.2.0 + Passing additional keyword arguments is deprecated and will be removed in version + 1.4.0. Pass keyword arguments using `pad_kwargs`. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. Also raised if `k` is not greater + than 0. + + References + ---------- + Korepanov, V. 
Asymmetric least-squares baseline algorithm with peak screening for + automatic processing of the Raman spectra. Journal of Raman Spectroscopy. 2020, + 51(10), 2061-2065. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + # NOTE derpsalsa doesn't currently allow 2D + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots, num_eigens=num_eigens + ) + if k is None: + k = np.std(y) / 10 + else: + k = _check_scalar_variable(k, variable_name='k') + if smooth_half_window is None: + smooth_half_window = self._size // 200 + # could pad the data every iteration, but it is ~2-3 times slower and only affects + # the edges, so it's not worth it + self._deprecate_pad_kwargs(**kwargs) + pad_kwargs = pad_kwargs if pad_kwargs is not None else {} + y_smooth = pad_edges(y, smooth_half_window, **pad_kwargs, **kwargs) + if smooth_half_window > 0: + smooth_kernel = _mollifier_kernel(smooth_half_window) + for _ in range(num_smooths): + y_smooth = padded_convolve(y_smooth, smooth_kernel) + y_smooth = y_smooth[smooth_half_window:self._size + smooth_half_window] + + diff_y_1 = np.gradient(y_smooth) + diff_y_2 = np.gradient(diff_y_1) + # x @ x is same as (x**2).sum() but faster + rms_diff_1 = np.sqrt((diff_y_1 @ diff_y_1) / self._size) + rms_diff_2 = np.sqrt((diff_y_2 @ diff_y_2) / self._size) + + diff_1_weights = np.exp(-((diff_y_1 / rms_diff_1)**2) / 2) + diff_2_weights = np.exp(-((diff_y_2 / rms_diff_2)**2) / 2) + partial_weights = diff_1_weights * diff_2_weights + + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = penalized_system.solve(y, weight_array) + new_weights = _weighting._derpsalsa(y, baseline, p, k, partial_weights) + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = { + 
'weights': weight_array, 'tol_history': tol_history[:i + 1], + 'result': result_class(penalized_system, weight_array) + } + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _brpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=50, + tol_2=1e-3, weights=None, spline_degree=None, num_knots=10, num_eigens=(10, 10), + return_dof=False): + """ + Bayesian Reweighted Penalized Least Squares (BrPLS) baseline. + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e5. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 2 (second order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter_2 : int, optional + The number of iterations for updating the proportion of data occupied by peaks. + Default is 50. + tol_2 : float, optional + The exit criteria for the difference between the calculated proportion of data + occupied by peaks. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. 
+ num_knots : int or Sequence[int, int], optional + The number of knots for the splines. Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 10. Only used if + `spline_degree` is not None. + num_eigens : int or Sequence[int, int] or None, optional + The number of eigenvalues for eigendecomposition of the penalty matrices. Can be a + single value or a sequence of ints with length equal to the dimensions of `data`. + Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. Must be greater + than `diff_order`. Default is (10, 10). Only used if `data` is two dimensional + and `spline_degree` is not None. + return_dof : bool, optional + If True and `num_eigens` is not None, then the effective degrees of freedom for + each eigenvector will be calculated and returned in the parameter dictionary. + Default is False since the calculation takes time. Only used if `data` is + two dimensional. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. + * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) + Only if `return_dof` is True.
The effective degrees of freedom associated + with each eigenvector. Lower values signify that the eigenvector was + less important for the fit. + + References + ---------- + Wang, Q., et al. Spectral baseline estimation using penalized least squares + with weights derived from the Bayesian method. Nuclear Science and Techniques, + 2022, 140, 250-257. + + """ + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots, num_eigens=num_eigens + ) + beta = 0.5 + j_max = 0 + baseline = y + baseline_weights = weight_array + tol_history = np.zeros((max_iter_2 + 2, max(max_iter, max_iter_2) + 1)) + # implementation note: weight_array must always be updated since otherwise when + # reentering the inner loop, new_baseline and baseline would be the same; instead, + # use baseline_weights to track which weights produced the output baseline + for i in range(max_iter_2 + 1): + for j in range(max_iter + 1): + new_baseline = penalized_system.solve(y, weight_array) + new_weights, exit_early = _weighting._brpls(y, new_baseline, beta) + if exit_early: + j -= 1 # reduce j so that output tol_history indexing is correct + tol_2 = np.inf # ensure it exits outer loop + break + # Paper used norm(old - new) / norm(new) rather than old in the denominator, + # but I use old in the denominator instead to be consistent with all other + # algorithms; does not make a major difference + calc_difference = relative_difference(baseline, new_baseline) + tol_history[i + 1, j] = calc_difference + if calc_difference < tol: + if i == 0 and j == 0: # for cases where tol == inf + baseline = new_baseline + break + baseline_weights = weight_array + weight_array = new_weights + baseline = new_baseline + j_max = max(j, j_max) + + weight_array = new_weights + weight_mean = weight_array.mean() + calc_difference_2 = abs(beta + weight_mean - 1) + tol_history[0, i] = calc_difference_2 + if 
calc_difference_2 < tol_2: + break + beta = 1 - weight_mean + + params = { + 'weights': baseline_weights, 'tol_history': tol_history[:i + 2, :max(i, j_max) + 1], + 'result': result_class(penalized_system, baseline_weights) + } + if return_dof: + params['dof'] = params['result'].relative_dof() + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _lsrpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None, + spline_degree=None, num_knots=25, num_eigens=(10, 10), return_dof=False, + alternate_weighting=False): + """ + Locally Symmetric Reweighted Penalized Least Squares (LSRPLS). + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e5. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 2 (second order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. 
Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + num_eigens : int or Sequence[int, int] or None, optional + The number of eigenvalues for eigendecomposition of the penalty matrices. Can be a + single value or a sequence of ints with length equal to the dimensions of `data`. + Typical values are between 5 and 30, with higher values + needed for baselines with more curvature. If None, will solve the linear system + using the full analytical solution, which is typically much slower. Must be greater + than `diff_order`. Default is (10, 10). Only used if `data` is two dimensional + and `spline_degree` is not None. + return_dof : bool, optional + If True and `num_eigens` is not None, then the effective degrees of freedom for + each eigenvector will be calculated and returned in the parameter dictionary. + Default is False since the calculation takes time. Only used if `data` is + two dimensional. + alternate_weighting : bool, optional + If False (default), the weighting uses a prefactor term of ``10^t``, where ``t`` is + the iteration number, which is equation 8 within the LSRPLS paper [1]_. If True, uses + a prefactor term of ``exp(t)``. See the Notes section below for more details. + + .. versionadded:: 1.3.0 + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. 
+ * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. + * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) + Only if `return_dof` is True. The effective degrees of freedom associated + with each eigenvector. Lower values signify that the eigenvector was + less important for the fit. + + Notes + ----- + In the LSRPLS paper [1]_, the weighting equation is written with a prefactor term + of ``10^t``, where ``t`` is the iteration number, but the plotted weighting curve in + Figure 1 of the paper shows a prefactor term of ``exp(t)`` instead. Since it is ambiguous + which prefactor term is actually used for the algorithm, both are permitted by setting + `alternate_weighting` to True to use ``exp(t)`` and False to use ``10^t``. In practice, + the prefactor determines how quickly the weighting curve converts from a sigmoidal curve + to a step curve, and does not heavily influence the result. + + If ``alternate_weighting`` is True, the weighting is the same as the drPLS algorithm [2]_. + + References + ---------- + .. [1] Heng, Z., et al. Baseline correction for Raman Spectra Based on Locally Symmetric + Reweighted Penalized Least Squares. Chinese Journal of Lasers, 2018, 45(12), 1211001. + .. [2] Xu, D. et al. Baseline correction method based on doubly reweighted + penalized least squares, Applied Optics, 2019, 58, 3913-3920.
+ + """ + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots, num_eigens=num_eigens + ) + tol_history = np.empty(max_iter + 1) + for i in range(1, max_iter + 2): + baseline = penalized_system.solve(y, weight_array) + new_weights, exit_early = _weighting._lsrpls(y, baseline, i, alternate_weighting) + if exit_early: + i -= 1 # reduce i so that output tol_history indexing is correct + break + calc_difference = relative_difference(weight_array, new_weights) + tol_history[i - 1] = calc_difference + if calc_difference < tol: + break + weight_array = new_weights + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i], + 'result': result_class(penalized_system, weight_array) + } + if return_dof: + params['dof'] = params['result'].relative_dof() + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _mixture_model(self, data, lam=1e5, p=1e-2, num_knots=25, spline_degree=None, + diff_order=3, max_iter=50, tol=1e-3, weights=None, + symmetric=False): + """ + Considers the data as a mixture model composed of noise and peaks. + + Weights are iteratively assigned by calculating the probability each value in + the residual belongs to a normal distribution representing the noise. + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e5. + p : float, optional + The penalizing weighting factor. Must be between 0 and 1. Values greater + than the baseline will be given `p` weight, and values less than the baseline + will be given `1 - p` weight. 
Used to set the initial weights before performing + expectation-maximization. Default is 1e-2. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 3 (third order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 50. + tol : float, optional + The exit criteria. Default is 1e-3. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + symmetric : bool, optional + If False (default), the total mixture model will be composed of one normal + distribution for the noise and one uniform distribution for positive non-noise + residuals. If True, an additional uniform distribution will be added to the + mixture model for negative non-noise residuals. Only need to set `symmetric` + to True when peaks are both positive and negative. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. 
If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. + + Raises + ------ + ValueError + Raised if `p` is not between 0 and 1. + + References + ---------- + de Rooi, J., et al. Mixture models for baseline estimation. Chemometric and + Intelligent Laboratory Systems, 2012, 117, 56-60. + + Ghojogh, B., et al. Fitting A Mixture Distribution to Data: Tutorial. arXiv + preprint arXiv:1901.06708, 2019. + + """ + if not 0 < p < 1: + raise ValueError('p must be between 0 and 1') + + # NOTE mixture_model doesn't currently allow Whittaker smoothing + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots + ) + # scale y between -1 and 1 so that the residual fit is more numerically stable + # TODO is this still necessary now that expectation-maximization is used?
-> still + # helps to prevent overflows when using gaussian + y_domain = np.polynomial.polyutils.getdomain(y.ravel()) + y = np.polynomial.polyutils.mapdomain(y, y_domain, np.array([-1., 1.])) + + if weights is not None: + baseline = penalized_system.solve(y, weight_array) + else: + # perform 2 iterations: first is a least-squares fit and second is initial + # reweighted fit; 2 fits are needed to get weights to have a decent starting + # distribution for the expectation-maximization + if symmetric and not 0.2 < p < 0.8: + # p values far away from 0.5 with symmetric=True give bad initial weights + # for the expectation maximization + warnings.warn( + 'should use a p value closer to 0.5 when "symmetric" is True', + ParameterWarning, stacklevel=2 + ) + for _ in range(2): + baseline = penalized_system.solve(y, weight_array) + weight_array = _weighting._asls(y, baseline, p) + + residual = y - baseline + # the 0.2 * std(residual) is an "okay" starting sigma estimate + sigma = 0.2 * np.std(residual) + fraction_noise = 0.5 + if symmetric: + fraction_positive = 0.25 + else: + fraction_positive = 1 - fraction_noise + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + # expectation part of expectation-maximization -> calc pdfs and + # posterior probabilities + positive_pdf = np.where( + residual >= 0, fraction_positive / max(abs(residual.max()), 1e-6), 0 + ) + noise_pdf = ( + fraction_noise * gaussian(residual, 1 / (sigma * np.sqrt(2 * np.pi)), 0, sigma) + ) + total_pdf = noise_pdf + positive_pdf + if symmetric: + negative_pdf = np.where( + residual < 0, + (1 - fraction_noise - fraction_positive) / max(abs(residual.min()), 1e-6), + 0 + ) + total_pdf += negative_pdf + posterior_prob_noise = noise_pdf / np.maximum(total_pdf, np.finfo(float).eps) + + calc_difference = relative_difference(weight_array, posterior_prob_noise) + tol_history[i] = calc_difference + if calc_difference < tol: + break + + # maximization part of expectation-maximization -> update sigma 
and + # fractions of each pdf + noise_sum = posterior_prob_noise.sum() + # TODO can noise_sum ever be 0? Should terminate early if so + sigma = np.sqrt((posterior_prob_noise * residual**2).sum() / noise_sum) + if not symmetric: + fraction_noise = posterior_prob_noise.mean() + fraction_positive = 1 - fraction_noise + else: + posterior_prob_positive = positive_pdf / total_pdf + posterior_prob_negative = negative_pdf / total_pdf + + positive_sum = posterior_prob_positive.sum() + negative_sum = posterior_prob_negative.sum() + total_sum = noise_sum + positive_sum + negative_sum + + fraction_noise = noise_sum / total_sum + fraction_positive = positive_sum / total_sum + + weight_array = posterior_prob_noise + baseline = penalized_system.solve(y, weight_array) + residual = y - baseline + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i + 1], + 'result': result_class(penalized_system, weight_array) + } + + baseline = np.polynomial.polyutils.mapdomain(baseline, np.array([-1., 1.]), y_domain) + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def _irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=None, + diff_order=3, max_iter=100, tol=1e-6, weights=None, eps=None): + """ + Iterative Reweighted Spline Quantile Regression (IRSQR). + + Fits the baseline using quantile regression with penalized splines. + + Parameters + ---------- + data : array-like, shape (N,) or (M, N) + The y-values of the measured data. Must not contain missing data (NaN) or Inf. + lam : float or Sequence[float, float], optional + The smoothing parameter. Can be a single value or a sequence of floats with length + equal to the dimensions of `data`. Larger values will create smoother baselines. + Default is 1e3. + quantile : float, optional + The quantile at which to fit the baseline. Default is 0.05. + num_knots : int or Sequence[int, int], optional + The number of knots for the splines. 
Can be a single value or a sequence of ints + with length equal to the dimensions of `data`. Default is 25. Only used if + `spline_degree` is not None. + spline_degree : None or int or Sequence[int, int], optional + The degree of the splines. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Default is None, which will use Whittaker + smoothing. + diff_order : int or Sequence[int, int], optional + The order of the difference matrix. Can be a single value or a sequence of ints with + length equal to the dimensions of `data`. Must be greater than 0. + Default is 3 (third order difference matrix). + max_iter : int, optional + The max number of fit iterations. Default is 100. + tol : float, optional + The exit criteria. Default is 1e-6. + weights : array-like, shape (N,) or (M, N), optional + The weighting array. If None (default), then the initial weights + will be an array with the same shape as `data` with all values set to 1. + eps : float, optional + A small value added to the square of the residual to prevent dividing by 0. + Default is None, which uses the square of the maximum-absolute-value of the + fit each iteration multiplied by 1e-6. + + Returns + ------- + baseline : numpy.ndarray, shape (N,) or (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (N,) or (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'result': WhittakerResult or WhittakerResult2D or PSplineResult or PSplineResult2D + An object that can use the results of the fit to perform additional + calculations. The type depends on the dimensions of `data` and if + `spline_degree` was None. 
+ + Raises + ------ + ValueError + Raised if `quantile` is not between 0 and 1. + + References + ---------- + Han, Q., et al. Iterative Reweighted Quantile Regression Using Augmented Lagrangian + Optimization for Baseline Correction. 2018 5th International Conference on Information + Science and Control Engineering (ICISCE), 2018, 280-284. + + """ + if not 0 < quantile < 1: + raise ValueError('quantile must be between 0 and 1') + + # NOTE irsqr doesn't currently allow Whittaker smoothing + y, weight_array, penalized_system, result_class = self._setup_pls( + data, lam=lam, diff_order=diff_order, weights=weights, spline_degree=spline_degree, + num_knots=num_knots + ) + old_coef = np.zeros(penalized_system.tot_bases) + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + baseline = penalized_system.solve(y, weight_array) + calc_difference = relative_difference(old_coef, penalized_system.coef) + tol_history[i] = calc_difference + if calc_difference < tol: + break + old_coef = penalized_system.coef + weight_array = _weighting._quantile(y, baseline, quantile, eps) + + params = { + 'weights': weight_array, 'tol_history': tol_history[:i + 1], + 'result': result_class(penalized_system, weight_array) + } + + return baseline, params diff --git a/pybaselines/_nd/polynomial.py b/pybaselines/_nd/polynomial.py new file mode 100644 index 0000000..4ff7bb3 --- /dev/null +++ b/pybaselines/_nd/polynomial.py @@ -0,0 +1,742 @@ +# -*- coding: utf-8 -*- +"""Polynomial techniques for fitting baselines to experimental data.
+ +Created on March 11, 2026 +@author: Donald Erb + + +The function penalized_poly and associated loss functions were adapted from MATLAB code from +https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction +(accessed March 18, 2021), which was licensed under the BSD-2-clause below. + +License: 2-clause BSD + +Copyright (c) 2012, Vincent Mazet +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the distribution + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +""" + +import numpy as np + +from .. 
import _weighting +from ..utils import _MIN_FLOAT, relative_difference, _convert_coef, _convert_coef2d +from ._algorithm_setup import _handle_io + + +class _PolynomialNDMixin: + """A mixin class for providing polynomial methods for 1D and 2D.""" + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, + use_original=False, mask_initial_peaks=False, return_coef=False, max_cross=None): + """ + The modified polynomial (ModPoly) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for the rows and columns. If a single value is given, will use + that for both rows and columns. Default is 2. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations. Default is 250. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + use_original : bool, optional + If False (default), will compare the baseline of each iteration with + the y-values of that iteration [1]_ when choosing minimum values. If True, + will compare the baseline with the original y-values given by `data` [2]_. + mask_initial_peaks : bool, optional + If True, will mask any data where the initial baseline fit + the standard + deviation of the residual is less than measured data [3]_. Default is False. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the `x_data` and `z_data` values and return them in the params + dictionary. Default is False, since the conversion takes time. + max_cross : int, optional + The maximum degree for the cross terms. 
For example, if `max_cross` is 1, then + ``x * z**2``, ``x**2 * z``, and ``x**2 * z**2`` would all be set to 0. Default is + None, which does not limit the cross terms. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. + + Notes + ----- + Algorithm originally developed in [2]_ and then slightly modified in [1]_. + + References + ---------- + .. [1] Gan, F., et al. Baseline correction by improved iterative polynomial + fitting with automatic threshold. Chemometrics and Intelligent + Laboratory Systems, 2006, 82, 59-65. + .. [2] Lieber, C., et al. Automated method for subtraction of fluorescence + from biological raman spectra. Applied Spectroscopy, 2003, 57(11), + 1363-1367. + .. [3] Zhao, J., et al. Automated Autofluorescence Background Subtraction + Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy, + 2007, 61(11), 1225-1232. 
+ + """ + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True, + max_cross=max_cross + ) + + sqrt_w = np.sqrt(weight_array) + if use_original: + y0 = y + + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self._polynomial.vandermonde @ coef + if mask_initial_peaks: + # use baseline + deviation since without deviation, half of y should be above baseline + weight_array[baseline + np.std(y - baseline) < y] = 0 + sqrt_w = np.sqrt(weight_array) + pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self._polynomial.vandermonde) + + tol_history = np.empty(max_iter) + for i in range(max_iter): + baseline_old = baseline + y = np.minimum(y0 if use_original else y, baseline) + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self._polynomial.vandermonde @ coef + calc_difference = relative_difference(baseline_old, baseline) + tol_history[i] = calc_difference + if calc_difference < tol: + break + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + if return_coef: + if hasattr(self, 'z'): + params['coef'] = _convert_coef2d( + coef, *self._polynomial.poly_order, self.x_domain, self.z_domain + ) + else: + params['coef'] = _convert_coef(coef, self.x_domain) + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, + use_original=False, mask_initial_peaks=True, return_coef=False, + num_std=1., max_cross=None): + """ + The improved modified polynomial (IModPoly) baseline algorithm. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for the rows and columns. If a single value is given, will use + that for both rows and columns. Default is 2. + tol : float, optional + The exit criteria. Default is 1e-3. 
+ max_iter : int, optional + The maximum number of iterations. Default is 250. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + use_original : bool, optional + If False (default), will compare the baseline of each iteration with + the y-values of that iteration [1]_ when choosing minimum values. If True, + will compare the baseline with the original y-values given by `data` [2]_. + mask_initial_peaks : bool, optional + If True (default), will mask any data where the initial baseline fit + + the standard deviation of the residual is less than measured data [3]_. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the `x_data` and `z_data` values and return them in the params + dictionary. Default is False, since the conversion takes time. + num_std : float, optional + The number of standard deviations to include when thresholding. Default + is 1. Must be greater or equal to 0. + max_cross : int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + ``x * z**2``, ``x**2 * z``, and ``x**2 * z**2`` would all be set to 0. Default is + None, which does not limit the cross terms. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) + Only if `return_coef` is True. 
The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. + + Raises + ------ + ValueError + Raised if `num_std` is less than 0. + + Notes + ----- + Algorithm originally developed in [3]_. + + References + ---------- + .. [1] Gan, F., et al. Baseline correction by improved iterative polynomial + fitting with automatic threshold. Chemometrics and Intelligent + Laboratory Systems, 2006, 82, 59-65. + .. [2] Lieber, C., et al. Automated method for subtraction of fluorescence + from biological raman spectra. Applied Spectroscopy, 2003, 57(11), + 1363-1367. + .. [3] Zhao, J., et al. Automated Autofluorescence Background Subtraction + Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy, + 2007, 61(11), 1225-1232. + + """ + if num_std < 0: + raise ValueError('num_std must be greater than or equal to 0') + + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True, + copy_weights=True, max_cross=max_cross + ) + sqrt_w = np.sqrt(weight_array) + if use_original: + y0 = y + + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self._polynomial.vandermonde @ coef + deviation = np.std(sqrt_w * (y - baseline)) + if mask_initial_peaks: + weight_array[baseline + deviation < y] = 0 + sqrt_w = np.sqrt(weight_array) + pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self._polynomial.vandermonde) + + tol_history = np.empty(max_iter) + for i in range(max_iter): + y = np.minimum(y0 if use_original else y, baseline + num_std * deviation) + coef = pseudo_inverse @ (sqrt_w * y) + baseline = self._polynomial.vandermonde @ coef + new_deviation = np.std(sqrt_w * (y - baseline)) + # use new_deviation as dividing term in relative difference + calc_difference = relative_difference(new_deviation, deviation) + tol_history[i] = calc_difference + if calc_difference < tol: + break + deviation = new_deviation + + params = 
{'weights': weight_array, 'tol_history': tol_history[:i + 1]} + if return_coef: + if hasattr(self, 'z'): + params['coef'] = _convert_coef2d( + coef, *self._polynomial.poly_order, self.x_domain, self.z_domain + ) + else: + params['coef'] = _convert_coef(coef, self.x_domain) + + return baseline, params + + # adapted from + # https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction; + # see license above + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, + cost_function='asymmetric_truncated_quadratic', threshold=None, + alpha_factor=0.99, return_coef=False, max_cross=None): + """ + Fits a polynomial baseline using a non-quadratic cost function. + + The non-quadratic cost functions penalize residuals with larger values, + giving a more robust fit compared to normal least-squares. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for the rows and columns. If a single value is given, will use + that for both rows and columns. Default is 2. + tol : float, optional + The exit criteria. Default is 1e-3. + max_iter : int, optional + The maximum number of iterations. Default is 250. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + cost_function : str, optional + The non-quadratic cost function to minimize. Must indicate symmetry of the + method by prepending 'a' or 'asymmetric' for asymmetric loss, and 's' or + 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. 
+ Available methods, and their associated reference, are: + + * 'asymmetric_truncated_quadratic'[1]_ + * 'symmetric_truncated_quadratic'[1]_ + * 'asymmetric_huber'[1]_ + * 'symmetric_huber'[1]_ + * 'asymmetric_indec'[2]_ + * 'symmetric_indec'[2]_ + + threshold : float, optional + The threshold value for the loss method, where the function goes from + quadratic loss (such as used for least squares) to non-quadratic. For + symmetric loss methods, residual values with absolute value less than + threshold will have quadratic loss. For asymmetric loss methods, residual + values less than the threshold will have quadratic loss. Default is None, + which sets `threshold` to one-tenth of the standard deviation of the input + data. + alpha_factor : float, optional + A value between 0 and 1 that controls the value of the penalty. Default is + 0.99. Typically should not need to change this value. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the `x_data` and `z_data` values and return them in the params + dictionary. Default is False, since the conversion takes time. + max_cross : int, optional + The maximum degree for the cross terms. For example, if `max_cross` is 1, then + ``x * z**2``, ``x**2 * z``, and ``x**2 * z**2`` would all be set to 0. Default is + None, which does not limit the cross terms. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. 
+ * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. + + Raises + ------ + ValueError + Raised if `alpha_factor` is not between 0 and 1. + + Notes + ----- + In baseline literature, this procedure is sometimes called "backcor". + + References + ---------- + .. [1] Mazet, V., et al. Background removal from spectra by designing and + minimising a non-quadratic cost function. Chemometrics and Intelligent + Laboratory Systems, 2005, 76(2), 121-133. + .. [2] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline + Correction. Applied Spectroscopy, 2015, 69(7), 834-842. + + """ + if not 0 < alpha_factor <= 1: + raise ValueError('alpha_factor must be between 0 and 1') + symmetric_loss, method = _identify_loss_method(cost_function) + loss_function = { + 'huber': _huber_loss, + 'truncated_quadratic': _truncated_quadratic_loss, + 'indec': _indec_loss + }[method] + + y, weight_array, pseudo_inverse = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, calc_pinv=True, max_cross=max_cross + ) + if threshold is None: + threshold = np.std(y) / 10 + loss_kwargs = { + 'threshold': threshold, 'alpha_factor': alpha_factor, 'symmetric': symmetric_loss + } + + sqrt_w = np.sqrt(weight_array) + y = sqrt_w * y + + coef = pseudo_inverse @ y + baseline = self._polynomial.vandermonde @ coef + tol_history = np.empty(max_iter) + for i in range(max_iter): + baseline_old = baseline + coef = pseudo_inverse @ (y + loss_function(y - sqrt_w * baseline, **loss_kwargs)) + baseline = self._polynomial.vandermonde @ coef + calc_difference = relative_difference(baseline_old, baseline) + tol_history[i] = calc_difference + if calc_difference < tol: + break + + params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} + if return_coef: + 
if hasattr(self, 'z'): + params['coef'] = _convert_coef2d( + coef, *self._polynomial.poly_order, self.x_domain, self.z_domain + ) + else: + params['coef'] = _convert_coef(coef, self.x_domain) + + return baseline, params + + @_handle_io(sort_keys=('weights',), reshape_keys=('weights',)) + def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, + weights=None, eps=None, return_coef=False, max_cross=None): + """ + Approximates the baseline of the data using quantile regression. + + Parameters + ---------- + data : array-like, shape (M, N) + The y-values of the measured data. + poly_order : int or Sequence[int, int], optional + The polynomial orders for the rows and columns. If a single value is given, will use + that for both rows and columns. Default is 2. + quantile : float, optional + The quantile at which to fit the baseline. Default is 0.05. + tol : float, optional + The exit criteria. Default is 1e-6. For extreme quantiles (`quantile` < 0.01 + or `quantile` > 0.99), may need to use a lower value to get a good fit. + max_iter : int, optional + The maximum number of iterations. Default is 250. For extreme quantiles + (`quantile` < 0.01 or `quantile` > 0.99), may need to use a higher value to + ensure convergence. + weights : array-like, shape (M, N), optional + The weighting array. If None (default), then will be an array with + shape equal to (M, N) and all values set to 1. + eps : float, optional + A small value added to the square of the residual to prevent dividing by 0. + Default is None, which uses the square of the maximum-absolute-value of the + fit each iteration multiplied by 1e-6. + return_coef : bool, optional + If True, will convert the polynomial coefficients for the fit baseline to + a form that fits the `x_data` and `z_data` values and return them in the params + dictionary. Default is False, since the conversion takes time. + max_cross : int, optional + The maximum degree for the cross terms. 
For example, if `max_cross` is 1, then + ``x * z**2``, ``x**2 * z``, and ``x**2 * z**2`` would all be set to 0. Default is + None, which does not limit the cross terms. + + Returns + ------- + baseline : numpy.ndarray, shape (M, N) + The calculated baseline. + params : dict + A dictionary with the following items: + + * 'weights': numpy.ndarray, shape (M, N) + The weight array used for fitting the data. + * 'tol_history': numpy.ndarray + An array containing the calculated tolerance values for + each iteration. The length of the array is the number of iterations + completed. If the last value in the array is greater than the input + `tol` value, then the function did not converge. + * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) + Only if `return_coef` is True. The array of polynomial parameters + for the baseline, in increasing order. Can be used to create a + polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. + + Raises + ------ + ValueError + Raised if `quantile` is not between 0 and 1. + + Notes + ----- + Application of quantile regression for baseline fitting as described in [1]_. + + Performs quantile regression using iteratively reweighted least squares (IRLS) + as described in [2]_. + + References + ---------- + .. [1] Komsta, Ł. Comparison of Several Methods of Chromatographic + Baseline Removal with a New Approach Based on Quantile Regression. + Chromatographia, 2011, 73, 721-731. + .. [2] Schnabel, S., et al. Simultaneous estimation of quantile curves using + quantile sheets. AStA Advances in Statistical Analysis, 2013, 97, 77-87. + + """ + # TODO provide a way to estimate best poly_order based on AIC like in Komsta? 
could be + # useful for all polynomial methods; maybe could be an optimizer function + if not 0 < quantile < 1: + raise ValueError('quantile must be between 0 and 1.') + + y, weight_array = self._setup_polynomial( + data, weights, poly_order, calc_vander=True, max_cross=max_cross + ) + sqrt_w = np.sqrt(weight_array) + baseline_old = y + tol_history = np.empty(max_iter + 1) + for i in range(max_iter + 1): + coef = np.linalg.lstsq( + self._polynomial.vandermonde * sqrt_w[:, None], y * sqrt_w, None + )[0] + baseline = self._polynomial.vandermonde @ coef + # relative_difference(baseline_old, baseline, 1) gives nearly same result and + # the l2 norm is faster to calculate, so use that instead of l1 norm + calc_difference = relative_difference(baseline_old, baseline) + tol_history[i] = calc_difference + if calc_difference < tol: + break + sqrt_w = np.sqrt(_weighting._quantile(y, baseline, quantile, eps)) + baseline_old = baseline + + params = {'weights': sqrt_w**2, 'tol_history': tol_history[:i + 1]} + if return_coef: + if hasattr(self, 'z'): + params['coef'] = _convert_coef2d( + coef, *self._polynomial.poly_order, self.x_domain, self.z_domain + ) + else: + params['coef'] = _convert_coef(coef, self.x_domain) + + return baseline, params + + +# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction); +# see license above +def _huber_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): + """ + The Huber non-quadratic cost function. + + Parameters + ---------- + residual : numpy.ndarray, shape (N,) + The residual array. + threshold : float, optional + Any residual values below the threshold are given quadratic loss. + Default is 1.0. + alpha_factor : float, optional + The scale between 0 and 1 to multiply the cost function's alpha_max + value (see Notes below). Default is 0.99. + symmetric : bool, optional + If True (default), the cost function is symmetric and applies the same + weighting for positive and negative values. 
If False, will apply weights + asymmetrically so that only positive weights are given the non-quadratic + weighting and negative weights have normal, quadratic weighting. + + Returns + ------- + weights : numpy.ndarray, shape (N,) + The weight array. + + Notes + ----- + The returned result is:: + + -residual + alpha_factor * alpha_max * phi'(residual) + + where phi'(x) is the derivative of the huber loss function, phi(x). + + References + ---------- + Mazet, V., et al. Background removal from spectra by designing and + minimising a non-quadratic cost function. Chemometrics and Intelligent + Laboratory Systems, 2005, 76(2), 121-133. + + """ + alpha = alpha_factor * 0.5 # alpha_max for huber is 0.5 + if symmetric: + mask = (np.abs(residual) < threshold) + weights = ( + mask * residual * (2 * alpha - 1) + + (~mask) * 2 * alpha * threshold * np.sign(residual) + ) + else: + mask = (residual < threshold) + weights = ( + mask * residual * (2 * alpha - 1) + + (~mask) * (2 * alpha * threshold - residual) + ) + return weights + + +# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction); +# see license above +def _truncated_quadratic_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): + """ + The Truncated-Quadratic non-quadratic cost function. + + Parameters + ---------- + residual : numpy.ndarray, shape (N,) + The residual array. + threshold : float, optional + Any residual values below the threshold are given quadratic loss. + Default is 1.0. + alpha_factor : float, optional + The scale between 0 and 1 to multiply the cost function's alpha_max + value (see Notes below). Default is 0.99. + symmetric : bool, optional + If True (default), the cost function is symmetric and applies the same + weighting for positive and negative values. If False, will apply weights + asymmetrically so that only positive weights are given the non-quadratic + weighting and negative weights have normal, quadratic weighting.
+ + Returns + ------- + weights : numpy.ndarray, shape (N,) + The weight array. + + Notes + ----- + The returned result is:: + + -residual + alpha_factor * alpha_max * phi'(residual) + + where phi'(x) is the derivative of the truncated quadratic function, phi(x). + + References + ---------- + Mazet, V., et al. Background removal from spectra by designing and + minimising a non-quadratic cost function. Chemometrics and Intelligent + Laboratory Systems, 2005, 76(2), 121-133. + + """ + alpha = alpha_factor * 0.5 # alpha_max for truncated quadratic is 0.5 + if symmetric: + mask = (np.abs(residual) < threshold) + else: + mask = (residual < threshold) + return mask * residual * (2 * alpha - 1) - (~mask) * residual + + +def _indec_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): + """ + The Indec non-quadratic cost function. + + Parameters + ---------- + residual : numpy.ndarray, shape (N,) + The residual array. + threshold : float, optional + Any residual values below the threshold are given quadratic loss. + Default is 1.0. + alpha_factor : float, optional + The scale between 0 and 1 to multiply the cost function's alpha_max + value (see Notes below). Default is 0.99. + symmetric : bool, optional + If True (default), the cost function is symmetric and applies the same + weighting for positive and negative values. If False, will apply weights + asymmetrically so that only positive weights are given the non-quadratic + weighting and negative weights have normal, quadratic weighting. + + Returns + ------- + weights : numpy.ndarray, shape (N,) + The weight array. + + Notes + ----- + The returned result is:: + + -residual + alpha_factor * alpha_max * phi'(residual) + + where phi'(x) is the derivative of the Indec function, phi(x). + + References + ---------- + Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline + Correction. Applied Spectroscopy, 2015, 69(7), 834-842. + + Mazet, V., et al.
Background removal from spectra by designing and + minimising a non-quadratic cost function. Chemometrics and Intelligent + Laboratory Systems, 2005, 76(2), 121-133. + + """ + alpha = alpha_factor * 0.5 # alpha_max for indec is 0.5 + if symmetric: + mask = (np.abs(residual) < threshold) + multiple = np.sign(residual) + else: + mask = (residual < threshold) + # multiple=1 is same as sign(residual) since residual is always > 0 + # for asymmetric case, but this allows not doing the sign calculation + multiple = 1 + weights = ( + mask * residual * (2 * alpha - 1) + - (~mask) * ( + residual + alpha * multiple * threshold**3 / np.maximum(2 * residual**2, _MIN_FLOAT) + ) + ) + return weights + + +def _identify_loss_method(loss_method): + """ + Identifies the symmetry for the given loss method. + + Parameters + ---------- + loss_method : str + The loss method to use. Should have the symmetry identifier as + the prefix. + + Returns + ------- + symmetric : bool + True if `loss_method` had 's_' or 'symmetric_' as the prefix, else False. + str + The input `loss_method` value without the first section that indicated + the symmetry. + + Raises + ------ + ValueError + Raised if the loss method does not have the correct form. + + """ + prefix, *split_method = loss_method.lower().split('_') + if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: + raise ValueError('must specify loss function symmetry by prepending "a_" or "s_"') + if prefix in ('a', 'asymmetric'): + symmetric = False + else: + symmetric = True + return symmetric, '_'.join(split_method) diff --git a/pybaselines/_spline_utils.py b/pybaselines/_spline_utils.py index e85fa84..2b4a4ae 100644 --- a/pybaselines/_spline_utils.py +++ b/pybaselines/_spline_utils.py @@ -762,6 +762,19 @@ def __init__(self, spline_basis, lam=1, diff_order=2, allow_lower=True, reverse_ else: self._use_numba = False + @property + def shape(self): + """ + The shape of the data being fit by the penalized system. 
+ + Returns + ------- + tuple[int] + The shape of the data that the system corresponds to. + + """ + return (len(self.basis.x),) + @property def tck(self): """ @@ -832,7 +845,7 @@ def reset_penalty_diagonals(self, lam=1, diff_order=2, allow_lower=True, reverse ) # adapted from scipy (scipy/interpolate/_bsplines.py/make_lsq_spline); see license above - def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): + def solve(self, y, weights, penalty=None, rhs_extra=None): """ Solves the coefficients for a weighted penalized spline. @@ -909,7 +922,7 @@ def solve_pspline(self, y, weights, penalty=None, rhs_extra=None): if rhs_extra is not None: rhs = rhs + rhs_extra - self.coef = self.solve( + self.coef = self.direct_solve( lhs, rhs, overwrite_ab=True, overwrite_b=True, check_finite=False ) diff --git a/pybaselines/_validation.py b/pybaselines/_validation.py index 8af0e36..f31da91 100644 --- a/pybaselines/_validation.py +++ b/pybaselines/_validation.py @@ -186,7 +186,7 @@ def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=Tr if dimensions == 2 and 1 in output.shape: output = output.ravel() elif dimensions != 1: - raise ValueError('must be a one dimensional array') + raise ValueError('input data must be a one dimensional array') elif two_d: if dimensions < 2 or (dimensions == 2 and 1 in output.shape): raise ValueError( @@ -198,7 +198,7 @@ def _check_array(array, dtype=None, order=None, check_finite=False, ensure_1d=Tr flat_dims = ~np.equal(output_shape, 1) output = output.reshape(output_shape[flat_dims]) elif dimensions != 2: - raise ValueError('must be a two dimensional array') + raise ValueError('input data must be a two dimensional array') elif ensure_2d and not two_d: raise ValueError('two_d must be True if using ensure_2d') @@ -546,3 +546,26 @@ def _get_row_col_values(value, **asarray_kwargs): output = np.array([output[0], output[0], output[1], output[1]]) return output + + +def _check_spline_degree(spline_degree): + """ + 
Validates that input spline degrees are not None. + + Since some methods share the same code path for penalized spline and Whittaker + smoothing, whose logic is controlled by the input `spline_degree`, need + to ensure that the input for penalized spline methods is not None. + + Parameters + ---------- + spline_degree : int or Sequence[int] + The input spline degrees for 1D or 2D penalized spline methods. + + Raises + ------ + TypeError + Raised if `spline_degree` is None or contains None. + + """ + if None in _check_scalar(spline_degree, desired_length=2, fill_scalar=True)[0]: + raise TypeError('spline_degree cannot be None') diff --git a/pybaselines/_weighting.py b/pybaselines/_weighting.py index 2645ee5..cd9913e 100644 --- a/pybaselines/_weighting.py +++ b/pybaselines/_weighting.py @@ -407,7 +407,7 @@ def _aspls(y, baseline, asymmetric_coef=2., alternate_weighting=True): return weights, residual, exit_early -def _psalsa(y, baseline, p, k, shape_y): +def _psalsa(y, baseline, p, k): """ Weighting for the peaked signal's asymmetric least squares algorithm (psalsa). @@ -425,8 +425,6 @@ def _psalsa(y, baseline, p, k, shape_y): A factor that controls the exponential decay of the weights for baseline values greater than the data. Should be approximately the height at which a value could be considered a peak. - shape_y : int or (int,) or (int, int) - The length of `y`, `N`. Precomputed to avoid repeated calculations. 
Returns ------- @@ -443,13 +441,13 @@ def _psalsa(y, baseline, p, k, shape_y): residual = y - baseline # only use positive residual in exp to avoid exponential overflow warnings # and accidentally creating a weight of nan (inf * 0 = nan) - weights = np.full(shape_y, 1 - p, dtype=float) + weights = np.full(y.shape, 1 - p, dtype=float) mask = residual > 0 weights[mask] = p * np.exp(-residual[mask] / k) return weights -def _derpsalsa(y, baseline, p, k, shape_y, partial_weights): +def _derpsalsa(y, baseline, p, k, partial_weights): """ Weights for derivative peak-screening asymmetric least squares algorithm (derpsalsa). @@ -467,8 +465,6 @@ def _derpsalsa(y, baseline, p, k, shape_y, partial_weights): A factor that controls the exponential decay of the weights for baseline values greater than the data. Should be approximately the height at which a value could be considered a peak. - shape_y : int or (int,) or (int, int) - The length of `y`, `N`. Precomputed to avoid repeated calculations. partial_weights : numpy.ndarray, shape (N,) The weights associated with the first and second derivatives of the data. 
@@ -496,7 +492,7 @@ def _derpsalsa(y, baseline, p, k, shape_y, partial_weights): residual = y - baseline # no need for caution since inner exponential is always negative, but still mask # since it's faster than performing the square and exp on the full residual - weights = np.full(shape_y, 1 - p, dtype=float) + weights = np.full(y.shape, 1 - p, dtype=float) mask = residual > 0 weights[mask] = p * np.exp(-0.5 * ((residual[mask] / k)**2)) weights *= partial_weights diff --git a/pybaselines/classification.py b/pybaselines/classification.py index 224ba64..7f0b166 100644 --- a/pybaselines/classification.py +++ b/pybaselines/classification.py @@ -67,7 +67,7 @@ class _Classification(_Algorithm): """A base class for all classification algorithms.""" - @_Algorithm._register(sort_keys=('mask',), require_unique_x=True) + @_Algorithm._handle_io(sort_keys=('mask',), require_unique=True) def golotvin(self, data, half_window=None, num_std=2.0, sections=32, smooth_half_window=None, interp_half_window=5, weights=None, min_length=2, pad_kwargs=None, **kwargs): """ @@ -165,7 +165,7 @@ def golotvin(self, data, half_window=None, num_std=2.0, sections=32, smooth_half return baseline, {'mask': mask} - @_Algorithm._register(sort_keys=('mask',), require_unique_x=True) + @_Algorithm._handle_io(sort_keys=('mask',), require_unique=True) def dietrich(self, data, smooth_half_window=None, num_std=3.0, interp_half_window=5, poly_order=5, max_iter=50, tol=1e-3, weights=None, return_coef=False, min_length=2, pad_kwargs=None, **kwargs): @@ -299,7 +299,7 @@ def dietrich(self, data, smooth_half_window=None, num_std=3.0, interp_half_windo return baseline, params - @_Algorithm._register(sort_keys=('mask',), require_unique_x=True) + @_Algorithm._handle_io(sort_keys=('mask',), require_unique=True) def std_distribution(self, data, half_window=None, interp_half_window=5, fill_half_window=3, num_std=1.1, smooth_half_window=None, weights=None, pad_kwargs=None, **kwargs): @@ -399,7 +399,7 @@ def 
std_distribution(self, data, half_window=None, interp_half_window=5, return baseline, {'mask': mask} - @_Algorithm._register(sort_keys=('mask',), require_unique_x=True) + @_Algorithm._handle_io(sort_keys=('mask',), require_unique=True) def fastchrom(self, data, half_window=None, threshold=None, min_fwhm=None, interp_half_window=5, smooth_half_window=None, weights=None, max_iter=100, min_length=2, pad_kwargs=None, **kwargs): @@ -545,7 +545,7 @@ def fastchrom(self, data, half_window=None, threshold=None, min_fwhm=None, return baseline, {'mask': mask} - @_Algorithm._register(sort_keys=('mask',)) + @_Algorithm._handle_io(sort_keys=('mask',)) def cwt_br(self, data, poly_order=5, scales=None, num_std=1.0, min_length=2, max_iter=50, tol=1e-3, symmetric=False, weights=None, pad_kwargs=None, **kwargs): """ @@ -730,7 +730,7 @@ def cwt_br(self, data, poly_order=5, scales=None, num_std=1.0, min_length=2, return baseline, params - @_Algorithm._register(sort_keys=('mask', 'weights')) + @_Algorithm._handle_io(sort_keys=('mask', 'weights')) def fabc(self, data, lam=1e6, scale=None, num_std=3.0, diff_order=2, min_length=2, weights=None, weights_as_mask=False, pad_kwargs=None, **kwargs): """ @@ -839,10 +839,7 @@ def fabc(self, data, lam=1e6, scale=None, num_std=3.0, diff_order=2, min_length= whittaker_weights = whittaker_weights[self._inverted_order] whittaker_weights = whittaker_weights.astype(float) - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(whittaker_weights), whittaker_weights * y, - overwrite_b=True, overwrite_ab=True - ) + baseline = whittaker_system.solve(y, whittaker_weights) params = { 'mask': mask, 'weights': whittaker_weights, 'result': WhittakerResult(whittaker_system, whittaker_weights) @@ -850,7 +847,7 @@ def fabc(self, data, lam=1e6, scale=None, num_std=3.0, diff_order=2, min_length= return baseline, params - @_Algorithm._register(sort_keys=('mask',), require_unique_x=True) + @_Algorithm._handle_io(sort_keys=('mask',), require_unique=True) 
def rubberband(self, data, segments=1, lam=None, diff_order=2, weights=None, smooth_half_window=None, pad_kwargs=None, **kwargs): """ @@ -967,10 +964,7 @@ def rubberband(self, data, segments=1, lam=None, diff_order=2, weights=None, _, whittaker_weights, whittaker_system = self._setup_whittaker( y, lam, diff_order, mask ) - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(whittaker_weights), whittaker_weights * y, - overwrite_b=True, overwrite_ab=True - ) + baseline = whittaker_system.solve(y, whittaker_weights) params.update({ 'weights': whittaker_weights, 'result': WhittakerResult(whittaker_system, whittaker_weights) @@ -980,7 +974,7 @@ def rubberband(self, data, segments=1, lam=None, diff_order=2, weights=None, return baseline, params - @_Algorithm._register(sort_keys=('mask',), require_unique_x=True) + @_Algorithm._handle_io(sort_keys=('mask',), require_unique=True) def corner_cutting(self, data, max_iter=100, weights=None): """ Iteratively removes corner points and creates a Bezier spline from the remaining points. diff --git a/pybaselines/misc.py b/pybaselines/misc.py index 1a2cabc..497999e 100644 --- a/pybaselines/misc.py +++ b/pybaselines/misc.py @@ -82,7 +82,7 @@ class _Misc(_Algorithm): """A base class for all miscellaneous algorithms.""" - @_Algorithm._register + @_Algorithm._handle_io def interp_pts(self, data=None, baseline_points=(), interp_method='linear'): """ Creates a baseline by interpolating through input points. 
@@ -148,7 +148,7 @@ def interp_pts(self, data=None, baseline_points=(), interp_method='linear'): return baseline, {} - @_Algorithm._register(sort_keys=('signal',)) + @_Algorithm._handle_io(sort_keys=('signal',)) def beads(self, data, freq_cutoff=0.005, lam_0=None, lam_1=None, lam_2=None, asymmetry=6.0, filter_type=1, cost_function=2, max_iter=50, tol=1e-2, eps_0=1e-6, eps_1=1e-6, fit_parabola=True, smooth_half_window=None, alpha=1., parabola_len=3): diff --git a/pybaselines/morphological.py b/pybaselines/morphological.py index 4e723f9..9c76946 100644 --- a/pybaselines/morphological.py +++ b/pybaselines/morphological.py @@ -9,18 +9,21 @@ import warnings import numpy as np -from scipy.ndimage import grey_closing, grey_dilation, grey_erosion, grey_opening, uniform_filter1d +from scipy.ndimage import grey_closing, grey_erosion, grey_opening, uniform_filter1d from ._algorithm_setup import _Algorithm, _class_wrapper +from ._nd.morphological import _MorphologicalNDMixin from ._validation import _check_lam, _check_half_window from .results import PSplineResult, WhittakerResult -from .utils import _mollifier_kernel, _sort_array, pad_edges, padded_convolve, relative_difference +from .utils import ( + _avg_opening, _mollifier_kernel, _sort_array, pad_edges, padded_convolve, relative_difference +) -class _Morphological(_Algorithm): +class _Morphological(_Algorithm, _MorphologicalNDMixin): """A base class for all morphological algorithms.""" - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def mpls(self, data, half_window=None, lam=1e6, p=0.0, diff_order=2, tol=None, max_iter=None, weights=None, window_kwargs=None, **kwargs): r""" @@ -178,10 +181,7 @@ def mpls(self, data, half_window=None, lam=1e6, p=0.0, diff_order=2, tol=None, m w = _sort_array(w, self._inverted_order) _, weight_array, whittaker_system = self._setup_whittaker(y, lam, diff_order, w) - baseline = whittaker_system.solve( - 
whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_ab=True, overwrite_b=True - ) + baseline = whittaker_system.solve(y, weight_array) params = { 'weights': weight_array, 'half_window': half_wind, @@ -189,7 +189,6 @@ def mpls(self, data, half_window=None, lam=1e6, p=0.0, diff_order=2, tol=None, m } return baseline, params - @_Algorithm._register def mor(self, data, half_window=None, window_kwargs=None, **kwargs): """ A Morphological based (Mor) baseline algorithm. @@ -213,7 +212,7 @@ def mor(self, data, half_window=None, window_kwargs=None, **kwargs): Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. dict A dictionary with the following items: @@ -259,13 +258,8 @@ def mor(self, data, half_window=None, window_kwargs=None, **kwargs): >>> plt.show() """ - y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) - opening = grey_opening(y, [2 * half_wind + 1]) - baseline = np.minimum(opening, _avg_opening(y, half_wind, opening)) - - return baseline, {'half_window': half_wind} + return super().mor(data, half_window=half_window, window_kwargs=window_kwargs, **kwargs) - @_Algorithm._register def imor(self, data, half_window=None, tol=1e-3, max_iter=200, window_kwargs=None, **kwargs): """ An Improved Morphological based (IMor) baseline algorithm. @@ -293,9 +287,9 @@ def imor(self, data, half_window=None, tol=1e-3, max_iter=200, window_kwargs=Non Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'half_window': int @@ -312,21 +306,12 @@ def imor(self, data, half_window=None, tol=1e-3, max_iter=200, window_kwargs=Non Morphological Operations. Applied Spectroscopy, 2018, 72(5), 731-739. 
""" - y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) - baseline = y - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline_new = np.minimum(y, _avg_opening(baseline, half_wind)) - calc_difference = relative_difference(baseline, baseline_new) - tol_history[i] = calc_difference - if calc_difference < tol: - break - baseline = baseline_new - - params = {'half_window': half_wind, 'tol_history': tol_history[:i + 1]} - return baseline, params + return super().imor( + data, half_window=half_window, tol=tol, max_iter=max_iter, + window_kwargs=window_kwargs, **kwargs + ) - @_Algorithm._register + @_Algorithm._handle_io def amormol(self, data, half_window=None, tol=1e-3, max_iter=200, pad_kwargs=None, window_kwargs=None, **kwargs): """ @@ -407,7 +392,7 @@ def amormol(self, data, half_window=None, tol=1e-3, max_iter=200, pad_kwargs=Non params = {'half_window': half_wind, 'tol_history': tol_history[:i + 1]} return baseline[data_bounds], params - @_Algorithm._register + @_Algorithm._handle_io def mormol(self, data, half_window=None, tol=1e-3, max_iter=250, smooth_half_window=None, pad_kwargs=None, window_kwargs=None, **kwargs): """ @@ -493,7 +478,7 @@ def mormol(self, data, half_window=None, tol=1e-3, max_iter=250, smooth_half_win params = {'half_window': half_wind, 'tol_history': tol_history[:i + 1]} return baseline[data_bounds], params - @_Algorithm._register + @_Algorithm._handle_io def rolling_ball(self, data, half_window=None, smooth_half_window=None, pad_kwargs=None, window_kwargs=None, **kwargs): """ @@ -559,7 +544,7 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, return baseline, {'half_window': half_wind} - @_Algorithm._register + @_Algorithm._handle_io def mwmv(self, data, half_window=None, smooth_half_window=None, pad_kwargs=None, window_kwargs=None, **kwargs): """ @@ -623,7 +608,6 @@ def mwmv(self, data, half_window=None, smooth_half_window=None, return baseline, {'half_window': 
half_wind} - @_Algorithm._register def tophat(self, data, half_window=None, window_kwargs=None, **kwargs): """ Estimates the baseline using a top-hat transformation (morphological opening). @@ -647,7 +631,7 @@ def tophat(self, data, half_window=None, window_kwargs=None, **kwargs): Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. dict A dictionary with the following items: @@ -667,12 +651,9 @@ def tophat(self, data, half_window=None, window_kwargs=None, **kwargs): Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. """ - y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) - baseline = grey_opening(y, [2 * half_wind + 1]) - - return baseline, {'half_window': half_wind} + return super().tophat(data, half_window=half_window, window_kwargs=window_kwargs, **kwargs) - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def mpspline(self, data, half_window=None, lam=1e4, lam_smooth=1e-2, p=0.0, num_knots=100, spline_degree=3, diff_order=2, weights=None, pad_kwargs=None, window_kwargs=None, **kwargs): @@ -776,7 +757,7 @@ def mpspline(self, data, half_window=None, lam=1e4, lam_smooth=1e-2, p=0.0, # overestimated baseline; could alternatively just fit a p-spline to # 0.5 * (grey_closing(y, 3) + grey_opening(y, 3)), which averages noisy data better; # could add it as a boolean parameter - spline_fit = pspline.solve_pspline( + spline_fit = pspline.solve( y, weights=(y == grey_closing(y, 3)).astype(float, copy=False) ) if weights is None: @@ -798,7 +779,7 @@ def mpspline(self, data, half_window=None, lam=1e4, lam_smooth=1e-2, p=0.0, weight_array = np.where(spline_fit == optimal_opening, 1 - p, p) pspline.update_lam(lam) - baseline = pspline.solve_pspline(spline_fit, weight_array) + baseline = pspline.solve(spline_fit, weight_array) params = { 'half_window': half_window, 'weights': weight_array, 'result': 
PSplineResult(pspline, weight_array) @@ -806,7 +787,7 @@ def mpspline(self, data, half_window=None, lam=1e4, lam_smooth=1e-2, p=0.0, return baseline, params - @_Algorithm._register(sort_keys=('signal',)) + @_Algorithm._handle_io(sort_keys=('signal',)) def jbcd(self, data, half_window=None, alpha=0.1, beta=1e1, gamma=1., beta_mult=1.1, gamma_mult=0.909, diff_order=1, max_iter=20, tol=1e-2, tol_2=1e-3, robust_opening=True, window_kwargs=None, **kwargs): @@ -898,20 +879,15 @@ def jbcd(self, data, half_window=None, alpha=0.1, beta=1e1, gamma=1., beta_mult= baseline_old = opening signal_old = y - main_diag_idx = whittaker_system.main_diagonal_index partial_rhs_2 = (2 * alpha) * opening tol_history = np.empty((max_iter + 1, 2)) for i in range(max_iter + 1): - lhs_1 = gamma * whittaker_system.penalty - lhs_1[main_diag_idx] += 1 - lhs_2 = (2 * beta) * whittaker_system.penalty - lhs_2[main_diag_idx] += 1 + 2 * alpha - signal = whittaker_system.solve( - lhs_1, y - baseline_old, overwrite_ab=True, overwrite_b=True + y - baseline_old, weights=1, penalty=gamma * whittaker_system.penalty ) baseline = whittaker_system.solve( - lhs_2, y - signal + partial_rhs_2, overwrite_ab=True, overwrite_b=True + y - signal + partial_rhs_2, weights=1 + 2 * alpha, + penalty=(2 * beta) * whittaker_system.penalty ) calc_tol_1 = relative_difference(signal_old, signal) @@ -935,37 +911,6 @@ def jbcd(self, data, half_window=None, alpha=0.1, beta=1e1, gamma=1., beta_mult= _morphological_wrapper = _class_wrapper(_Morphological) -def _avg_opening(y, half_window, opening=None): - """ - Averages the dilation and erosion of a morphological opening on data. - - Parameters - ---------- - y : numpy.ndarray, shape (N,) - The array of the measured data. - half_window : int, optional - The half window size to use for the operations. - opening : numpy.ndarray, optional - The output of scipy.ndimage.grey_opening(y, window_size). Default is - None, which will compute the value. 
- - Returns - ------- - numpy.ndarray, shape (N,) - The average of the dilation and erosion of the opening. - - References - ---------- - Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for - Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64 595-600. - - """ - window_size = 2 * half_window + 1 - if opening is None: - opening = grey_opening(y, [window_size]) - return 0.5 * (grey_dilation(opening, [window_size]) + grey_erosion(opening, [window_size])) - - @_morphological_wrapper def mpls(data, half_window=None, lam=1e6, p=0.0, diff_order=2, tol=None, max_iter=None, weights=None, x_data=None, window_kwargs=None, **kwargs): diff --git a/pybaselines/optimizers.py b/pybaselines/optimizers.py index 0cc82e9..476be71 100644 --- a/pybaselines/optimizers.py +++ b/pybaselines/optimizers.py @@ -26,7 +26,7 @@ class _Optimizers(_Algorithm): """A base class for all optimizer algorithms.""" - @_Algorithm._register(ensure_1d=False, skip_sorting=True) + @_Algorithm._handle_io(ensure_dims=False, skip_sorting=True) def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=None): """ Collaborative Penalized Least Squares (collab-PLS). 
@@ -141,7 +141,7 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No return baselines, params - @_Algorithm._register(skip_sorting=True) + @_Algorithm._handle_io(skip_sorting=True) def optimize_extended_range(self, data, method='asls', side='both', width_scale=0.1, height_scale=1., sigma_scale=1 / 12, min_value=2, max_value=9, step=1, pad_kwargs=None, method_kwargs=None): @@ -388,7 +388,7 @@ def optimize_extended_range(self, data, method='asls', side='both', width_scale= return baseline, params - @_Algorithm._register(skip_sorting=True) + @_Algorithm._handle_io(skip_sorting=True) def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, constrained_fraction=0.01, constrained_weight=1e5, estimation_poly_order=2, method_kwargs=None): @@ -517,7 +517,7 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, return np.maximum.reduce(baselines), params - @_Algorithm._register + @_Algorithm._handle_io def custom_bc(self, data, method='asls', regions=((None, None),), sampling=1, lam=None, diff_order=2, method_kwargs=None): """ @@ -664,14 +664,11 @@ def custom_bc(self, data, method='asls', regions=((None, None),), sampling=1, la params['baseline_fit'] = baseline_fit if lam is not None and lam != 0: _, _, whittaker_system = self._setup_whittaker(y, lam=lam, diff_order=diff_order) - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(1.), baseline, - overwrite_ab=True, overwrite_b=True - ) + baseline = whittaker_system.solve(baseline, weights=1) return baseline, params - @_Algorithm._register(skip_sorting=True) + @_Algorithm._handle_io(skip_sorting=True) def optimize_pls(self, data, method='arpls', opt_method='U-Curve', min_value=4, max_value=7, step=0.5, method_kwargs=None, euclidean=False, rho=None, n_samples=0): """ diff --git a/pybaselines/polynomial.py b/pybaselines/polynomial.py index baafd84..cb8c3dc 100644 --- a/pybaselines/polynomial.py +++ b/pybaselines/polynomial.py @@ 
-5,38 +5,6 @@ @author: Donald Erb -The function penalized_poly was adapted from MATLAB code from -https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction -(accessed March 18, 2021), which was licensed under the BSD-2-clause below. - -License: 2-clause BSD - -Copyright (c) 2012, Vincent Mazet -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the distribution - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - - The function loess was adapted from code from https://gist.github.com/agramfort/850437 (accessed March 25, 2021), which was licensed under the BSD-3-clause below. @@ -78,16 +46,16 @@ import numpy as np -from . 
import _weighting +from ._nd import polynomial as polynomial_nd from ._algorithm_setup import _Algorithm, _class_wrapper from ._compat import _HAS_NUMBA, jit -from .utils import _MIN_FLOAT, ParameterWarning, _convert_coef, _interp_inplace, relative_difference +from .utils import ParameterWarning, _convert_coef, _interp_inplace, relative_difference -class _Polynomial(_Algorithm): +class _Polynomial(_Algorithm, polynomial_nd._PolynomialNDMixin): """A base class for all polynomial algorithms.""" - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def poly(self, data, poly_order=2, weights=None, return_coef=False): """ Computes a polynomial fit to the data. @@ -150,7 +118,6 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False): return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, use_original=False, mask_initial_peaks=False, return_coef=False): """ @@ -183,9 +150,9 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -217,39 +184,12 @@ def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, 2007, 61(11), 1225-1232. 
""" - y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True + return super().modpoly( + data, poly_order=poly_order, tol=tol, max_iter=max_iter, weights=weights, + use_original=use_original, mask_initial_peaks=mask_initial_peaks, + return_coef=return_coef ) - sqrt_w = np.sqrt(weight_array) - if use_original: - y0 = y - - coef = pseudo_inverse @ (sqrt_w * y) - baseline = self._polynomial.vandermonde @ coef - if mask_initial_peaks: - # use baseline + deviation since without deviation, half of y should be above baseline - weight_array[baseline + np.std(y - baseline) < y] = 0 - sqrt_w = np.sqrt(weight_array) - pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self._polynomial.vandermonde) - - tol_history = np.empty(max_iter) - for i in range(max_iter): - baseline_old = baseline - y = np.minimum(y0 if use_original else y, baseline) - coef = pseudo_inverse @ (sqrt_w * y) - baseline = self._polynomial.vandermonde @ coef - calc_difference = relative_difference(baseline_old, baseline) - tol_history[i] = calc_difference - if calc_difference < tol: - break - - params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} - if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) - - return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, use_original=False, mask_initial_peaks=True, return_coef=False, num_std=1.): """ @@ -285,9 +225,9 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -324,47 +264,12 @@ def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, 2007, 61(11), 1225-1232. 
""" - if num_std < 0: - raise ValueError('num_std must be greater than or equal to 0') - - y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True + return super().imodpoly( + data, poly_order=poly_order, tol=tol, max_iter=max_iter, weights=weights, + use_original=use_original, mask_initial_peaks=mask_initial_peaks, + return_coef=return_coef, num_std=num_std ) - sqrt_w = np.sqrt(weight_array) - if use_original: - y0 = y - - coef = pseudo_inverse @ (sqrt_w * y) - baseline = self._polynomial.vandermonde @ coef - deviation = np.std(sqrt_w * (y - baseline)) - if mask_initial_peaks: - weight_array[baseline + deviation < y] = 0 - sqrt_w = np.sqrt(weight_array) - pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self._polynomial.vandermonde) - - tol_history = np.empty(max_iter) - for i in range(max_iter): - y = np.minimum(y0 if use_original else y, baseline + num_std * deviation) - coef = pseudo_inverse @ (sqrt_w * y) - baseline = self._polynomial.vandermonde @ coef - new_deviation = np.std(sqrt_w * (y - baseline)) - # use new_deviation as dividing term in relative difference - calc_difference = relative_difference(new_deviation, deviation) - tol_history[i] = calc_difference - if calc_difference < tol: - break - deviation = new_deviation - - params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} - if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) - - return baseline, params - # adapted from - # https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction; - # see license above - @_Algorithm._register(sort_keys=('weights',)) def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, cost_function='asymmetric_truncated_quadratic', threshold=None, alpha_factor=0.99, return_coef=False): @@ -418,9 +323,9 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non Returns ------- - baseline : 
numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -453,46 +358,13 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non Correction. Applied Spectroscopy, 2015, 69(7), 834-842. """ - if not 0 < alpha_factor <= 1: - raise ValueError('alpha_factor must be between 0 and 1') - symmetric_loss, method = _identify_loss_method(cost_function) - loss_function = { - 'huber': _huber_loss, - 'truncated_quadratic': _truncated_quadratic_loss, - 'indec': _indec_loss - }[method] - - y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True + return super().penalized_poly( + data, poly_order=poly_order, tol=tol, max_iter=max_iter, weights=weights, + cost_function=cost_function, threshold=threshold, alpha_factor=alpha_factor, + return_coef=return_coef ) - if threshold is None: - threshold = np.std(y) / 10 - loss_kwargs = { - 'threshold': threshold, 'alpha_factor': alpha_factor, 'symmetric': symmetric_loss - } - - sqrt_w = np.sqrt(weight_array) - y = sqrt_w * y - - coef = pseudo_inverse @ y - baseline = self._polynomial.vandermonde @ coef - tol_history = np.empty(max_iter) - for i in range(max_iter): - baseline_old = baseline - coef = pseudo_inverse @ (y + loss_function(y - sqrt_w * baseline, **loss_kwargs)) - baseline = self._polynomial.vandermonde @ coef - calc_difference = relative_difference(baseline_old, baseline) - tol_history[i] = calc_difference - if calc_difference < tol: - break - - params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} - if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) - - return baseline, params - @_Algorithm._register(sort_keys=('weights', 'coef'), require_unique_x=True) + @_Algorithm._handle_io(sort_keys=('weights', 'coef'), require_unique=True) def loess(self, data, fraction=0.2, 
total_points=None, poly_order=1, scale=3.0, tol=1e-3, max_iter=10, symmetric_weights=False, use_threshold=False, num_std=1, use_original=False, weights=None, return_coef=False, @@ -713,7 +585,6 @@ def loess(self, data, fraction=0.2, total_points=None, poly_order=1, scale=3.0, return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, weights=None, eps=None, return_coef=False): """ @@ -748,9 +619,9 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -786,36 +657,12 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, quantile sheets. AStA Advances in Statistical Analysis, 2013, 97, 77-87. """ - # TODO provide a way to estimate best poly_order based on AIC like in Komsta? 
could be - # useful for all polynomial methods; maybe could be an optimizer function - if not 0 < quantile < 1: - raise ValueError('quantile must be between 0 and 1.') - - y, weight_array = self._setup_polynomial(data, weights, poly_order, calc_vander=True) - sqrt_w = np.sqrt(weight_array) - baseline_old = y - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - coef = np.linalg.lstsq( - self._polynomial.vandermonde * sqrt_w[:, None], y * sqrt_w, None - )[0] - baseline = self._polynomial.vandermonde @ coef - # relative_difference(baseline_old, baseline, 1) gives nearly same result and - # the l2 norm is faster to calculate, so use that instead of l1 norm - calc_difference = relative_difference(baseline_old, baseline) - tol_history[i] = calc_difference - if calc_difference < tol: - break - sqrt_w = np.sqrt(_weighting._quantile(y, baseline, quantile, eps)) - baseline_old = baseline - - params = {'weights': sqrt_w**2, 'tol_history': tol_history[:i + 1]} - if return_coef: - params['coef'] = _convert_coef(coef, self.x_domain) - - return baseline, params + return super().quant_reg( + data, poly_order=poly_order, quantile=quantile, tol=tol, max_iter=max_iter, + weights=weights, eps=eps, return_coef=return_coef + ) - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, cost_function='asymmetric_indec', peak_ratio=0.5, alpha_factor=0.99, tol_2=1e-3, tol_3=1e-6, max_iter_2=100, return_coef=False): @@ -915,9 +762,9 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, elif not 0 < peak_ratio < 1: raise ValueError('peak_ratio must be between 0 and 1') try: - symmetric_loss, method = _identify_loss_method(cost_function) + symmetric_loss, method = polynomial_nd._identify_loss_method(cost_function) except ValueError: # do not require a prefix since cost must be asymmetric - symmetric_loss, method = 
_identify_loss_method('a_' + cost_function) + symmetric_loss, method = polynomial_nd._identify_loss_method('a_' + cost_function) if symmetric_loss: # symmetric cost functions don't work due to how the up-down ratio vs # peak_ratio function was created in the reference; in theory, could simulate @@ -928,9 +775,9 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, raise ValueError('goldindec only works for asymmetric cost functions') loss_function = { - 'huber': _huber_loss, - 'truncated_quadratic': _truncated_quadratic_loss, - 'indec': _indec_loss + 'huber': polynomial_nd._huber_loss, + 'truncated_quadratic': polynomial_nd._truncated_quadratic_loss, + 'indec': polynomial_nd._indec_loss }[method] y, weight_array, pseudo_inverse = self._setup_polynomial( data, weights, poly_order, calc_vander=True, calc_pinv=True @@ -1199,211 +1046,6 @@ def imodpoly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=No """ -# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction); -# see license above -def _huber_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): - """ - The Huber non-quadratic cost function. - - Parameters - ---------- - residual : numpy.ndarray, shape (N,) - The residual array. - threshold : float, optional - Any residual values below the threshold are given quadratic loss. - Default is 1.0. - alpha_factor : float, optional - The scale between 0 and 1 to multiply the cost function's alpha_max - value (see Notes below). Default is 0.99. - symmetric : bool, optional - If True (default), the cost function is symmetric and applies the same - weighting for positive and negative values. If False, will apply weights - asymmetrically so that only positive weights are given the non-quadratic - weigting and negative weights have normal, quadratic weighting. - - Returns - ------- - weights : numpy.ndarray, shape (N,) - The weight array. 
- - Notes - ----- - The returned result is - - -residual + alpha_factor * alpha_max * phi'(residual) - - where phi'(x) is the derivative of the huber loss function, phi(x). - - References - ---------- - Mazet, V., et al. Background removal from spectra by designing and - minimising a non-quadratic cost function. Chemometrics and Intelligent - Laboratory Systems, 2005, 76(2), 121-133. - - """ - alpha = alpha_factor * 0.5 # alpha_max for huber is 0.5 - if symmetric: - mask = (np.abs(residual) < threshold) - weights = ( - mask * residual * (2 * alpha - 1) - + (~mask) * 2 * alpha * threshold * np.sign(residual) - ) - else: - mask = (residual < threshold) - weights = ( - mask * residual * (2 * alpha - 1) - + (~mask) * (2 * alpha * threshold - residual) - ) - return weights - - -# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction); -# see license above -def _truncated_quadratic_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): - """ - The Truncated-Quadratic non-quadratic cost function. - - Parameters - ---------- - residual : numpy.ndarray, shape (N,) - The residual array. - threshold : float, optional - Any residual values below the threshold are given quadratic loss. - Default is 1.0. - alpha_factor : float, optional - The scale between 0 and 1 to multiply the cost function's alpha_max - value (see Notes below). Default is 0.99. - symmetric : bool, optional - If True (default), the cost function is symmetric and applies the same - weighting for positive and negative values. If False, will apply weights - asymmetrically so that only positive weights are given the non-quadratic - weigting and negative weights have normal, quadratic weighting. - - Returns - ------- - weights : numpy.ndarray, shape (N,) - The weight array. - - Notes - ----- - The returned result is - - -residual + alpha_factor * alpha_max * phi'(residual) - - where phi'(x) is the derivative of the truncated quadratic function, phi(x). 
- - References - ---------- - Mazet, V., et al. Background removal from spectra by designing and - minimising a non-quadratic cost function. Chemometrics and Intelligent - Laboratory Systems, 2005, 76(2), 121-133. - - """ - alpha = alpha_factor * 0.5 # alpha_max for truncated quadratic is 0.5 - if symmetric: - mask = (np.abs(residual) < threshold) - else: - mask = (residual < threshold) - return mask * residual * (2 * alpha - 1) - (~mask) * residual - - -def _indec_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): - """ - The Indec non-quadratic cost function. - - Parameters - ---------- - residual : numpy.ndarray, shape (N,) - The residual array. - threshold : float, optional - Any residual values below the threshold are given quadratic loss. - Default is 1.0. - alpha_factor : float, optional - The scale between 0 and 1 to multiply the cost function's alpha_max - value (see Notes below). Default is 0.99. - symmetric : bool, optional - If True (default), the cost function is symmetric and applies the same - weighting for positive and negative values. If False, will apply weights - asymmetrically so that only positive weights are given the non-quadratic - weigting and negative weights have normal, quadratic weighting. - - Returns - ------- - weights : numpy.ndarray, shape (N,) - The weight array. - - Notes - ----- - The returned result is - - -residual + alpha_factor * alpha_max * phi'(residual) - - where phi'(x) is the derivative of the Indec function, phi(x). - - References - ---------- - Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline - Correction. Applied Spectroscopy, 2015, 69(7), 834-842. - - Mazet, V., et al. Background removal from spectra by designing and - minimising a non-quadratic cost function. Chemometrics and Intelligent - Laboratory Systems, 2005, 76(2), 121-133. 
- - """ - alpha = alpha_factor * 0.5 # alpha_max for indec is 0.5 - if symmetric: - mask = (np.abs(residual) < threshold) - multiple = np.sign(residual) - else: - mask = (residual < threshold) - # multiple=1 is same as sign(residual) since residual is always > 0 - # for asymmetric case, but this allows not doing the sign calculation - multiple = 1 - weights = ( - mask * residual * (2 * alpha - 1) - - (~mask) * ( - residual + alpha * multiple * threshold**3 / np.maximum(2 * residual**2, _MIN_FLOAT) - ) - ) - return weights - - -def _identify_loss_method(loss_method): - """ - Identifies the symmetry for the given loss method. - - Parameters - ---------- - loss_method : str - The loss method to use. Should have the symmetry identifier as - the prefix. - - Returns - ------- - symmetric : bool - True if `loss_method` had 's_' or 'symmetric_' as the prefix, else False. - str - The input `loss_method` value without the first section that indicated - the symmetry. - - Raises - ------ - ValueError - Raised if the loss method does not have the correct form. 
- - """ - prefix, *split_method = loss_method.lower().split('_') - if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: - raise ValueError('must specify loss function symmetry by prepending "a_" or "s_"') - if prefix in ('a', 'asymmetric'): - symmetric = False - else: - symmetric = True - return symmetric, '_'.join(split_method) - - -# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction); -# see license above @_polynomial_wrapper def penalized_poly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=None, cost_function='asymmetric_truncated_quadratic', diff --git a/pybaselines/results.py b/pybaselines/results.py index bed95a1..5952f83 100644 --- a/pybaselines/results.py +++ b/pybaselines/results.py @@ -7,21 +7,51 @@ """ import numpy as np -from scipy.linalg import solve from scipy.sparse import issparse -from scipy.sparse.linalg import factorized from ._banded_utils import _banded_to_sparse, _add_diagonals from ._compat import diags, _sparse_col_index from .utils import _get_rng +def _rademacher(shape, rng): + """ + Generates random samples from a Rademacher distribution, ie. equal chances of -1 or 1. + + Parameters + ---------- + shape : int or tuple[int, ...] + The shape of the random samples to create. + rng : int or numpy.random.Generator or numpy.random.RandomState + The integer for the seed of the random number generator or an existing generating + object to use for drawing samples. + + Returns + ------- + numpy.ndarray, shape `shape` + The generated random samples. + + References + ---------- + https://en.wikipedia.org/wiki/Rademacher_distribution + + Hutchinson, M. A stochastic estimator of the trace of the influence matrix for laplacian + smoothing splines. Communications in Statistics - Simulation and Computation, (1990), + 19(2), 433-450. + + """ + return _get_rng(rng).choice([-1., 1.], size=shape) + + class WhittakerResult: """ Represents the result of Whittaker smoothing. 
Provides methods for extending the solution obtained from baseline algorithms that use - Whittaker smoothing. This class should not be initialized by external users. + Whittaker smoothing. + + This class should **not** be initialized by external users since its + initialization signature may change without notice as internally required. """ @@ -55,60 +85,9 @@ def __init__(self, penalized_object, weights=None, lhs=None, rhs_extra=None): self._rhs_extra = rhs_extra self._trace = None if weights is None: - weights = np.ones(self._shape) + weights = np.ones(self._penalized_object.shape) self._weights = weights - @property - def _shape(self): - """The shape of the penalized system. - - Returns - ------- - tuple[int, int] - The penalized system's shape. - - """ - # TODO need to add an attribute to join 1D and 2D PenalizedSystem and PSpline objects - # so that this can just access that attribute rather than having to modify for each - # subclass - return self._basis_shape - - @property - def _size(self): - """The total size of the penalized system. - - Returns - ------- - int - The penalized system's size. - - """ - return np.prod(self._shape) - - @property - def _basis_shape(self): - """The shape of the system's basis matrix. - - Returns - ------- - tuple[int, int] - The penalized system's basis shape. - - """ - return self._penalized_object._num_bases - - @property - def _basis_size(self): - """The total size of the system's basis matrix. - - Returns - ------- - int - The system's basis matrix size. - - """ - return np.prod(self._basis_shape) - @property def _lhs(self): """ @@ -168,9 +147,9 @@ def effective_dimension(self, n_samples=0, rng=1234): If 0 (default), will calculate the analytical trace. Otherwise, will use stochastic trace estimation with a matrix of (N, `n_samples`) Rademacher random variables (ie. either -1 or 1). 
- rng : int or numpy.random.Generator or numpy.random.RandomState + rng : int or numpy.random.Generator or numpy.random.RandomState, optional The integer for the seed of the random number generator or an existing generating - object to use for the stochastic trace estimation. + object to use for the stochastic trace estimation. Default is 1234. Returns ------- @@ -242,14 +221,14 @@ def effective_dimension(self, n_samples=0, rng=1234): if self._rhs_extra is None: # note: about an order of magnitude faster to omit the sparse rhs for the simple # case of lhs @ v = w * y - eye = np.zeros(self._size) - for i in range(self._size): + eye = np.zeros(self._penalized_object.tot_bases) + for i in range(self._penalized_object.tot_bases): eye[i] = self._weights[i] trace += self._penalized_object.factorized_solve(factorization, eye)[i] eye[i] = 0 else: rhs = self._rhs.tocsc() - for i in range(self._basis_size): + for i in range(self._penalized_object.tot_bases): trace += self._penalized_object.factorized_solve( factorization, _sparse_col_index(rhs, i) )[i] @@ -257,13 +236,13 @@ def effective_dimension(self, n_samples=0, rng=1234): # prevent needing to calculate analytical solution again self._trace = trace else: - rng_samples = _get_rng(rng).choice([-1., 1.], size=(self._basis_size, n_samples)) + rng_samples = _rademacher((self._penalized_object.tot_bases, n_samples), rng) if self._rhs_extra is None: rhs_u = self._weights[:, None] * rng_samples else: rhs_u = self._rhs.tocsr() @ rng_samples # H @ u == (W + P)^-1 @ (W @ u) - hat_u = self._penalized_object.solve(self._lhs, rhs_u, overwrite_b=True) + hat_u = self._penalized_object.direct_solve(self._lhs, rhs_u, overwrite_b=True) # stochastic trace is the average of the trace of u.T @ H @ u; # trace(A.T @ B) == (A * B).sum() (see # https://en.wikipedia.org/wiki/Trace_(linear_algebra)#Trace_of_a_product ), @@ -279,7 +258,10 @@ class PSplineResult(WhittakerResult): Represents the result of penalized spline (P-Spline) smoothing. 
Provides methods for extending the solution obtained from baseline algorithms that use - P-Spline smoothing. This class should not be initialized by external users. + P-Spline smoothing. + + This class should **not** be initialized by external users since its + initialization signature may change without notice as internally required. """ @@ -321,18 +303,6 @@ def __init__(self, penalized_object, weights=None, rhs_extra=None, penalty=None) if penalty is not None: self._penalized_object.penalty = penalty - @property - def _shape(self): - """The shape of the penalized system. - - Returns - ------- - tuple[int, int] - The penalized system's shape. - - """ - return (len(self._penalized_object.basis.x),) - @property def _lhs(self): """ @@ -435,6 +405,9 @@ def effective_dimension(self, n_samples=0, rng=1234): If 0 (default), will calculate the analytical trace. Otherwise, will use stochastic trace estimation with a matrix of (N, `n_samples`) Rademacher random variables (ie. either -1 or 1). + rng : int or numpy.random.Generator or numpy.random.RandomState, optional + The integer for the seed of the random number generator or an existing generating + object to use for the stochastic trace estimation. Default is 1234. Returns ------- @@ -460,9 +433,6 @@ def effective_dimension(self, n_samples=0, rng=1234): Simplicity in Algorithms (SOSA), (2021), 142-155. """ - # TODO could maybe make default n_samples to None and decide to use analytical or - # stochastic trace based on data size; data size > 1000 use stochastic with default - # n_samples = 100? 
if n_samples == 0: if self._trace is not None: return self._trace @@ -480,16 +450,18 @@ def effective_dimension(self, n_samples=0, rng=1234): # hat matrix does not need to be stored in memory trace = 0 factorization = self._penalized_object.factorize(self._lhs) - for i in range(self._basis_size): + for i in range(self._penalized_object.tot_bases): trace += self._penalized_object.factorized_solve( factorization, _sparse_col_index(rhs, i) )[i] # prevent needing to calculate analytical solution again self._trace = trace else: - rng_samples = _get_rng(rng).choice([-1., 1.], size=(self._basis_size, n_samples)) + rng_samples = _rademacher((self._penalized_object.tot_bases, n_samples), rng) # H @ u == (B.T @ W @ B + P)^-1 @ (B.T @ W @ B) @ u - hat_u = self._penalized_object.solve(self._lhs, rhs @ rng_samples, overwrite_b=True) + hat_u = self._penalized_object.direct_solve( + self._lhs, rhs @ rng_samples, overwrite_b=True + ) # stochastic trace is the average of the trace of u.T @ H @ u; # trace(u.T @ H @ u) == sum(u * (H @ u)) trace = np.einsum('ij,ij->', rng_samples, hat_u) / n_samples @@ -502,7 +474,10 @@ class PSplineResult2D(PSplineResult): Represents the result of 2D penalized spline (P-Spline) smoothing. Provides methods for extending the solution obtained from baseline algorithms that use - P-Spline smoothing. This class should not be initialized by external users. + P-Spline smoothing. + + This class should **not** be initialized by external users since its + initialization signature may change without notice as internally required. """ @@ -541,19 +516,7 @@ def __init__(self, penalized_object, weights=None, rhs_extra=None, penalty=None) """ super().__init__(penalized_object, weights=weights, rhs_extra=rhs_extra, penalty=penalty) if self._weights.ndim == 1: - self._weights = self._weights.reshape(self._shape) - - @property - def _shape(self): - """The shape of the penalized system. - - Returns - ------- - tuple[int, int] - The penalized system's shape. 
- - """ - return (len(self._penalized_object.basis.x), len(self._penalized_object.basis.z)) + self._weights = self._weights.reshape(self._penalized_object.shape) @property def _lhs(self): @@ -607,11 +570,7 @@ def _btwb(self): The sparse object representing the matrix multiplication of ``B.T @ W @ B``. """ - # TODO can remove once PSpline and PSpline2D unify their btwb method calls; or - # just keep the docstring since the types are different - if self._btwb_ is None: - self._btwb_ = self._penalized_object.basis._make_btwb(self._weights) - return self._btwb_ + return super()._btwb # only overridden to note the return type difference @property def tck(self): @@ -633,83 +592,7 @@ def tck(self): The degree of the spline for the rows and columns. """ - # method only added to document differing output types compared to PSplineResult.tck - return super().tck - - def effective_dimension(self, n_samples=0, rng=1234): - """ - Calculates the effective dimension from the trace of the hat matrix. - - For typical P-spline smoothing, the linear equation would be - ``(B.T @ W @ B + lam * P) c = B.T @ W @ y`` and ``v = B @ c``. Then the hat matrix - would be ``B @ (B.T @ W @ B + lam * P)^-1 @ (B.T @ W)`` or, equivalently - ``(B.T @ W @ B + lam * P)^-1 @ (B.T @ W @ B)``. The latter expression is preferred - since it reduces the dimensionality. The effective dimension for the system - can be estimated as the trace of the hat matrix. - - Parameters - ---------- - n_samples : int, optional - If 0 (default), will calculate the analytical trace. Otherwise, will use stochastic - trace estimation with a matrix of (``M * N``, `n_samples`) Rademacher random variables - (eg. either -1 or 1). - - Returns - ------- - trace : float - The trace of the hat matrix, denoting the effective dimension for - the system. - - Raises - ------ - TypeError - Raised if `n_samples` is not an integer greater than or equal to 0. - - References - ---------- - Eilers, P., et al. 
Fast and compact smoothing on large multidimensional grids. Computational - Statistics and Data Analysis, 2006, 50(1), 61-76. - - Hutchinson, M. A stochastic estimator of the trace of the influence matrix for laplacian - smoothing splines. Communications in Statistics - Simulation and Computation, (1990), - 19(2), 433-450. - - Meyer, R., et al. Hutch++: Optimal Stochastic Trace Estimation. 2021 Symposium on - Simplicity in Algorithms (SOSA), (2021), 142-155. - - """ - # TODO unify the PSpline and PSpline2D method namings and availability for factorization - # and solving so that this can be directly inherited from the PSplineResult object - if n_samples == 0: - if self._trace is not None: - return self._trace - use_analytic = True - rhs_format = 'csc' - else: - if n_samples < 0 or not isinstance(n_samples, int): - raise TypeError('n_samples must be a non-negative integer') - use_analytic = False - rhs_format = 'csr' - - rhs = self._rhs.asformat(rhs_format) - if use_analytic: - # compute each diagonal of the hat matrix separately so that the full - # hat matrix does not need to be stored in memory - trace = 0 - factorization = factorized(self._lhs) - for i in range(self._basis_size): - trace += factorization(_sparse_col_index(rhs, i))[i] - # prevent needing to calculate analytical solution again - self._trace = trace - else: - rng_samples = _get_rng(rng).choice([-1., 1.], size=(self._basis_size, n_samples)) - # H @ u == (B.T @ W @ B + P)^-1 @ (B.T @ W @ B) @ u - hat_u = self._penalized_object.direct_solve(self._lhs, rhs @ rng_samples) - # stochastic trace is the average of the trace of u.T @ H @ u; - # trace(u.T @ H @ u) == sum(u * (H @ u)) - trace = np.einsum('ij,ij->', rng_samples, hat_u) / n_samples - - return trace + return super().tck # only overridden to note the return type difference class WhittakerResult2D(WhittakerResult): @@ -717,11 +600,14 @@ class WhittakerResult2D(WhittakerResult): Represents the result of 2D Whittaker smoothing. 
Provides methods for extending the solution obtained from baseline algorithms that use - Whittaker smoothing. This class should not be initialized by external users. + Whittaker smoothing. + + This class should **not** be initialized by external users since its + initialization signature may change without notice as internally required. """ - def __init__(self, penalized_object, weights=None, lhs=None, rhs_extra=None, penalty=None): + def __init__(self, penalized_object, weights=None, lhs=None, rhs_extra=None): """ Initializes the result object. @@ -743,10 +629,6 @@ def __init__(self, penalized_object, weights=None, lhs=None, rhs_extra=None, pen rhs_extra : scipy.sparse.sparray or scipy.sparse.spmatrix, optional Additional terms besides the weights within the right hand side of the hat matrix. Default is None. - penalty : scipy.sparse.sparray or scipy.sparse.spmatrix, optional - The penalty `P` for the system in full, sparse format. If None (default), will use - ``penalized_object.penalty``. If given, will overwrite ``penalized_object.penalty`` - with the given penalty. Raises ------ @@ -756,33 +638,12 @@ def __init__(self, penalized_object, weights=None, lhs=None, rhs_extra=None, pen """ super().__init__(penalized_object, weights=weights, lhs=lhs, rhs_extra=rhs_extra) self._btwb_ = None - if penalty is not None: - if lhs is not None: - raise ValueError('both `lhs` and `penalty` cannot be supplied') - self._penalized_object.penalty = penalty if self._penalized_object._using_svd and self._weights.ndim == 1: - self._weights = self._weights.reshape(self._shape) + self._weights = self._weights.reshape(self._penalized_object.shape) elif not self._penalized_object._using_svd and self._weights.ndim == 2: self._weights = self._weights.ravel() - @property - def _shape(self): - """The shape of the penalized system. - - Returns - ------- - tuple[int, int] - The penalized system's shape. 
- - """ - # TODO replace/remove once PenalizedSystem2D and WhittakerSystem2D are unified - if hasattr(self._penalized_object, '_num_points'): - shape = self._penalized_object._num_points - else: - shape = self._penalized_object._num_bases - return shape - @property def _btwb(self): """ @@ -856,8 +717,8 @@ def relative_dof(self): dof : numpy.ndarray, shape (P, Q) The relative effective degrees of freedom associated with each eigenvector used for the fit. Each individual effective degree of freedom value is between - 0 and 1, with lower values signifying that the eigenvector was less important - for the fit. + 0 and 1, with lower values signifying that the eigenvector contributed less + to the fit. Raises ------ @@ -870,8 +731,10 @@ def relative_dof(self): raise ValueError( 'Cannot calculate degrees of freedom when not using eigendecomposition' ) - dof = solve(self._lhs, self._btwb, check_finite=False, assume_a='pos') - return dof.diagonal().reshape(self._basis_shape) + dof = self._penalized_object.direct_solve( + self._lhs, self._btwb, check_finite=False, assume_a='pos' + ) + return dof.diagonal().reshape(self._penalized_object._num_bases) def effective_dimension(self, n_samples=0, rng=1234): """ @@ -891,6 +754,9 @@ def effective_dimension(self, n_samples=0, rng=1234): If 0 (default), will calculate the analytical trace. Otherwise, will use stochastic trace estimation with a matrix of (``M * N``, `n_samples`) Rademacher random variables (eg. either -1 or 1). + rng : int or numpy.random.Generator or numpy.random.RandomState, optional + The integer for the seed of the random number generator or an existing generating + object to use for the stochastic trace estimation. Default is 1234. 
Returns ------- @@ -930,9 +796,10 @@ def effective_dimension(self, n_samples=0, rng=1234): if n_samples < 0 or not isinstance(n_samples, int): raise TypeError('n_samples must be a non-negative integer') use_analytic = False - rng_samples = _get_rng(rng).choice([-1., 1.], size=(self._basis_size, n_samples)) - if self._penalized_object._using_svd: + if not self._penalized_object._using_svd: + trace = super().effective_dimension(n_samples=n_samples, rng=rng) + else: # NOTE the only Whittaker-based algorithms that allow performing SVD for solving # all use the simple (W + P) v = w * y formulation, so no need to implement for # rhs_extra @@ -944,45 +811,14 @@ def effective_dimension(self, n_samples=0, rng=1234): trace = self.relative_dof().sum() self._trace = trace else: + rng_samples = _rademacher((self._penalized_object.tot_bases, n_samples), rng) # H @ u == (B.T @ W @ B + P)^-1 @ (B.T @ W @ B) @ u - hat_u = solve( + hat_u = self._penalized_object.direct_solve( self._lhs, self._rhs @ rng_samples, overwrite_b=True, check_finite=False, assume_a='pos' ) # stochastic trace is the average of the trace of u.T @ H @ u; # trace(u.T @ H @ u) == sum(u * (H @ u)) trace = np.einsum('ij,ij->', rng_samples, hat_u) / n_samples - else: - # TODO unify PenalizedSystem and PenalizedSystem2D methods so that this can be - # directly inherited from WhittakerResult - if use_analytic: - # compute each diagonal of the hat matrix separately so that the full - # hat matrix does not need to be stored in memory - trace = 0 - factorization = factorized(self._lhs) - if self._rhs_extra is None: - # note: about an order of magnitude faster to omit the sparse rhs for the simple - # case of lhs @ v = w * y - eye = np.zeros(self._size) - for i in range(self._size): - eye[i] = self._weights[i] - trace += factorization(eye)[i] - eye[i] = 0 - else: - rhs = self._rhs.tocsc() - for i in range(self._basis_size): - trace += factorization(_sparse_col_index(rhs, i))[i] - self._trace = trace - - else: - if 
self._rhs_extra is None: - rhs_u = self._weights[:, None] * rng_samples - else: - rhs_u = self._rhs.tocsr() @ rng_samples - # H @ u == (W + P)^-1 @ (W @ u) - hat_u = self._penalized_object.direct_solve(self._lhs, rhs_u) - # stochastic trace is the average of the trace of u.T @ H @ u; - # trace(u.T @ H @ u) == sum(u * (H @ u)) - trace = np.einsum('ij,ij->', rng_samples, hat_u) / n_samples return trace diff --git a/pybaselines/smooth.py b/pybaselines/smooth.py index a45efba..e210992 100644 --- a/pybaselines/smooth.py +++ b/pybaselines/smooth.py @@ -24,7 +24,7 @@ class _Smooth(_Algorithm): """A base class for all smoothing algorithms.""" - @_Algorithm._register + @_Algorithm._handle_io def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=None, pad_kwargs=None, **kwargs): """ @@ -85,7 +85,7 @@ def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=No baseline = padded_convolve(median, gaussian_kernel(smooth_window, sigma)) return baseline[half_window:-half_window], {} - @_Algorithm._register + @_Algorithm._handle_io def snip(self, data, max_half_window=None, decreasing=False, smooth_half_window=None, filter_order=2, pad_kwargs=None, **kwargs): """ @@ -265,7 +265,7 @@ def snip(self, data, max_half_window=None, decreasing=False, smooth_half_window= return baseline[max_of_half_windows:-max_of_half_windows], {} - @_Algorithm._register + @_Algorithm._handle_io def swima(self, data, min_half_window=3, max_half_window=None, smooth_half_window=None, pad_kwargs=None, **kwargs): """ @@ -372,7 +372,7 @@ def swima(self, data, min_half_window=3, max_half_window=None, smooth_half_windo return baseline[data_slice], {'half_window': half_windows, 'converged': converges} - @_Algorithm._register + @_Algorithm._handle_io def ipsa(self, data, half_window=None, max_iter=500, tol=None, roi=None, original_criteria=False, pad_kwargs=None, **kwargs): """ @@ -470,7 +470,7 @@ def ipsa(self, data, half_window=None, max_iter=500, tol=None, roi=None, 
return baseline[data_slice], {'tol_history': tol_history[:i + 1]} - @_Algorithm._register + @_Algorithm._handle_io def ria(self, data, half_window=None, max_iter=500, tol=1e-2, side='both', width_scale=0.1, height_scale=1., sigma_scale=1 / 12, pad_kwargs=None, **kwargs): """ @@ -614,7 +614,7 @@ def ria(self, data, half_window=None, max_iter=500, tol=1e-2, side='both', return baseline, {'tol_history': tol_history[:i + 1]} - @_Algorithm._register + @_Algorithm._handle_io def peak_filling(self, data, half_window=None, sections=None, max_iter=5, lam_smooth=None): """ The 4S (Smooth, Subsample, Suppress, Stretch) Peak Filling algorithm. @@ -711,7 +711,7 @@ def peak_filling(self, data, half_window=None, sections=None, max_iter=5, lam_sm if lam_smooth is not None and lam_smooth > 0: _, _, whittaker_system = self._setup_whittaker(data, lam_smooth, diff_order=2) - data = whittaker_system.solve(whittaker_system.add_diagonal(1.), data) + data = whittaker_system.solve(data, weights=1) for i, (left_idx, right_idx) in enumerate(zip(indices[:-1], indices[1:])): y_truncated[i] = data[left_idx:right_idx].min() diff --git a/pybaselines/spline.py b/pybaselines/spline.py index 2a15f9e..4450ded 100644 --- a/pybaselines/spline.py +++ b/pybaselines/spline.py @@ -14,19 +14,18 @@ from . 
import _weighting from ._algorithm_setup import _Algorithm, _class_wrapper from ._banded_utils import _add_diagonals, _shift_rows, _sparse_to_banded, diff_penalty_matrix +from ._nd.pls import _PLSNDMixin from ._spline_utils import _basis_midpoints -from ._validation import _check_lam, _check_optional_array, _check_scalar_variable -from .results import PSplineResult -from .utils import ( - ParameterWarning, _mollifier_kernel, _sort_array, gaussian, pad_edges, padded_convolve, - relative_difference, _MIN_FLOAT +from ._validation import ( + _check_lam, _check_optional_array, _check_scalar_variable, _check_spline_degree ) +from .results import PSplineResult +from .utils import _sort_array, relative_difference -class _Spline(_Algorithm): +class _Spline(_Algorithm, _PLSNDMixin): """A base class for all spline algorithms.""" - @_Algorithm._register(sort_keys=('weights',)) def mixture_model(self, data, lam=1e5, p=1e-2, num_knots=100, spline_degree=3, diff_order=3, max_iter=50, tol=1e-3, weights=None, symmetric=False, num_bins=None): """ @@ -78,9 +77,9 @@ def mixture_model(self, data, lam=1e5, p=1e-2, num_knots=100, spline_degree=3, d Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -97,7 +96,7 @@ def mixture_model(self, data, lam=1e5, p=1e-2, num_knots=100, spline_degree=3, d Raises ------ ValueError - Raised if p is not between 0 and 1. + Raised if `p` is not between 0 and 1. References ---------- @@ -108,105 +107,18 @@ def mixture_model(self, data, lam=1e5, p=1e-2, num_knots=100, spline_degree=3, d preprint arXiv:1901.06708, 2019. 
""" - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') if num_bins is not None: warnings.warn( '"num_bins" was deprecated in version 1.1.0 and will be removed in version 1.3.0', DeprecationWarning, stacklevel=2 ) - - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._mixture_model( + data, lam=lam, p=p, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots, + symmetric=symmetric ) - # scale y between -1 and 1 so that the residual fit is more numerically stable - # TODO is this still necessary now that expectation-maximization is used? -> still - # helps to prevent overflows when using gaussian - y_domain = np.polynomial.polyutils.getdomain(y) - y = np.polynomial.polyutils.mapdomain(y, y_domain, np.array([-1., 1.])) - if weights is not None: - baseline = pspline.solve_pspline(y, weight_array) - else: - # perform 2 iterations: first is a least-squares fit and second is initial - # reweighted fit; 2 fits are needed to get weights to have a decent starting - # distribution for the expectation-maximization - if symmetric and not 0.2 < p < 0.8: - # p values far away from 0.5 with symmetric=True give bad initial weights - # for the expectation maximization - warnings.warn( - 'should use a p value closer to 0.5 when symmetric is True', - ParameterWarning, stacklevel=2 - ) - for _ in range(2): - baseline = pspline.solve_pspline(y, weight_array) - weight_array = _weighting._asls(y, baseline, p) - - residual = y - baseline - # the 0.2 * std(residual) is an "okay" starting sigma estimate - sigma = 0.2 * np.std(residual) - fraction_noise = 0.5 - if symmetric: - fraction_positive = 0.25 - else: - fraction_positive = 1 - fraction_noise - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - # expectation part of expectation-maximization -> calc pdfs and - # posterior 
probabilities - positive_pdf = np.where( - residual >= 0, fraction_positive / max(abs(residual.max()), 1e-6), 0 - ) - noise_pdf = ( - fraction_noise * gaussian(residual, 1 / (sigma * np.sqrt(2 * np.pi)), 0, sigma) - ) - total_pdf = noise_pdf + positive_pdf - if symmetric: - negative_pdf = np.where( - residual < 0, - (1 - fraction_noise - fraction_positive) / max(abs(residual.min()), 1e-6), - 0 - ) - total_pdf += negative_pdf - posterior_prob_noise = noise_pdf / np.maximum(total_pdf, _MIN_FLOAT) - - calc_difference = relative_difference(weight_array, posterior_prob_noise) - tol_history[i] = calc_difference - if calc_difference < tol: - break - - # maximization part of expectation-maximization -> update sigma and - # fractions of each pdf - noise_sum = posterior_prob_noise.sum() - sigma = np.sqrt((posterior_prob_noise * residual**2).sum() / noise_sum) - if not symmetric: - fraction_noise = posterior_prob_noise.mean() - fraction_positive = 1 - fraction_noise - else: - posterior_prob_positive = positive_pdf / total_pdf - posterior_prob_negative = negative_pdf / total_pdf - - positive_sum = posterior_prob_positive.sum() - negative_sum = posterior_prob_negative.sum() - total_sum = noise_sum + positive_sum + negative_sum - - fraction_noise = noise_sum / total_sum - fraction_positive = positive_sum / total_sum - - weight_array = posterior_prob_noise - baseline = pspline.solve_pspline(y, weight_array) - residual = y - baseline - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult(pspline, weight_array) - } - - baseline = np.polynomial.polyutils.mapdomain(baseline, np.array([-1., 1.]), y_domain) - - return baseline, params - - @_Algorithm._register(sort_keys=('weights',)) def irsqr(self, data, lam=100, quantile=0.05, num_knots=100, spline_degree=3, diff_order=3, max_iter=100, tol=1e-6, weights=None, eps=None): """ @@ -245,9 +157,9 @@ def irsqr(self, data, lam=100, quantile=0.05, num_knots=100, spline_degree=3, Returns ------- 
- baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -264,7 +176,7 @@ def irsqr(self, data, lam=100, quantile=0.05, num_knots=100, spline_degree=3, Raises ------ ValueError - Raised if quantile is not between 0 and 1. + Raised if `quantile` is not between 0 and 1. References ---------- @@ -273,31 +185,12 @@ def irsqr(self, data, lam=100, quantile=0.05, num_knots=100, spline_degree=3, Science and Control Engineering (ICISCE), 2018, 280-284. """ - if not 0 < quantile < 1: - raise ValueError('quantile must be between 0 and 1') - - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._irsqr( + data, lam=lam, quantile=quantile, num_knots=num_knots, spline_degree=spline_degree, + diff_order=diff_order, max_iter=max_iter, tol=tol, weights=weights, eps=eps ) - old_coef = np.zeros(self._spline_basis._num_bases) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve_pspline(y, weight_array) - calc_difference = relative_difference(old_coef, pspline.coef) - tol_history[i] = calc_difference - if calc_difference < tol: - break - old_coef = pspline.coef - weight_array = _weighting._quantile(y, baseline, quantile, eps) - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult(pspline, weight_array) - } - - return baseline, params - - @_Algorithm._register(sort_keys=('weights',)) def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -332,9 +225,9 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=100, spline_degree=3, di Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. 
- params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -370,30 +263,13 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=100, spline_degree=3, di Reviews: Computational Statistics, 2010, 2(6), 637-653. """ - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._asls( + data, lam=lam, p=p, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots ) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve_pspline(y, weight_array) - new_weights = _weighting._asls(y, baseline, p) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult(pspline, weight_array) - } - - return baseline, params - - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def pspline_iasls(self, data, lam=1e1, p=1e-2, lam_1=1e-4, num_knots=100, spline_degree=3, max_iter=50, tol=1e-3, weights=None, diff_order=2): """ @@ -506,7 +382,7 @@ def pspline_iasls(self, data, lam=1e1, p=1e-2, lam_1=1e-4, num_knots=100, tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = pspline.solve_pspline(y, weight_array, rhs_extra=partial_rhs) + baseline = pspline.solve(y, weight_array, rhs_extra=partial_rhs) new_weights = _weighting._iasls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -521,7 +397,6 @@ def pspline_iasls(self, data, lam=1e1, p=1e-2, lam_1=1e-4, num_knots=100, return baseline, params - 
@_Algorithm._register(sort_keys=('weights',)) def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None, normalize_weights=False): """ @@ -556,9 +431,9 @@ def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -585,33 +460,13 @@ def pspline_airpls(self, data, lam=1e3, num_knots=100, spline_degree=3, Reviews: Computational Statistics, 2010, 2(6), 637-653. """ - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._airpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots, + normalize_weights=normalize_weights ) - y_l1_norm = np.abs(y).sum() - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = pspline.solve_pspline(y, weight_array) - new_weights, residual_l1_norm, exit_early = _weighting._airpls( - y, baseline, i, normalize_weights - ) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = residual_l1_norm / y_l1_norm - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': PSplineResult(pspline, weight_array) - } - - return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def pspline_arpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -642,9 +497,9 @@ def pspline_arpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_orde Returns ------- - baseline : numpy.ndarray, shape 
(N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -671,30 +526,13 @@ def pspline_arpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_orde Reviews: Computational Statistics, 2010, 2(6), 637-653. """ - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._arpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots ) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve_pspline(y, weight_array) - new_weights, exit_early = _weighting._arpls(y, baseline) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult(pspline, weight_array) - } - - return baseline, params - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def pspline_drpls(self, data, lam=1e3, eta=0.5, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -794,7 +632,7 @@ def pspline_drpls(self, data, lam=1e3, eta=0.5, num_knots=100, spline_degree=3, shifted_bands, shifted_bands ) penalty = _add_diagonals(pspline.penalty, diff_n_w_diagonals, lower_only=False) - baseline = pspline.solve_pspline(y, weight_array, penalty=penalty) + baseline = pspline.solve(y, weight_array, penalty=penalty) new_weights, exit_early = _weighting._drpls(y, baseline, i) if exit_early: i -= 1 # reduce i so that output tol_history indexing is correct @@ -813,7 +651,6 @@ def pspline_drpls(self, 
data, lam=1e3, eta=0.5, num_knots=100, spline_degree=3, return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -844,9 +681,9 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -874,30 +711,13 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord Reviews: Computational Statistics, 2010, 2(6), 637-653. """ - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._iarpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots ) - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = pspline.solve_pspline(y, weight_array) - new_weights, exit_early = _weighting._iarpls(y, baseline, i) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': PSplineResult(pspline, weight_array) - } - return baseline, params - - @_Algorithm._register(sort_keys=('weights', 'alpha')) + @_Algorithm._handle_io(sort_keys=('weights', 'alpha')) def pspline_aspls(self, data, lam=1e4, num_knots=100, spline_degree=3, diff_order=2, max_iter=100, tol=1e-3, weights=None, alpha=None, asymmetric_coef=2., alternate_weighting=True): @@ -1017,7 +837,7 @@ def pspline_aspls(self, data, 
lam=1e4, num_knots=100, spline_degree=3, diff_orde pspline.penalty * np.interp(interp_pts, self.x, alpha_array), pspline.num_bands, pspline.num_bands ) - baseline = pspline.solve_pspline(y, weight_array, penalty=alpha_penalty) + baseline = pspline.solve(y, weight_array, penalty=alpha_penalty) new_weights, residual, exit_early = _weighting._aspls( y, baseline, asymmetric_coef, alternate_weighting ) @@ -1039,7 +859,6 @@ def pspline_aspls(self, data, lam=1e4, num_knots=100, spline_degree=3, diff_orde return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -1080,9 +899,9 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=100, spline_deg Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -1116,34 +935,12 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=100, spline_deg Reviews: Computational Statistics, 2010, 2(6), 637-653. 
""" - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._psalsa( + data, lam=lam, p=p, k=k, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots ) - if k is None: - k = np.std(y) / 10 - else: - k = _check_scalar_variable(k, variable_name='k') - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve_pspline(y, weight_array) - new_weights = _weighting._psalsa(y, baseline, p, k, self._shape) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult(pspline, weight_array) - } - - return baseline, params - - @_Algorithm._register(sort_keys=('weights',)) def pspline_derpsalsa(self, data, lam=1e2, p=1e-2, k=None, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None, smooth_half_window=None, num_smooths=16, pad_kwargs=None, **kwargs): @@ -1199,9 +996,9 @@ def pspline_derpsalsa(self, data, lam=1e2, p=1e-2, k=None, num_knots=100, spline Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -1235,56 +1032,15 @@ def pspline_derpsalsa(self, data, lam=1e2, p=1e-2, k=None, num_knots=100, spline Reviews: Computational Statistics, 2010, 2(6), 637-653. 
""" - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._derpsalsa( + data, lam=lam, p=p, k=k, num_knots=num_knots, spline_degree=spline_degree, + diff_order=diff_order, max_iter=max_iter, tol=tol, weights=weights, + smooth_half_window=smooth_half_window, num_smooths=num_smooths, + pad_kwargs=pad_kwargs, **kwargs ) - if k is None: - k = np.std(y) / 10 - else: - k = _check_scalar_variable(k, variable_name='k') - - if smooth_half_window is None: - smooth_half_window = self._size // 200 - # could pad the data every iteration, but it is ~2-3 times slower and only affects - # the edges, so it's not worth it - self._deprecate_pad_kwargs(**kwargs) - pad_kwargs = pad_kwargs if pad_kwargs is not None else {} - y_smooth = pad_edges(y, smooth_half_window, **pad_kwargs, **kwargs) - if smooth_half_window > 0: - smooth_kernel = _mollifier_kernel(smooth_half_window) - for _ in range(num_smooths): - y_smooth = padded_convolve(y_smooth, smooth_kernel) - y_smooth = y_smooth[smooth_half_window:self._size + smooth_half_window] - - diff_y_1 = np.gradient(y_smooth) - diff_y_2 = np.gradient(diff_y_1) - # x.dot(x) is same as (x**2).sum() but faster - rms_diff_1 = np.sqrt(diff_y_1.dot(diff_y_1) / self._size) - rms_diff_2 = np.sqrt(diff_y_2.dot(diff_y_2) / self._size) - - diff_1_weights = np.exp(-((diff_y_1 / rms_diff_1)**2) / 2) - diff_2_weights = np.exp(-((diff_y_2 / rms_diff_2)**2) / 2) - partial_weights = diff_1_weights * diff_2_weights - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve_pspline(y, weight_array) - new_weights = _weighting._derpsalsa(y, baseline, p, k, self._shape, partial_weights) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = 
new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult(pspline, weight_array) - } - return baseline, params - - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def pspline_mpls(self, data, half_window=None, lam=1e3, p=0.0, num_knots=100, spline_degree=3, diff_order=2, tol=None, max_iter=None, weights=None, window_kwargs=None, **kwargs): @@ -1417,7 +1173,7 @@ def pspline_mpls(self, data, half_window=None, lam=1e3, p=0.0, num_knots=100, sp _, weight_array, pspline = self._setup_spline( y, w, spline_degree, num_knots, True, diff_order, lam ) - baseline = pspline.solve_pspline(y, weight_array) + baseline = pspline.solve(y, weight_array) params = { 'weights': weight_array, 'half_window': half_wind, @@ -1425,7 +1181,6 @@ def pspline_mpls(self, data, half_window=None, lam=1e3, p=0.0, num_knots=100, sp } return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def pspline_brpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=50, tol_2=1e-3, weights=None): """ @@ -1450,7 +1205,7 @@ def pspline_brpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_orde The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - max_iter_2 : float, optional + max_iter_2 : int, optional The number of iterations for updating the proportion of data occupied by peaks. Default is 50. tol_2 : float, optional @@ -1462,9 +1217,9 @@ def pspline_brpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_orde Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. 
- params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -1496,55 +1251,13 @@ def pspline_brpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_orde Reviews: Computational Statistics, 2010, 2(6), 637-653. """ - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._brpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + max_iter_2=max_iter_2, tol_2=tol_2, weights=weights, + spline_degree=spline_degree, num_knots=num_knots ) - beta = 0.5 - j_max = 0 - baseline = y - baseline_weights = weight_array - tol_history = np.zeros((max_iter_2 + 2, max(max_iter, max_iter_2) + 1)) - # implementation note: weight_array must always be updated since otherwise when - # reentering the inner loop, new_baseline and baseline would be the same; instead, - # use baseline_weights to track which weights produced the output baseline - for i in range(max_iter_2 + 1): - for j in range(max_iter + 1): - new_baseline = pspline.solve_pspline(y, weight_array) - new_weights, exit_early = _weighting._brpls(y, new_baseline, beta) - if exit_early: - j -= 1 # reduce j so that output tol_history indexing is correct - tol_2 = np.inf # ensure it exits outer loop - break - # Paper used norm(old - new) / norm(new) rather than old in the denominator, - # but I use old in the denominator instead to be consistent with all other - # algorithms; does not make a major difference - calc_difference = relative_difference(baseline, new_baseline) - tol_history[i + 1, j] = calc_difference - if calc_difference < tol: - if i == 0 and j == 0: # for cases where tol == inf - baseline = new_baseline - break - baseline_weights = weight_array - weight_array = new_weights - baseline = new_baseline - j_max = max(j, j_max) - - weight_array = new_weights - weight_mean = weight_array.mean() - calc_difference_2 = abs(beta + weight_mean - 1) - 
tol_history[0, i] = calc_difference_2 - if calc_difference_2 < tol_2: - break - beta = 1 - weight_mean - params = { - 'weights': baseline_weights, 'tol_history': tol_history[:i + 2, :max(i, j_max) + 1], - 'result': PSplineResult(pspline, baseline_weights) - } - - return baseline, params - - @_Algorithm._register(sort_keys=('weights',)) def pspline_lsrpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None, alternate_weighting=False): """ @@ -1581,9 +1294,9 @@ def pspline_lsrpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -1621,28 +1334,12 @@ def pspline_lsrpls(self, data, lam=1e3, num_knots=100, spline_degree=3, diff_ord penalized least squares, Applied Optics, 2019, 58, 3913-3920. """ - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._lsrpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots, + alternate_weighting=alternate_weighting ) - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = pspline.solve_pspline(y, weight_array) - new_weights, exit_early = _weighting._lsrpls(y, baseline, i, alternate_weighting) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': PSplineResult(pspline, weight_array) - } - - return baseline, params _spline_wrapper = 
_class_wrapper(_Spline) diff --git a/pybaselines/two_d/_algorithm_setup.py b/pybaselines/two_d/_algorithm_setup.py index 19fbcda..4776587 100644 --- a/pybaselines/two_d/_algorithm_setup.py +++ b/pybaselines/two_d/_algorithm_setup.py @@ -12,6 +12,7 @@ import numpy as np +from ..results import PSplineResult2D, WhittakerResult2D from .._validation import ( _check_array, _check_half_window, _check_optional_array, _check_scalar_variable, _check_sized_array, _yxz_arrays @@ -252,8 +253,8 @@ def pentapy_solver(self, value): ) self.banded_solver = value - def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False, - reshape_baseline=False, reshape_keys=(), skip_sorting=False): + def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_dims=True, + reshape_keys=(), skip_sorting=False): """ Re-orders the input baseline and parameters based on the x ordering. @@ -270,13 +271,10 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False sort_keys : Iterable, optional An iterable of keys corresponding to the values in `params` that need re-ordering. Default is (). - ensure_2d : bool, optional + ensure_dims : bool, optional If True (default), will raise an error if the shape of `array` is not a two dimensional array with shape (M, N) or a three dimensional array with shape (M, N, 1), (M, 1, N), or (1, M, N). - reshape_baseline : bool, optional - If True, will reshape the output baseline back into the shape of the input data. If - False (default), will not modify the output baseline shape. reshape_keys : tuple, optional The keys within the output parameter dictionary that will need reshaped to match the shape of the data. For example, used to convert weights for polynomials from 1D back @@ -293,11 +291,12 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False The input `params` after re-ordering the values for `sort_keys`. 
""" - if reshape_baseline: - if ensure_2d: - baseline = baseline.reshape(self._shape) - else: - baseline = baseline.reshape(-1, *self._shape) + ndims = baseline.ndim + if ndims == 1 and ensure_dims: # raveled to 1D within the method + baseline = baseline.reshape(self._shape) + elif ndims == 2 and not ensure_dims: # 3D input raveled to (P, M * N) + baseline = baseline.reshape(-1, *self._shape) + for key in reshape_keys: if key in params: # TODO can any params be non-2d that need reshaped? @@ -316,10 +315,10 @@ def _return_results(self, baseline, params, dtype, sort_keys=(), ensure_2d=False return baseline, params @classmethod - def _register(cls, func=None, *, sort_keys=(), ensure_2d=True, reshape_baseline=False, - reshape_keys=(), skip_sorting=False, require_unique_xz=False): + def _handle_io(cls, func=None, *, sort_keys=(), ensure_dims=True, reshape_keys=(), + skip_sorting=False, require_unique=False): """ - Wraps a baseline function to validate inputs and correct outputs. + Wraps a baseline method to validate inputs and correct outputs. The input data is converted to a numpy array, validated to ensure the length is consistent, and ordered to match the input x ordering. The outputs are corrected @@ -328,17 +327,14 @@ def _register(cls, func=None, *, sort_keys=(), ensure_2d=True, reshape_baseline= Parameters ---------- func : Callable, optional - The function that is being decorated. Default is None, which returns a partial function. + The method that is being decorated. Default is None, which returns a partial function. sort_keys : tuple, optional The keys within the output parameter dictionary that will need sorting to match the - sort order of :attr:`.x`. Default is (). - ensure_2d : bool, optional + sort order of ``self.x`` and ``self.z``. Default is (). 
+ ensure_dims : bool, optional If True (default), will raise an error if the shape of `array` is not a two dimensional array with shape (M, N) or a three dimensional array with shape (M, N, 1), (M, 1, N), or (1, M, N). - reshape_baseline : bool, optional - If True, will reshape the output baseline back into the shape of the input data. If - False (default), will not modify the output baseline shape. reshape_keys : tuple, optional The keys within the output parameter dictionary that will need reshaped to match the shape of the data. For example, used to convert weights for polynomials from 1D back @@ -346,7 +342,7 @@ def _register(cls, func=None, *, sort_keys=(), ensure_2d=True, reshape_baseline= skip_sorting : bool, optional If True, will skip sorting the output baseline. The keys in `sort_keys` will still be sorted. Default is False. - require_unique_xz : bool, optional + require_unique : bool, optional If True, will check ``self.x`` and ``self.z`` to ensure all values are unique and will raise an error if non-unique values are present. Default is False, which skips the check. 
@@ -361,9 +357,9 @@ def _register(cls, func=None, *, sort_keys=(), ensure_2d=True, reshape_baseline= """ if func is None: return partial( - cls._register, sort_keys=sort_keys, ensure_2d=ensure_2d, - reshape_baseline=reshape_baseline, reshape_keys=reshape_keys, - skip_sorting=skip_sorting, require_unique_xz=require_unique_xz + cls._handle_io, sort_keys=sort_keys, ensure_dims=ensure_dims, + reshape_keys=reshape_keys, skip_sorting=skip_sorting, + require_unique=require_unique ) @wraps(func) @@ -387,17 +383,17 @@ def inner(self, data=None, *args, **kwargs): axis = -1 y = _check_sized_array( data, expected_shape, check_finite=self._check_finite, ensure_1d=False, - axis=axis, name='data', ensure_2d=ensure_2d, two_d=True + axis=axis, name='data', ensure_2d=ensure_dims, two_d=True ) else: y, self.x, self.z = _yxz_arrays( - data, self.x, self.z, check_finite=self._check_finite, ensure_2d=ensure_2d + data, self.x, self.z, check_finite=self._check_finite, ensure_2d=ensure_dims ) if not has_x: self._shape = (y.shape[-2], self._shape[1]) self.x = np.linspace(-1, 1, self._shape[0]) - elif require_unique_xz and not self._validated_x: + elif require_unique and not self._validated_x: if np.any(self.x[1:] == self.x[:-1]): raise ValueError('x-values must be unique for the selected method') else: @@ -405,7 +401,7 @@ def inner(self, data=None, *args, **kwargs): if not has_z: self._shape = (self._shape[0], y.shape[-1]) self.z = np.linspace(-1, 1, self._shape[1]) - elif require_unique_xz and not self._validated_z: + elif require_unique and not self._validated_z: if np.any(self.z[1:] == self.z[:-1]): raise ValueError('z-values must be unique for the selected method') else: @@ -424,9 +420,8 @@ def inner(self, data=None, *args, **kwargs): baseline, params = func(self, y, *args, **kwargs) return self._return_results( - baseline, params, dtype=output_dtype, sort_keys=sort_keys, ensure_2d=ensure_2d, - reshape_baseline=reshape_baseline, reshape_keys=reshape_keys, - skip_sorting=skip_sorting + 
baseline, params, dtype=output_dtype, sort_keys=sort_keys, ensure_dims=ensure_dims, + reshape_keys=reshape_keys, skip_sorting=skip_sorting ) return inner @@ -466,7 +461,7 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa ---------- y : numpy.ndarray, shape (M ,N) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm2D._register`. + array by :meth:`~._Algorithm2D._handle_io`. lam : float or Sequence[float, float], optional The smoothing parameter, lambda. Typical values are between 10 and 1e8, but it strongly depends on the penalized least square method @@ -519,8 +514,8 @@ def _setup_whittaker(self, y, lam=1, diff_order=2, weights=None, copy_weights=Fa self._shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False, axis=slice(None), dtype=float ) - if self._sort_order is not None and weights is not None: - weight_array = weight_array[self._sort_order] + if weights is not None: + weight_array = _sort_array2d(weight_array, self._sort_order) # TODO can probably keep the basis for reuse if using SVD, like _setup_spline does, and # retain the unmodified penalties for the rows and columns if possible to skip that @@ -543,7 +538,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=True, ---------- y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm2D._register`. + array by :meth:`~._Algorithm2D._handle_io`. weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with shape equal to (M, N) and all values set to 1. 
@@ -595,8 +590,8 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=True, self._shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False, axis=slice(None), dtype=float ) - if self._sort_order is not None and weights is not None: - weight_array = weight_array[self._sort_order] + if weights is not None: + weight_array = _sort_array2d(weight_array, self._sort_order) weight_array = weight_array.ravel() if calc_vander: @@ -625,8 +620,7 @@ def _setup_polynomial(self, y, weights=None, poly_order=2, calc_vander=True, return y, weight_array, pseudo_inverse def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, - penalized=True, diff_order=3, lam=1, make_basis=True, allow_lower=True, - reverse_diags=False, copy_weights=False): + penalized=True, diff_order=3, lam=1, make_basis=True, copy_weights=False): """ Sets the starting parameters for doing spline fitting. @@ -634,7 +628,7 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, ---------- y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm2D._register`. + array by :meth:`~._Algorithm2D._handle_io`. weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with shape equal to (M, N) and all values set to 1. @@ -654,12 +648,6 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, Default is 1. make_basis : bool, optional If True (default), will create the matrix containing the spline basis functions. - allow_lower : boolean, optional - If True (default), will include only the lower non-zero diagonals of - the squared difference matrix. If False, will include all non-zero diagonals. - reverse_diags : boolean, optional - If True, will reverse the order of the diagonals of the penalty matrix. - Default is False. copy_weights : boolean, optional If True, will copy the array of input weights. 
Only needed if the algorithm changes the weights in-place. Default is False. @@ -689,8 +677,8 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, self._shape, weights, copy_input=copy_weights, check_finite=self._check_finite, ensure_1d=False, axis=slice(None), dtype=float ) - if self._sort_order is not None and weights is not None: - weight_array = weight_array[self._sort_order] + if weights is not None: + weight_array = _sort_array2d(weight_array, self._sort_order) diff_order = _check_scalar_variable( diff_order, allow_zero=False, variable_name='difference order', two_d=True, dtype=int ) @@ -716,6 +704,80 @@ def _setup_spline(self, y, weights=None, spline_degree=3, num_knots=10, return y, weight_array, pspline + def _setup_pls(self, y, weights=None, spline_degree=None, num_knots=10, + diff_order=2, lam=1, allow_lower=True, reverse_diags=False, + copy_weights=False, num_eigens=None): + """ + Sets the starting parameters for methods using penalized least squares. + + Depending on the input of `spline_degree`, will dispatch to either + `_setup_whittaker` or `_setup_spline`. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) + The y-values of the measured data, already converted to a numpy + array by :meth:`~._Algorithm._handle_io`. + weights : array-like, shape (N,), optional + The weighting array. If None (default), then will be an array with + size equal to N and all values set to 1. + spline_degree : int or None, optional + If None (default), denotes that the system is using Whittaker smoothing. + Otherwise, the system is a penalized spline with a spline degree of `spline_degree`. + num_knots : int, optional + The number of interior knots for the splines. Only used if `spline_degree` is + not None. Default is 10. + diff_order : int, optional + The integer differential order for the penalty; must be greater than 0. + Default is 2. + lam : float, optional + The smoothing parameter, lambda. 
Typical values are between 10 and
+            1e8, but it strongly depends on `diff_order` and the data size.
+            Default is 1.
+        allow_lower : boolean, optional
+            Not used within this method, simply added to have the same call signature
+            as `_Algorithm._setup_pls`.
+        reverse_diags : boolean, optional
+            Not used within this method, simply added to have the same call signature
+            as `_Algorithm._setup_pls`.
+        copy_weights : boolean, optional
+            If True, will copy the array of input weights. Only needed if the
+            algorithm changes the weights in-place. Default is False.
+        num_eigens : int or Sequence[int, int] or None
+            Only used if `spline_degree` is None. The number of eigenvalues for the rows
+            and columns, respectively, to use for eigendecomposition. If None, will
+            solve the linear system using the full analytical solution, which is typically
+            much slower. Default is None.
+
+        Returns
+        -------
+        y : numpy.ndarray, shape (N,)
+            The y-values of the measured data, converted to a numpy array.
+        weight_array : numpy.ndarray, shape (N,)
+            The weight array for fitting the spline to the data.
+        penalized_system : WhittakerSystem2D or PSpline2D
+            The object for solving the penalized least squares system. If `spline_degree`
+            is None, returns a WhittakerSystem2D object; otherwise, returns a PSpline2D.
+        result_class : WhittakerResult2D or PSplineResult2D
+            The result class for defining the solution. If `spline_degree`
+ + """ + if spline_degree is None: + y, weight_array, penalized_system = self._setup_whittaker( + y, lam=lam, diff_order=diff_order, weights=weights, copy_weights=copy_weights, + num_eigens=num_eigens + ) + result_class = WhittakerResult2D + else: + y, weight_array, penalized_system = self._setup_spline( + y, lam=lam, diff_order=diff_order, weights=weights, copy_weights=copy_weights, + spline_degree=spline_degree, num_knots=num_knots, penalized=True, make_basis=True + ) + result_class = PSplineResult2D + + return y, weight_array, penalized_system, result_class + def _setup_morphology(self, y, half_window=None, window_kwargs=None, **kwargs): """ Sets the starting parameters for morphology-based methods. @@ -724,7 +786,7 @@ def _setup_morphology(self, y, half_window=None, window_kwargs=None, **kwargs): ---------- y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm2D._register`. + array by :meth:`~._Algorithm2D._handle_io`. half_window : int or Sequence[int, int], optional The half-window used for the morphology functions. If a value is input, then that value will be used. Default is None, which will optimize the @@ -776,7 +838,7 @@ def _setup_smooth(self, y, half_window=None, window_multiplier=1, pad_kwargs=Non ---------- y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm2D._register`. + array by :meth:`~._Algorithm2D._handle_io`. half_window : int or Sequence[int, int], optional The half-window used for the smoothing functions. Used to pad the left and right edges of the data to reduce edge @@ -826,7 +888,7 @@ def _setup_classification(self, y, weights=None): ---------- y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm2D._register`. + array by :meth:`~._Algorithm2D._handle_io`. 
weights : array-like, shape (M, N), optional The weighting array. If None (default), then will be an array with shape equal to (M, N) and all values set to 1. @@ -843,8 +905,8 @@ def _setup_classification(self, y, weights=None): self._shape, weights, check_finite=self._check_finite, dtype=bool, ensure_1d=False, axis=slice(None) ) - if self._sort_order is not None and weights is not None: - weight_array = weight_array[self._sort_order] + if weights is not None: + weight_array = _sort_array2d(weight_array, self._sort_order) weight_array = weight_array return y, weight_array @@ -934,7 +996,7 @@ def _setup_optimizer(self, y, method, modules, method_kwargs=None, copy_kwargs=T ---------- y : numpy.ndarray The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm2D._register`. + array by :meth:`~._Algorithm2D._handle_io`. method : str The string name of the desired function, like 'asls'. Case does not matter. modules : Sequence[module, ...] @@ -984,7 +1046,7 @@ def _setup_misc(self, y): ---------- y : numpy.ndarray, shape (M, N) The y-values of the measured data, already converted to a numpy - array by :meth:`~._Algorithm2D._register`. + array by :meth:`~._Algorithm2D._handle_io`. Returns ------- diff --git a/pybaselines/two_d/_spline_utils.py b/pybaselines/two_d/_spline_utils.py index 86097bc..c082067 100644 --- a/pybaselines/two_d/_spline_utils.py +++ b/pybaselines/two_d/_spline_utils.py @@ -8,7 +8,6 @@ import numpy as np from scipy.sparse import kron -from scipy.sparse.linalg import spsolve from .._compat import csr_object from .._spline_utils import _spline_basis, _spline_knots @@ -146,6 +145,11 @@ def basis(self): def _make_btwb(self, weights): """Computes ``Basis.T @ Weights @ Basis`` as a generalized linear array model. + Parameters + ---------- + weights : numpy.ndarray, shape (M, N) + The weights for each y-value. 
+ Returns ------- F : scipy.sparse.csr_matrix or scipy.sparse.csr_array @@ -263,6 +267,39 @@ def __init__(self, spline_basis, lam=1, diff_order=2): 'functions, which is the number of knots + spline degree - 1' )) + @property + def shape(self): + """ + The shape of the data being fit by the penalized system. + + Returns + ------- + tuple[int, int] + The shape of the data that the system corresponds to. + + """ + return (len(self.basis.x), len(self.basis.z)) + + def _make_btwb(self, weights): + """Computes ``Basis.T @ Weights @ Basis`` as a generalized linear array model. + + Parameters + ---------- + weights : numpy.ndarray, shape (M, N) + The weights for each y-value. + + Returns + ------- + scipy.sparse.csr_matrix or scipy.sparse.csr_array + The computed result of ``B.T @ W @ B``. + + Notes + ----- + This is just a shim to connect 1D and 2D PSpline method calls. + + """ + return self.basis._make_btwb(weights) + def reset_penalty(self, lam=1, diff_order=2): """ Resets the penalty of the system and all of the attributes. 
@@ -328,7 +365,7 @@ def solve(self, y, weights, penalty=None, rhs_extra=None): if rhs_extra is not None: rhs = rhs + rhs_extra - self.coef = spsolve(self.basis._make_btwb(weights) + penalty, rhs) + self.coef = self.direct_solve(self.basis._make_btwb(weights) + penalty, rhs) output = ( self.basis.basis_r @ self.coef.reshape(self.basis._num_bases) @ self.basis.basis_c.T ) diff --git a/pybaselines/two_d/_whittaker_utils.py b/pybaselines/two_d/_whittaker_utils.py index 64b3baa..b10d64d 100644 --- a/pybaselines/two_d/_whittaker_utils.py +++ b/pybaselines/two_d/_whittaker_utils.py @@ -9,9 +9,11 @@ import warnings import numpy as np -from scipy.linalg import eig_banded, eigh_tridiagonal, solve -from scipy.sparse import kron -from scipy.sparse.linalg import spsolve +from scipy.linalg import ( + cholesky, cho_solve, eig_banded, eigh_tridiagonal, lu_factor, lu_solve, solve +) +from scipy.sparse import issparse, kron +from scipy.sparse.linalg import factorized, spsolve from .._banded_utils import diff_penalty_diagonals, diff_penalty_matrix from .._compat import identity @@ -97,6 +99,32 @@ def __init__(self, data_size, lam=1, diff_order=2): self._num_bases = data_size self.reset_diagonals(lam, diff_order) + @property + def tot_bases(self): + """ + The total number of basis functions for the system. + + Returns + ------- + int + The total number of basis functions for the system. + + """ + return np.prod(self._num_bases) + + @property + def shape(self): + """ + The shape of the data being fit by the penalized system. + + Returns + ------- + tuple[int, int] + The shape of the data that the system corresponds to. + + """ + return self._num_bases + def add_penalty(self, penalty): """ Updates `self.penalty` with an additional penalty and updates the bands. 
@@ -197,7 +225,8 @@ def solve(self, y, weights, penalty=None, rhs_extra=None): return self.direct_solve(lhs, rhs) - def direct_solve(self, lhs, rhs): + def direct_solve(self, lhs, rhs, overwrite_ab=False, overwrite_b=False, assume_a='pos', + check_finite=False): """ Solves the linear system ``lhs @ x = rhs``. @@ -207,14 +236,30 @@ def direct_solve(self, lhs, rhs): The left hand side of the equation. rhs : numpy.ndarray or scipy.sparse.spmatrix or scipy.sparse.sparray The right hand side of the equation. + overwrite_ab : bool, optional + Whether to overwrite `lhs` when using any of the solvers. Default is False. + Written as 'overwrite_ab' rather than 'overwrite_a' for compatible usage + with pybaselines._banded_utils.PenalizedSystem. + overwrite_b : bool, optional + Whether to overwrite `rhs` when using any of the solvers. Default is False. + check_finite : bool, optional + Whether to check if the inputs are finite. Default is False. Returns ------- - scipy.sparse.spmatrix or scipy.sparse.sparray + output : numpy.ndarray, shape (N,) or (M, N) The solution to the linear system, with the same shape as `rhs`. """ - return spsolve(lhs, rhs) + if issparse(lhs): + output = spsolve(lhs, rhs) + else: + # set lower=True since it's consistently used elsewhere + output = solve( + lhs, rhs, lower=True, assume_a=assume_a, overwrite_a=overwrite_ab, + overwrite_b=overwrite_b, check_finite=check_finite + ) + return output def add_diagonal(self, value): """ @@ -238,6 +283,93 @@ def reset_diagonal(self): """Sets the main diagonal of the penalty matrix back to its original value.""" self.penalty.setdiag(self.main_diagonal) + def factorize(self, lhs, assume_a='pos', overwrite_ab=False, check_finite=False): + """ + Calculates the factorization of ``A`` for the linear equation ``A x = b``. + + Parameters + ---------- + lhs : array-like, shape (M, N) + The left-hand side of the equation, in banded format. 
`lhs` is assumed to be
+            some slight modification of `self.penalty` in the same format (reversed, lower,
+            number of bands, etc. are all the same).
+        assume_a : str, optional
+            Only used if the system is using eigendecomposition. Default is 'pos', which
+            will factorize `lhs` using :func:`scipy.linalg.cholesky`. Any other value
+            will use :func:`scipy.linalg.lu_factor`.
+        overwrite_ab : bool, optional
+            Whether to overwrite `lhs` during factorization. Default is False.
+            Written as 'overwrite_ab' rather than 'overwrite_a' for compatible usage
+            with pybaselines._banded_utils.PenalizedSystem.
+        check_finite : bool, optional
+            Whether to check if the inputs are finite. Default is False.
+
+        Returns
+        -------
+        factorization : Callable or tuple[np.ndarray, bool] or tuple[np.ndarray, np.ndarray]
+            The factorization of `lhs`.
+
+        """
+        if issparse(lhs):
+            factorization = factorized(lhs.tocsc())
+        else:
+            # TODO assume_a should probably just be an attribute of the object; for now,
+            # WhittakerSystem2D is always positive definite when using eigendecomposition, so
+            # there's no real reason to support other configurations here
+            if assume_a == 'pos':
+                # note: cholesky is a little slower than scipy.linalg.cho_factor since it fills
+                # in zeros, but cho_factor may be deprecated at some future point (see
+                # https://github.com/scipy/scipy/pull/24759), so just use cholesky; still add
+                # the lower bool that cho_factor would output for use with cho_solve
+                factorization = (
+                    cholesky(lhs, lower=True, overwrite_a=overwrite_ab, check_finite=check_finite),
+                    True
+                )
+            else:
+                factorization = lu_factor(lhs, overwrite_a=overwrite_ab, check_finite=check_finite)
+
+        return factorization
+
+    def factorized_solve(self, factorization, rhs, assume_a='pos', overwrite_b=False,
+                         check_finite=False):
+        """
+        Solves ``A x = b`` given the factorization of ``A``.
+ + Parameters + ---------- + factorization : Callable or tuple[np.ndarray, bool] or tuple[np.ndarray, np.ndarray] + The factorization of ``A``, output by :meth:`~.PenalizedSystem2D.factorize`. + rhs : array-like, shape (N,) or (N, M) + The right-hand side of the equation. + assume_a : str, optional + Only used if the system is using eigendecomposition. Default is 'pos', which + will solve using :func:`scipy.linalg.cho_solve`. Any other value + will use :func:`scipy.linalg.lu_solve`. + overwrite_b : bool, optional + Whether to overwrite `rhs` when using any of the solvers. Default is False. + check_finite : bool, optional + Whether to check if the inputs are finite. Default is False. + + Returns + ------- + output : numpy.ndarray, shape (N,) or (N, M) + The solution to the linear system, `x`. + + """ + if callable(factorization): + output = factorization(rhs) + else: + if assume_a == 'pos': + output = cho_solve( + factorization, rhs, overwrite_b=overwrite_b, check_finite=check_finite + ) + else: + output = lu_solve( + factorization, rhs, overwrite_b=overwrite_b, check_finite=check_finite + ) + + return output + class WhittakerSystem2D(PenalizedSystem2D): """ @@ -311,6 +443,19 @@ def __init__(self, data_size, lam=1, diff_order=2, num_eigens=None): self._using_svd = True self.reset_diagonals(lam, diff_order) + @property + def shape(self): + """ + The shape of the data being fit by the penalized system. + + Returns + ------- + tuple[int, int] + The shape of the data that the system corresponds to. + + """ + return self._num_points + def reset_diagonals(self, lam=1, diff_order=2): """ Resets the diagonals of the system and all of the attributes. 
@@ -338,8 +483,6 @@ def reset_diagonals(self, lam=1, diff_order=2): ) self.lam = _check_lam(lam, two_d=True) - # initially need num_bases to point to the data shape; maybe set a second - # attribute instead values_rows, vectors_rows = self._calc_eigenvalues( self._num_points[0], self.diff_order[0], self._num_bases[0] ) @@ -577,12 +720,15 @@ def solve(self, y, weights, penalty=None, rhs_extra=None, assume_a='pos'): The y-values for fitting the spline. weights : numpy.ndarray, shape (M, N) The weights for each y-value. - penalty : numpy.ndarray or scipy.sparse.spmatrix or scipy.sparse.sparray + penalty : numpy.ndarray or scipy.sparse.spmatrix or scipy.sparse.sparray, optional The finite difference penalty matrix with shape (``M * N``, ``M * N``). Default is None, which will use the object's penalty. rhs_extra : float or numpy.ndarray, shape (``M * N``,), optional If supplied, `rhs_extra` will be added to the right hand side (``B.T @ W @ y``) of the equation before solving. Default is None, which adds nothing. + assume_a : str, optional + The solver to pass to :func:`scipy.linalg.solve`. Default is 'pos' since the methods + using eigendecomposition have positive definite left-hand-sides of the equation. 
Returns ------- @@ -614,10 +760,6 @@ def solve(self, y, weights, penalty=None, rhs_extra=None, assume_a='pos'): penalty = self.penalty lhs = self._make_btwb(weights) - # TODO could use cho_factor and save the factorization to call within _calc_dof to make - # the call save time since it would only be used after the weights are finalized -> would - # only be valid if assume_a is 'pos', which all current methods are but in the future that - # may not be guaranteed; better to be explicit and keep it as two separate steps np.fill_diagonal(lhs, lhs.diagonal() + penalty) self.coef = solve( lhs, rhs, lower=True, overwrite_a=True, overwrite_b=True, check_finite=False, diff --git a/pybaselines/two_d/morphological.py b/pybaselines/two_d/morphological.py index 9186c80..4e38a0d 100644 --- a/pybaselines/two_d/morphological.py +++ b/pybaselines/two_d/morphological.py @@ -6,125 +6,17 @@ """ -import numpy as np -from scipy.ndimage import grey_dilation, grey_erosion, grey_opening, uniform_filter +from scipy.ndimage import grey_opening, uniform_filter +from .._nd.morphological import _MorphologicalNDMixin from .._validation import _check_half_window -from ..utils import relative_difference from ._algorithm_setup import _Algorithm2D -class _Morphological(_Algorithm2D): +class _Morphological(_Algorithm2D, _MorphologicalNDMixin): """A base class for all morphological algorithms.""" - @_Algorithm2D._register - def mor(self, data, half_window=None, window_kwargs=None, **kwargs): - """ - A Morphological based (Mor) baseline algorithm. - - Parameters - ---------- - data : array-like, shape (M, N) - The y-values of the measured data. - half_window : int or Sequence[int, int], optional - The half-window used for the rows and columns, respectively, for the morphology - functions. If a single value is given, rows and columns will use the same value. - Default is None, which will optimize the half-window size using - :func:`.estimate_window` and `window_kwargs`. 
- window_kwargs : dict, optional - A dictionary of keyword arguments to pass to :func:`.estimate_window` for - estimating the half window if `half_window` is None. Default is None. - **kwargs - - .. deprecated:: 1.2.0 - Passing additional keyword arguments is deprecated and will be removed in version - 1.4.0. Pass keyword arguments using `window_kwargs`. - - Returns - ------- - baseline : numpy.ndarray, shape (M, N) - The calculated baseline. - dict - A dictionary with the following items: - - * 'half_window': np.ndarray[int, int] - The half windows used for the morphological calculations. - - References - ---------- - Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for - Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. - - """ - y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) - opening = grey_opening(y, 2 * half_wind + 1) - baseline = np.minimum(opening, _avg_opening(y, half_wind, opening)) - - return baseline, {'half_window': half_wind} - - @_Algorithm2D._register - def imor(self, data, half_window=None, tol=1e-3, max_iter=200, window_kwargs=None, **kwargs): - """ - An Improved Morphological based (IMor) baseline algorithm. - - Parameters - ---------- - data : array-like, shape (M, N) - The y-values of the measured data. - half_window : int or Sequence[int, int], optional - The half-window used for the rows and columns, respectively, for the morphology - functions. If a single value is given, rows and columns will use the same value. - Default is None, which will optimize the half-window size using - :func:`.estimate_window` and `window_kwargs`. - tol : float, optional - The exit criteria. Default is 1e-3. - max_iter : int, optional - The maximum number of iterations. Default is 200. - window_kwargs : dict, optional - A dictionary of keyword arguments to pass to :func:`.estimate_window` for - estimating the half window if `half_window` is None. Default is None. - **kwargs - - .. 
deprecated:: 1.2.0 - Passing additional keyword arguments is deprecated and will be removed in version - 1.4.0. Pass keyword arguments using `window_kwargs`. - - Returns - ------- - baseline : numpy.ndarray, shape (M, N) - The calculated baseline. - dict - A dictionary with the following items: - - * 'half_window': np.ndarray[int, int] - The half windows used for the morphological calculations. - * 'tol_history': numpy.ndarray - An array containing the calculated tolerance values for - each iteration. The length of the array is the number of iterations - completed. If the last value in the array is greater than the input - `tol` value, then the function did not converge. - - References - ---------- - Dai, L., et al. An Automated Baseline Correction Method Based on Iterative - Morphological Operations. Applied Spectroscopy, 2018, 72(5), 731-739. - - """ - y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) - baseline = y - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline_new = np.minimum(y, _avg_opening(baseline, half_wind)) - calc_difference = relative_difference(baseline, baseline_new) - tol_history[i] = calc_difference - if calc_difference < tol: - break - baseline = baseline_new - - params = {'half_window': half_wind, 'tol_history': tol_history[:i + 1]} - return baseline, params - - @_Algorithm2D._register + @_Algorithm2D._handle_io def rolling_ball(self, data, half_window=None, smooth_half_window=None, pad_kwargs=None, window_kwargs=None, **kwargs): """ @@ -191,88 +83,3 @@ def rolling_ball(self, data, half_window=None, smooth_half_window=None, ) return baseline, {'half_window': half_wind} - - @_Algorithm2D._register - def tophat(self, data, half_window=None, window_kwargs=None, **kwargs): - """ - Estimates the baseline using a top-hat transformation (morphological opening). - - Parameters - ---------- - data : array-like, shape (M, N) - The y-values of the measured data. 
- half_window : int or Sequence[int, int], optional - The half-window used for the rows and columns, respectively, for the morphology - functions. If a single value is given, rows and columns will use the same value. - Default is None, which will optimize the half-window size using - :func:`.estimate_window` and `window_kwargs`. - window_kwargs : dict, optional - A dictionary of keyword arguments to pass to :func:`.estimate_window` for - estimating the half window if `half_window` is None. Default is None. - **kwargs - - .. deprecated:: 1.2.0 - Passing additional keyword arguments is deprecated and will be removed in version - 1.4.0. Pass keyword arguments using `window_kwargs`. - - Returns - ------- - baseline : numpy.ndarray, shape (M, N) - The calculated baseline. - dict - A dictionary with the following items: - - * 'half_window': np.ndarray[int, int] - The half windows used for the morphological calculations. - - Notes - ----- - The actual top-hat transformation is defined as `data - opening(data)`, where - `opening` is the morphological opening operation. This function, however, returns - `opening(data)`, since that is technically the baseline defined by the operation. - - References - ---------- - Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for - Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64, 595-600. - - """ - y, half_wind = self._setup_morphology(data, half_window, window_kwargs, **kwargs) - baseline = grey_opening(y, 2 * half_wind + 1) - - return baseline, {'half_window': half_wind} - - -def _avg_opening(y, half_window, opening=None): - """ - Averages the dilation and erosion of a morphological opening on data. - - Parameters - ---------- - y : numpy.ndarray, shape (M, N) - The array of the measured data. - half_window : numpy.ndarray([int, int]), optional - The half window size for the rows and columns, respectively, to use for the operations. 
- opening : numpy.ndarray, optional - The output of scipy.ndimage.grey_opening(y, window_size). Default is - None, which will compute the value. - - Returns - ------- - numpy.ndarray, shape (M, N) - The average of the dilation and erosion of the opening. - - References - ---------- - Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for - Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64 595-600. - - """ - # TODO should find a way to merge this with its 1D counterpart - window_size = 2 * half_window + 1 - if opening is None: - opening = grey_opening(y, window_size) - return 0.5 * ( - grey_dilation(opening, window_size) - + grey_erosion(opening, window_size) - ) diff --git a/pybaselines/two_d/optimizers.py b/pybaselines/two_d/optimizers.py index be14685..f88ab31 100644 --- a/pybaselines/two_d/optimizers.py +++ b/pybaselines/two_d/optimizers.py @@ -26,7 +26,7 @@ class _Optimizers(_Algorithm2D): """A base class for all optimizer algorithms.""" - @_Algorithm2D._register(ensure_2d=False, skip_sorting=True) + @_Algorithm2D._handle_io(ensure_dims=False, skip_sorting=True) def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=None): """ Collaborative Penalized Least Squares (collab-PLS). 
@@ -140,7 +140,7 @@ def collab_pls(self, data, average_dataset=True, method='asls', method_kwargs=No return baselines, params - @_Algorithm2D._register(skip_sorting=True) + @_Algorithm2D._handle_io(skip_sorting=True) def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, constrained_fraction=0.01, constrained_weight=1e5, estimation_poly_order=2, method_kwargs=None): @@ -274,7 +274,7 @@ def adaptive_minmax(self, data, poly_order=None, method='modpoly', weights=None, return np.maximum.reduce(baselines), params - @_Algorithm2D._register(skip_sorting=True) + @_Algorithm2D._handle_io(skip_sorting=True) def individual_axes(self, data, axes=(0, 1), method='asls', method_kwargs=None): """ Applies a one dimensional baseline correction method along each row and/or column. diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index 406bd7d..4f3b992 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -4,55 +4,21 @@ Created on April 16, 2023 @author: Donald Erb - -The function penalized_poly was adapted from MATLAB code from -https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction -(accessed March 18, 2021), which was licensed under the BSD-2-clause below. - -License: 2-clause BSD - -Copyright (c) 2012, Vincent Mazet -All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. 
- * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in - the documentation and/or other materials provided with the distribution - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -POSSIBILITY OF SUCH DAMAGE. - """ import warnings import numpy as np -from .. import _weighting -from ..utils import _MIN_FLOAT, _convert_coef2d, relative_difference +from .._nd.polynomial import _PolynomialNDMixin +from ..utils import _convert_coef2d from ._algorithm_setup import _Algorithm2D -class _Polynomial(_Algorithm2D): +class _Polynomial(_Algorithm2D, _PolynomialNDMixin): """A base class for all polynomial algorithms.""" - @_Algorithm2D._register( - sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) - ) + @_Algorithm2D._handle_io(sort_keys=('weights',), reshape_keys=('weights',)) def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=None): """ Computes a polynomial fit to the data. 
@@ -120,691 +86,3 @@ def poly(self, data, poly_order=2, weights=None, return_coef=False, max_cross=No ) return baseline, params - - @_Algorithm2D._register( - sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) - ) - def modpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, - use_original=False, mask_initial_peaks=False, return_coef=False, max_cross=None): - """ - The modified polynomial (ModPoly) baseline algorithm. - - Parameters - ---------- - data : array-like, shape (M, N) - The y-values of the measured data. - poly_order : int or Sequence[int, int], optional - The polynomial orders for the rows and columns. If a single value is given, will use - that for both rows and columns. Default is 2. - tol : float, optional - The exit criteria. Default is 1e-3. - max_iter : int, optional - The maximum number of iterations. Default is 250. - weights : array-like, shape (M, N), optional - The weighting array. If None (default), then will be an array with - shape equal to (M, N) and all values set to 1. - use_original : bool, optional - If False (default), will compare the baseline of each iteration with - the y-values of that iteration [1]_ when choosing minimum values. If True, - will compare the baseline with the original y-values given by `data` [2]_. - mask_initial_peaks : bool, optional - If True, will mask any data where the initial baseline fit + the standard - deviation of the residual is less than measured data [3]_. Default is False. - return_coef : bool, optional - If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the `x_data` and `z_data` values and return them in the params - dictionary. Default is False, since the conversion takes time. - max_cross : int, optional - The maximum degree for the cross terms. For example, if `max_cross` is 1, then - ``x * z**2``, ``x**2 * z``, and ``x**2 * z**2`` would all be set to 0. Default is - None, which does not limit the cross terms. 
- - Returns - ------- - baseline : numpy.ndarray, shape (M, N) - The calculated baseline. - params : dict - A dictionary with the following items: - - * 'weights': numpy.ndarray, shape (M, N) - The weight array used for fitting the data. - * 'tol_history': numpy.ndarray - An array containing the calculated tolerance values for - each iteration. The length of the array is the number of iterations - completed. If the last value in the array is greater than the input - `tol` value, then the function did not converge. - * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) - Only if `return_coef` is True. The array of polynomial parameters - for the baseline, in increasing order. Can be used to create a - polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. - - Notes - ----- - Algorithm originally developed in [2]_ and then slightly modified in [1]_. - - References - ---------- - .. [1] Gan, F., et al. Baseline correction by improved iterative polynomial - fitting with automatic threshold. Chemometrics and Intelligent - Laboratory Systems, 2006, 82, 59-65. - .. [2] Lieber, C., et al. Automated method for subtraction of fluorescence - from biological raman spectra. Applied Spectroscopy, 2003, 57(11), - 1363-1367. - .. [3] Zhao, J., et al. Automated Autofluorescence Background Subtraction - Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy, - 2007, 61(11), 1225-1232. 
- - """ - y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True, copy_weights=True, - max_cross=max_cross - ) - sqrt_w = np.sqrt(weight_array) - if use_original: - y0 = y - - coef = pseudo_inverse @ (sqrt_w * y) - baseline = self._polynomial.vandermonde @ coef - if mask_initial_peaks: - # use baseline + deviation since without deviation, half of y should be above baseline - weight_array[baseline + np.std(y - baseline) < y] = 0 - sqrt_w = np.sqrt(weight_array) - pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self._polynomial.vandermonde) - - tol_history = np.empty(max_iter) - for i in range(max_iter): - baseline_old = baseline - y = np.minimum(y0 if use_original else y, baseline) - coef = pseudo_inverse @ (sqrt_w * y) - baseline = self._polynomial.vandermonde @ coef - calc_difference = relative_difference(baseline_old, baseline) - tol_history[i] = calc_difference - if calc_difference < tol: - break - - params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} - if return_coef: - params['coef'] = _convert_coef2d( - coef, *self._polynomial.poly_order, self.x_domain, self.z_domain - ) - - return baseline, params - - @_Algorithm2D._register( - sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) - ) - def imodpoly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, - use_original=False, mask_initial_peaks=True, return_coef=False, - num_std=1., max_cross=None): - """ - The improved modified polynomial (IModPoly) baseline algorithm. - - Parameters - ---------- - data : array-like, shape (M, N) - The y-values of the measured data. - poly_order : int or Sequence[int, int], optional - The polynomial orders for the rows and columns. If a single value is given, will use - that for both rows and columns. Default is 2. - tol : float, optional - The exit criteria. Default is 1e-3. - max_iter : int, optional - The maximum number of iterations. Default is 250. 
- weights : array-like, shape (M, N), optional - The weighting array. If None (default), then will be an array with - shape equal to (M, N) and all values set to 1. - use_original : bool, optional - If False (default), will compare the baseline of each iteration with - the y-values of that iteration [1]_ when choosing minimum values. If True, - will compare the baseline with the original y-values given by `data` [2]_. - mask_initial_peaks : bool, optional - If True (default), will mask any data where the initial baseline fit + - the standard deviation of the residual is less than measured data [3]_. - return_coef : bool, optional - If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the `x_data` and `z_data` values and return them in the params - dictionary. Default is False, since the conversion takes time. - num_std : float, optional - The number of standard deviations to include when thresholding. Default - is 1. Must be greater or equal to 0. - max_cross : int, optional - The maximum degree for the cross terms. For example, if `max_cross` is 1, then - ``x * z**2``, ``x**2 * z``, and ``x**2 * z**2`` would all be set to 0. Default is - None, which does not limit the cross terms. - - Returns - ------- - baseline : numpy.ndarray, shape (M, N) - The calculated baseline. - params : dict - A dictionary with the following items: - - * 'weights': numpy.ndarray, shape (M, N) - The weight array used for fitting the data. - * 'tol_history': numpy.ndarray - An array containing the calculated tolerance values for - each iteration. The length of the array is the number of iterations - completed. If the last value in the array is greater than the input - `tol` value, then the function did not converge. - * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) - Only if `return_coef` is True. The array of polynomial parameters - for the baseline, in increasing order. 
Can be used to create a - polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. - - Raises - ------ - ValueError - Raised if `num_std` is less than 0. - - Notes - ----- - Algorithm originally developed in [3]_. - - References - ---------- - .. [1] Gan, F., et al. Baseline correction by improved iterative polynomial - fitting with automatic threshold. Chemometrics and Intelligent - Laboratory Systems, 2006, 82, 59-65. - .. [2] Lieber, C., et al. Automated method for subtraction of fluorescence - from biological raman spectra. Applied Spectroscopy, 2003, 57(11), - 1363-1367. - .. [3] Zhao, J., et al. Automated Autofluorescence Background Subtraction - Algorithm for Biomedical Raman Spectroscopy, Applied Spectroscopy, - 2007, 61(11), 1225-1232. - - """ - if num_std < 0: - raise ValueError('num_std must be greater than or equal to 0') - - y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True, - copy_weights=True, max_cross=max_cross - ) - sqrt_w = np.sqrt(weight_array) - if use_original: - y0 = y - - coef = pseudo_inverse @ (sqrt_w * y) - baseline = self._polynomial.vandermonde @ coef - deviation = np.std(sqrt_w * (y - baseline)) - if mask_initial_peaks: - weight_array[baseline + deviation < y] = 0 - sqrt_w = np.sqrt(weight_array) - pseudo_inverse = np.linalg.pinv(sqrt_w[:, None] * self._polynomial.vandermonde) - - tol_history = np.empty(max_iter) - for i in range(max_iter): - y = np.minimum(y0 if use_original else y, baseline + num_std * deviation) - coef = pseudo_inverse @ (sqrt_w * y) - baseline = self._polynomial.vandermonde @ coef - new_deviation = np.std(sqrt_w * (y - baseline)) - # use new_deviation as dividing term in relative difference - calc_difference = relative_difference(new_deviation, deviation) - tol_history[i] = calc_difference - if calc_difference < tol: - break - deviation = new_deviation - - params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} - if 
return_coef: - params['coef'] = _convert_coef2d( - coef, *self._polynomial.poly_order, self.x_domain, self.z_domain - ) - - return baseline, params - - # adapted from - # https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction; - # see license above - @_Algorithm2D._register( - sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) - ) - def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, - cost_function='asymmetric_truncated_quadratic', threshold=None, - alpha_factor=0.99, return_coef=False, max_cross=None): - """ - Fits a polynomial baseline using a non-quadratic cost function. - - The non-quadratic cost functions penalize residuals with larger values, - giving a more robust fit compared to normal least-squares. - - Parameters - ---------- - data : array-like, shape (M, N) - The y-values of the measured data. - poly_order : int or Sequence[int, int], optional - The polynomial orders for the rows and columns. If a single value is given, will use - that for both rows and columns. Default is 2. - tol : float, optional - The exit criteria. Default is 1e-3. - max_iter : int, optional - The maximum number of iterations. Default is 250. - weights : array-like, shape (M, N), optional - The weighting array. If None (default), then will be an array with - shape equal to (M, N) and all values set to 1. - cost_function : str, optional - The non-quadratic cost function to minimize. Must indicate symmetry of the - method by prepending 'a' or 'asymmetric' for asymmetric loss, and 's' or - 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. 
- Available methods, and their associated reference, are: - - * 'asymmetric_truncated_quadratic'[1]_ - * 'symmetric_truncated_quadratic'[1]_ - * 'asymmetric_huber'[1]_ - * 'symmetric_huber'[1]_ - * 'asymmetric_indec'[2]_ - * 'symmetric_indec'[2]_ - - threshold : float, optional - The threshold value for the loss method, where the function goes from - quadratic loss (such as used for least squares) to non-quadratic. For - symmetric loss methods, residual values with absolute value less than - threshold will have quadratic loss. For asymmetric loss methods, residual - values less than the threshold will have quadratic loss. Default is None, - which sets `threshold` to one-tenth of the standard deviation of the input - data. - alpha_factor : float, optional - A value between 0 and 1 that controls the value of the penalty. Default is - 0.99. Typically should not need to change this value. - return_coef : bool, optional - If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the `x_data` and `z_data` values and return them in the params - dictionary. Default is False, since the conversion takes time. - max_cross : int, optional - The maximum degree for the cross terms. For example, if `max_cross` is 1, then - ``x * z**2``, ``x**2 * z``, and ``x**2 * z**2`` would all be set to 0. Default is - None, which does not limit the cross terms. - - Returns - ------- - baseline : numpy.ndarray, shape (M, N) - The calculated baseline. - params : dict - A dictionary with the following items: - - * 'weights': numpy.ndarray, shape (M, N) - The weight array used for fitting the data. - * 'tol_history': numpy.ndarray - An array containing the calculated tolerance values for - each iteration. The length of the array is the number of iterations - completed. If the last value in the array is greater than the input - `tol` value, then the function did not converge. 
- * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) - Only if `return_coef` is True. The array of polynomial parameters - for the baseline, in increasing order. Can be used to create a - polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. - - Raises - ------ - ValueError - Raised if `alpha_factor` is not between 0 and 1. - - Notes - ----- - In baseline literature, this procedure is sometimes called "backcor". - - References - ---------- - .. [1] Mazet, V., et al. Background removal from spectra by designing and - minimising a non-quadratic cost function. Chemometrics and Intelligent - Laboratory Systems, 2005, 76(2), 121-133. - .. [2] Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline - Correction. Applied Spectroscopy, 2015, 69(7), 834-842. - - """ - if not 0 < alpha_factor <= 1: - raise ValueError('alpha_factor must be between 0 and 1') - symmetric_loss, method = _identify_loss_method(cost_function) - loss_function = { - 'huber': _huber_loss, - 'truncated_quadratic': _truncated_quadratic_loss, - 'indec': _indec_loss - }[method] - - y, weight_array, pseudo_inverse = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, calc_pinv=True, max_cross=max_cross - ) - if threshold is None: - threshold = np.std(y) / 10 - loss_kwargs = { - 'threshold': threshold, 'alpha_factor': alpha_factor, 'symmetric': symmetric_loss - } - - sqrt_w = np.sqrt(weight_array) - y = sqrt_w * y - - coef = pseudo_inverse @ y - baseline = self._polynomial.vandermonde @ coef - tol_history = np.empty(max_iter) - for i in range(max_iter): - baseline_old = baseline - coef = pseudo_inverse @ (y + loss_function(y - sqrt_w * baseline, **loss_kwargs)) - baseline = self._polynomial.vandermonde @ coef - calc_difference = relative_difference(baseline_old, baseline) - tol_history[i] = calc_difference - if calc_difference < tol: - break - - params = {'weights': weight_array, 'tol_history': tol_history[:i + 1]} - if return_coef: - 
params['coef'] = _convert_coef2d( - coef, *self._polynomial.poly_order, self.x_domain, self.z_domain - ) - - return baseline, params - - @_Algorithm2D._register( - sort_keys=('weights',), reshape_baseline=True, reshape_keys=('weights',) - ) - def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, - weights=None, eps=None, return_coef=False, max_cross=None): - """ - Approximates the baseline of the data using quantile regression. - - Parameters - ---------- - data : array-like, shape (M, N) - The y-values of the measured data. - poly_order : int or Sequence[int, int], optional - The polynomial orders for the rows and columns. If a single value is given, will use - that for both rows and columns. Default is 2. - quantile : float, optional - The quantile at which to fit the baseline. Default is 0.05. - tol : float, optional - The exit criteria. Default is 1e-6. For extreme quantiles (`quantile` < 0.01 - or `quantile` > 0.99), may need to use a lower value to get a good fit. - max_iter : int, optional - The maximum number of iterations. Default is 250. For extreme quantiles - (`quantile` < 0.01 or `quantile` > 0.99), may need to use a higher value to - ensure convergence. - weights : array-like, shape (M, N), optional - The weighting array. If None (default), then will be an array with - shape equal to (M, N) and all values set to 1. - eps : float, optional - A small value added to the square of the residual to prevent dividing by 0. - Default is None, which uses the square of the maximum-absolute-value of the - fit each iteration multiplied by 1e-6. - return_coef : bool, optional - If True, will convert the polynomial coefficients for the fit baseline to - a form that fits the `x_data` and `z_data` values and return them in the params - dictionary. Default is False, since the conversion takes time. - max_cross : int, optional - The maximum degree for the cross terms. 
For example, if `max_cross` is 1, then - ``x * z**2``, ``x**2 * z``, and ``x**2 * z**2`` would all be set to 0. Default is - None, which does not limit the cross terms. - - Returns - ------- - baseline : numpy.ndarray, shape (M, N) - The calculated baseline. - params : dict - A dictionary with the following items: - - * 'weights': numpy.ndarray, shape (M, N) - The weight array used for fitting the data. - * 'tol_history': numpy.ndarray - An array containing the calculated tolerance values for - each iteration. The length of the array is the number of iterations - completed. If the last value in the array is greater than the input - `tol` value, then the function did not converge. - * 'coef': numpy.ndarray, shape (``poly_order[0] + 1``, ``poly_order[1] + 1``) - Only if `return_coef` is True. The array of polynomial parameters - for the baseline, in increasing order. Can be used to create a - polynomial using :func:`numpy.polynomial.polynomial.polyval2d`. - - Raises - ------ - ValueError - Raised if `quantile` is not between 0 and 1. - - Notes - ----- - Application of quantile regression for baseline fitting as described in [1]_. - - Performs quantile regression using iteratively reweighted least squares (IRLS) - as described in [2]_. - - References - ---------- - .. [1] Komsta, Ł. Comparison of Several Methods of Chromatographic - Baseline Removal with a New Approach Based on Quantile Regression. - Chromatographia, 2011, 73, 721-731. - .. [2] Schnabel, S., et al. Simultaneous estimation of quantile curves using - quantile sheets. AStA Advances in Statistical Analysis, 2013, 97, 77-87. - - """ - # TODO provide a way to estimate best poly_order based on AIC like in Komsta? 
could be - # useful for all polynomial methods; maybe could be an optimizer function - if not 0 < quantile < 1: - raise ValueError('quantile must be between 0 and 1.') - - y, weight_array = self._setup_polynomial( - data, weights, poly_order, calc_vander=True, max_cross=max_cross - ) - sqrt_w = np.sqrt(weight_array) - baseline_old = y - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - coef = np.linalg.lstsq( - self._polynomial.vandermonde * sqrt_w[:, None], y * sqrt_w, None - )[0] - baseline = self._polynomial.vandermonde @ coef - # relative_difference(baseline_old, baseline, 1) gives nearly same result and - # the l2 norm is faster to calculate, so use that instead of l1 norm - calc_difference = relative_difference(baseline_old, baseline) - tol_history[i] = calc_difference - if calc_difference < tol: - break - sqrt_w = np.sqrt(_weighting._quantile(y, baseline, quantile, eps)) - baseline_old = baseline - - params = {'weights': sqrt_w**2, 'tol_history': tol_history[:i + 1]} - if return_coef: - params['coef'] = _convert_coef2d( - coef, *self._polynomial.poly_order, self.x_domain, self.z_domain - ) - - return baseline, params - - -# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction); -# see license above -def _huber_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): - """ - The Huber non-quadratic cost function. - - Parameters - ---------- - residual : numpy.ndarray, shape (N,) - The residual array. - threshold : float, optional - Any residual values below the threshold are given quadratic loss. - Default is 1.0. - alpha_factor : float, optional - The scale between 0 and 1 to multiply the cost function's alpha_max - value (see Notes below). Default is 0.99. - symmetric : bool, optional - If True (default), the cost function is symmetric and applies the same - weighting for positive and negative values. 
If False, will apply weights - asymmetrically so that only positive weights are given the non-quadratic - weigting and negative weights have normal, quadratic weighting. - - Returns - ------- - weights : numpy.ndarray, shape (N,) - The weight array. - - Notes - ----- - The returned result is - - -residual + alpha_factor * alpha_max * phi'(residual) - - where phi'(x) is the derivative of the huber loss function, phi(x). - - References - ---------- - Mazet, V., et al. Background removal from spectra by designing and - minimising a non-quadratic cost function. Chemometrics and Intelligent - Laboratory Systems, 2005, 76(2), 121-133. - - """ - alpha = alpha_factor * 0.5 # alpha_max for huber is 0.5 - if symmetric: - mask = (np.abs(residual) < threshold) - weights = ( - mask * residual * (2 * alpha - 1) - + (~mask) * 2 * alpha * threshold * np.sign(residual) - ) - else: - mask = (residual < threshold) - weights = ( - mask * residual * (2 * alpha - 1) - + (~mask) * (2 * alpha * threshold - residual) - ) - return weights - - -# adapted from (https://www.mathworks.com/matlabcentral/fileexchange/27429-background-correction); -# see license above -def _truncated_quadratic_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): - """ - The Truncated-Quadratic non-quadratic cost function. - - Parameters - ---------- - residual : numpy.ndarray, shape (N,) - The residual array. - threshold : float, optional - Any residual values below the threshold are given quadratic loss. - Default is 1.0. - alpha_factor : float, optional - The scale between 0 and 1 to multiply the cost function's alpha_max - value (see Notes below). Default is 0.99. - symmetric : bool, optional - If True (default), the cost function is symmetric and applies the same - weighting for positive and negative values. If False, will apply weights - asymmetrically so that only positive weights are given the non-quadratic - weigting and negative weights have normal, quadratic weighting. 
- - Returns - ------- - weights : numpy.ndarray, shape (N,) - The weight array. - - Notes - ----- - The returned result is - - -residual + alpha_factor * alpha_max * phi'(residual) - - where phi'(x) is the derivative of the truncated quadratic function, phi(x). - - References - ---------- - Mazet, V., et al. Background removal from spectra by designing and - minimising a non-quadratic cost function. Chemometrics and Intelligent - Laboratory Systems, 2005, 76(2), 121-133. - - """ - alpha = alpha_factor * 0.5 # alpha_max for truncated quadratic is 0.5 - if symmetric: - mask = (np.abs(residual) < threshold) - else: - mask = (residual < threshold) - return mask * residual * (2 * alpha - 1) - (~mask) * residual - - -def _indec_loss(residual, threshold=1.0, alpha_factor=0.99, symmetric=True): - """ - The Indec non-quadratic cost function. - - Parameters - ---------- - residual : numpy.ndarray, shape (N,) - The residual array. - threshold : float, optional - Any residual values below the threshold are given quadratic loss. - Default is 1.0. - alpha_factor : float, optional - The scale between 0 and 1 to multiply the cost function's alpha_max - value (see Notes below). Default is 0.99. - symmetric : bool, optional - If True (default), the cost function is symmetric and applies the same - weighting for positive and negative values. If False, will apply weights - asymmetrically so that only positive weights are given the non-quadratic - weigting and negative weights have normal, quadratic weighting. - - Returns - ------- - weights : numpy.ndarray, shape (N,) - The weight array. - - Notes - ----- - The returned result is - - -residual + alpha_factor * alpha_max * phi'(residual) - - where phi'(x) is the derivative of the Indec function, phi(x). - - References - ---------- - Liu, J., et al. Goldindec: A Novel Algorithm for Raman Spectrum Baseline - Correction. Applied Spectroscopy, 2015, 69(7), 834-842. - - Mazet, V., et al. 
Background removal from spectra by designing and - minimising a non-quadratic cost function. Chemometrics and Intelligent - Laboratory Systems, 2005, 76(2), 121-133. - - """ - alpha = alpha_factor * 0.5 # alpha_max for indec is 0.5 - if symmetric: - mask = (np.abs(residual) < threshold) - multiple = np.sign(residual) - else: - mask = (residual < threshold) - # multiple=1 is same as sign(residual) since residual is always > 0 - # for asymmetric case, but this allows not doing the sign calculation - multiple = 1 - weights = ( - mask * residual * (2 * alpha - 1) - - (~mask) * ( - residual + alpha * multiple * threshold**3 / np.maximum(2 * residual**2, _MIN_FLOAT) - ) - ) - return weights - - -def _identify_loss_method(loss_method): - """ - Identifies the symmetry for the given loss method. - - Parameters - ---------- - loss_method : str - The loss method to use. Should have the symmetry identifier as - the prefix. - - Returns - ------- - symmetric : bool - True if `loss_method` had 's_' or 'symmetric_' as the prefix, else False. - str - The input `loss_method` value without the first section that indicated - the symmetry. - - Raises - ------ - ValueError - Raised if the loss method does not have the correct form. 
- - """ - prefix, *split_method = loss_method.lower().split('_') - if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: - raise ValueError('must specify loss function symmetry by prepending "a_" or "s_"') - if prefix in ('a', 'asymmetric'): - symmetric = False - else: - symmetric = True - return symmetric, '_'.join(split_method) diff --git a/pybaselines/two_d/smooth.py b/pybaselines/two_d/smooth.py index 61a120b..7d3c176 100644 --- a/pybaselines/two_d/smooth.py +++ b/pybaselines/two_d/smooth.py @@ -15,7 +15,7 @@ class _Smooth(_Algorithm2D): """A base class for all smoothing algorithms.""" - @_Algorithm2D._register + @_Algorithm2D._handle_io def noise_median(self, data, half_window=None, smooth_half_window=None, sigma=None, pad_kwargs=None, **kwargs): """ diff --git a/pybaselines/two_d/spline.py b/pybaselines/two_d/spline.py index 1f93940..cebb3f7 100644 --- a/pybaselines/two_d/spline.py +++ b/pybaselines/two_d/spline.py @@ -6,22 +6,21 @@ """ -import warnings import numpy as np from .. import _weighting -from .._validation import _check_scalar_variable +from .._nd.pls import _PLSNDMixin +from .._validation import _check_spline_degree from ..results import PSplineResult2D -from ..utils import ParameterWarning, gaussian, relative_difference, _MIN_FLOAT +from ..utils import relative_difference from ._algorithm_setup import _Algorithm2D from ._whittaker_utils import PenalizedSystem2D -class _Spline(_Algorithm2D): +class _Spline(_Algorithm2D, _PLSNDMixin): """A base class for all spline algorithms.""" - @_Algorithm2D._register(sort_keys=('weights',)) def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, diff_order=3, max_iter=50, tol=1e-3, weights=None, symmetric=False): """ @@ -71,9 +70,9 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. 
- params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -90,7 +89,7 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di Raises ------ ValueError - Raised if p is not between 0 and 1. + Raised if `p` is not between 0 and 1. References ---------- @@ -101,100 +100,13 @@ def mixture_model(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, di preprint arXiv:1901.06708, 2019. """ - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._mixture_model( + data, lam=lam, p=p, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots, + symmetric=symmetric ) - # scale y between -1 and 1 so that the residual fit is more numerically stable - # TODO is this still necessary now that expectation-maximization is used? 
-> still - # helps to prevent overflows when using gaussian - y_domain = np.polynomial.polyutils.getdomain(y.ravel()) - y = np.polynomial.polyutils.mapdomain(y, y_domain, np.array([-1., 1.])) - - if weights is not None: - baseline = pspline.solve(y, weight_array) - else: - # perform 2 iterations: first is a least-squares fit and second is initial - # reweighted fit; 2 fits are needed to get weights to have a decent starting - # distribution for the expectation-maximization - if symmetric and not 0.2 < p < 0.8: - # p values far away from 0.5 with symmetric=True give bad initial weights - # for the expectation maximization - warnings.warn( - 'should use a p value closer to 0.5 when symmetric is True', - ParameterWarning, stacklevel=2 - ) - for _ in range(2): - baseline = pspline.solve(y, weight_array) - weight_array = _weighting._asls(y, baseline, p) - - residual = y - baseline - # the 0.2 * std(residual) is an "okay" starting sigma estimate - sigma = 0.2 * np.std(residual) - fraction_noise = 0.5 - if symmetric: - fraction_positive = 0.25 - else: - fraction_positive = 1 - fraction_noise - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - # expectation part of expectation-maximization -> calc pdfs and - # posterior probabilities - positive_pdf = np.where( - residual >= 0, fraction_positive / max(abs(residual.max()), 1e-6), 0 - ) - noise_pdf = ( - fraction_noise * gaussian(residual, 1 / (sigma * np.sqrt(2 * np.pi)), 0, sigma) - ) - total_pdf = noise_pdf + positive_pdf - if symmetric: - negative_pdf = np.where( - residual < 0, - (1 - fraction_noise - fraction_positive) / max(abs(residual.min()), 1e-6), - 0 - ) - total_pdf += negative_pdf - posterior_prob_noise = noise_pdf / np.maximum(total_pdf, _MIN_FLOAT) - - calc_difference = relative_difference(weight_array, posterior_prob_noise) - tol_history[i] = calc_difference - if calc_difference < tol: - break - - # maximization part of expectation-maximization -> update sigma and - # fractions of each pdf 
- noise_sum = posterior_prob_noise.sum() - sigma = np.sqrt((posterior_prob_noise * residual**2).sum() / noise_sum) - if not symmetric: - fraction_noise = posterior_prob_noise.mean() - fraction_positive = 1 - fraction_noise - else: - posterior_prob_positive = positive_pdf / total_pdf - posterior_prob_negative = negative_pdf / total_pdf - - positive_sum = posterior_prob_positive.sum() - negative_sum = posterior_prob_negative.sum() - total_sum = noise_sum + positive_sum + negative_sum - - fraction_noise = noise_sum / total_sum - fraction_positive = positive_sum / total_sum - - weight_array = posterior_prob_noise - baseline = pspline.solve(y, weight_array) - residual = y - baseline - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult2D(pspline, weight_array) - } - - baseline = np.polynomial.polyutils.mapdomain(baseline, np.array([-1., 1.]), y_domain) - - return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, diff_order=3, max_iter=100, tol=1e-6, weights=None, eps=None): """ @@ -236,9 +148,9 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -255,7 +167,7 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, Raises ------ ValueError - Raised if quantile is not between 0 and 1. + Raised if `quantile` is not between 0 and 1. References ---------- @@ -264,31 +176,12 @@ def irsqr(self, data, lam=1e3, quantile=0.05, num_knots=25, spline_degree=3, Science and Control Engineering (ICISCE), 2018, 280-284. 
""" - if not 0 < quantile < 1: - raise ValueError('quantile must be between 0 and 1') - - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._irsqr( + data, lam=lam, quantile=quantile, num_knots=num_knots, spline_degree=spline_degree, + diff_order=diff_order, max_iter=max_iter, tol=tol, weights=weights, eps=eps ) - old_coef = np.zeros(np.prod(self._spline_basis._num_bases)) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve(y, weight_array) - calc_difference = relative_difference(old_coef, pspline.coef) - tol_history[i] = calc_difference - if calc_difference < tol: - break - old_coef = pspline.coef - weight_array = _weighting._quantile(y, baseline, quantile, eps) - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult2D(pspline, weight_array) - } - return baseline, params - - @_Algorithm2D._register(sort_keys=('weights',)) def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -326,9 +219,9 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, dif Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -362,30 +255,13 @@ def pspline_asls(self, data, lam=1e3, p=1e-2, num_knots=25, spline_degree=3, dif Reviews: Computational Statistics, 2010, 2(6), 637-653. 
""" - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._asls( + data, lam=lam, p=p, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots ) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve(y, weight_array) - new_weights = _weighting._asls(y, baseline, p) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult2D(pspline, weight_array) - } - return baseline, params - - @_Algorithm2D._register(sort_keys=('weights',)) + @_Algorithm2D._handle_io(sort_keys=('weights',)) def pspline_iasls(self, data, lam=1e3, p=1e-2, lam_1=1e-4, num_knots=25, spline_degree=3, max_iter=50, tol=1e-3, weights=None, diff_order=2): """ @@ -514,7 +390,6 @@ def pspline_iasls(self, data, lam=1e3, p=1e-2, lam_1=1e-4, num_knots=25, return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None, normalize_weights=False): """ @@ -552,9 +427,9 @@ def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -581,33 +456,13 @@ def pspline_airpls(self, data, lam=1e3, num_knots=25, spline_degree=3, Reviews: Computational Statistics, 2010, 2(6), 637-653. 
""" - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._airpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots, + normalize_weights=normalize_weights ) - y_l1_norm = np.abs(y).sum() - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = pspline.solve(y, weight_array) - new_weights, residual_l1_norm, exit_early = _weighting._airpls( - y, baseline, i, normalize_weights - ) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = residual_l1_norm / y_l1_norm - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': PSplineResult2D(pspline, weight_array) - } - return baseline, params - - @_Algorithm2D._register(sort_keys=('weights',)) def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -641,9 +496,9 @@ def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -670,30 +525,12 @@ def pspline_arpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order Reviews: Computational Statistics, 2010, 2(6), 637-653. 
""" - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._arpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots ) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve(y, weight_array) - new_weights, exit_early = _weighting._arpls(y, baseline) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult2D(pspline, weight_array) - } - - return baseline, params - - @_Algorithm2D._register(sort_keys=('weights',)) def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -727,9 +564,9 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_orde Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -757,30 +594,12 @@ def pspline_iarpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_orde Reviews: Computational Statistics, 2010, 2(6), 637-653. 
""" - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._iarpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots ) - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = pspline.solve(y, weight_array) - new_weights, exit_early = _weighting._iarpls(y, baseline, i) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': PSplineResult2D(pspline, weight_array) - } - - return baseline, params - - @_Algorithm2D._register(sort_keys=('weights',)) def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -824,9 +643,9 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degr Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -860,34 +679,12 @@ def pspline_psalsa(self, data, lam=1e3, p=0.5, k=None, num_knots=25, spline_degr Reviews: Computational Statistics, 2010, 2(6), 637-653. 
""" - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._psalsa( + data, lam=lam, p=p, k=k, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots ) - if k is None: - k = np.std(y) / 10 - else: - k = _check_scalar_variable(k, variable_name='k') - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = pspline.solve(y, weight_array) - new_weights = _weighting._psalsa(y, baseline, p, k, self._shape) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': PSplineResult2D(pspline, weight_array) - } - return baseline, params - - @_Algorithm2D._register(sort_keys=('weights',)) def pspline_brpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=50, tol_2=1e-3, weights=None): """ @@ -915,7 +712,7 @@ def pspline_brpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - max_iter_2 : float, optional + max_iter_2 : int, optional The number of iterations for updating the proportion of data occupied by peaks. Default is 50. tol_2 : float, optional @@ -927,9 +724,9 @@ def pspline_brpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. 
- params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -961,55 +758,13 @@ def pspline_brpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order Reviews: Computational Statistics, 2010, 2(6), 637-653. """ - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._brpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + max_iter_2=max_iter_2, tol_2=tol_2, weights=weights, + spline_degree=spline_degree, num_knots=num_knots ) - beta = 0.5 - j_max = 0 - baseline = y - baseline_weights = weight_array - tol_history = np.zeros((max_iter_2 + 2, max(max_iter, max_iter_2) + 1)) - # implementation note: weight_array must always be updated since otherwise when - # reentering the inner loop, new_baseline and baseline would be the same; instead, - # use baseline_weights to track which weights produced the output baseline - for i in range(max_iter_2 + 1): - for j in range(max_iter + 1): - new_baseline = pspline.solve(y, weight_array) - new_weights, exit_early = _weighting._brpls(y, new_baseline, beta) - if exit_early: - j -= 1 # reduce j so that output tol_history indexing is correct - tol_2 = np.inf # ensure it exits outer loop - break - # Paper used norm(old - new) / norm(new) rather than old in the denominator, - # but I use old in the denominator instead to be consistent with all other - # algorithms; does not make a major difference - calc_difference = relative_difference(baseline, new_baseline) - tol_history[i + 1, j] = calc_difference - if calc_difference < tol: - if i == 0 and j == 0: # for cases where tol == inf - baseline = new_baseline - break - baseline_weights = weight_array - weight_array = new_weights - baseline = new_baseline - j_max = max(j, j_max) - weight_array = new_weights - weight_mean = weight_array.mean() - calc_difference_2 = abs(beta + weight_mean - 1) - 
tol_history[0, i] = calc_difference_2 - if calc_difference_2 < tol_2: - break - beta = 1 - weight_mean - - params = { - 'weights': baseline_weights, 'tol_history': tol_history[:i + 2, :max(i, j_max) + 1], - 'result': PSplineResult2D(pspline, weight_array) - } - - return baseline, params - - @_Algorithm2D._register(sort_keys=('weights',)) def pspline_lsrpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_order=2, max_iter=50, tol=1e-3, weights=None, alternate_weighting=False): """ @@ -1049,9 +804,9 @@ def pspline_lsrpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_orde Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -1091,25 +846,9 @@ def pspline_lsrpls(self, data, lam=1e3, num_knots=25, spline_degree=3, diff_orde Reviews: Computational Statistics, 2010, 2(6), 637-653. """ - y, weight_array, pspline = self._setup_spline( - data, weights, spline_degree, num_knots, True, diff_order, lam + _check_spline_degree(spline_degree) + return super()._lsrpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, spline_degree=spline_degree, num_knots=num_knots, + alternate_weighting=alternate_weighting ) - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = pspline.solve(y, weight_array) - new_weights, exit_early = _weighting._lsrpls(y, baseline, i, alternate_weighting) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': PSplineResult2D(pspline, weight_array) - } - - return baseline, params diff --git a/pybaselines/two_d/whittaker.py 
b/pybaselines/two_d/whittaker.py index edfc6ce..7960c2c 100644 --- a/pybaselines/two_d/whittaker.py +++ b/pybaselines/two_d/whittaker.py @@ -10,6 +10,7 @@ from .. import _weighting from .._compat import diags +from .._nd.pls import _PLSNDMixin from .._validation import _check_optional_array, _check_scalar_variable from ..results import WhittakerResult2D from ..utils import _MIN_FLOAT, relative_difference @@ -17,10 +18,9 @@ from ._whittaker_utils import PenalizedSystem2D -class _Whittaker(_Algorithm2D): +class _Whittaker(_Algorithm2D, _PLSNDMixin): """A base class for all Whittaker-smoothing-based algorithms.""" - @_Algorithm2D._register(sort_keys=('weights',)) def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None, num_eigens=(10, 10), return_dof=False): """ @@ -62,9 +62,9 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -98,38 +98,12 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh practical use. ASTIN Bulletin, 2025, 1-31. 
""" - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - y, weight_array, whittaker_system = self._setup_whittaker( - data, lam, diff_order, weights, num_eigens=num_eigens + return super()._asls( + data, lam=lam, p=p, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, num_eigens=num_eigens, return_dof=return_dof ) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = whittaker_system.solve(y, weight_array) - new_weights = _weighting._asls(y, baseline, p) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'tol_history': tol_history[:i + 1], - 'result': WhittakerResult2D(whittaker_system, weight_array) - } - if whittaker_system._using_svd: - params['weights'] = weight_array - if return_dof: - params['dof'] = whittaker_system._calc_dof(weight_array) - else: - baseline = baseline.reshape(self._shape) - params['weights'] = weight_array.reshape(self._shape) - - return baseline, params - @_Algorithm2D._register( - sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True - ) + @_Algorithm2D._handle_io(sort_keys=('weights',), reshape_keys=('weights',)) def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, weights=None, diff_order=2): """ @@ -241,7 +215,6 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None, num_eigens=(10, 10), return_dof=False, normalize_weights=False): """ @@ -283,9 +256,9 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. 
- params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -295,13 +268,13 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. + * 'result': WhittakerResult2D + An object that can use the results of the fit to perform additional + calculations. * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) Only if `return_dof` is True. The effective degrees of freedom associated with each eigenvector. Lower values signify that the eigenvector was less important for the fit. - * 'result': WhittakerResult2D - An object that can use the results of the fit to perform additional - calculations. References ---------- @@ -312,41 +285,12 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non practical use. ASTIN Bulletin, 2025, 1-31. 
""" - y, weight_array, whittaker_system = self._setup_whittaker( - data, lam, diff_order, weights, num_eigens=num_eigens - + return super()._airpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, num_eigens=num_eigens, return_dof=return_dof, + normalize_weights=normalize_weights ) - y_l1_norm = np.abs(y).sum() - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = whittaker_system.solve(y, weight_array) - new_weights, residual_l1_norm, exit_early = _weighting._airpls( - y, baseline, i, normalize_weights - ) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = residual_l1_norm / y_l1_norm - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - params = { - 'tol_history': tol_history[:i], - 'result': WhittakerResult2D(whittaker_system, weight_array) - } - if whittaker_system._using_svd: - params['weights'] = weight_array - if return_dof: - params['dof'] = whittaker_system._calc_dof(weight_array) - else: - baseline = baseline.reshape(self._shape) - params['weights'] = weight_array.reshape(self._shape) - - return baseline, params - - @_Algorithm2D._register(sort_keys=('weights',)) def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None, num_eigens=(10, 10), return_dof=False): """ @@ -384,9 +328,9 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -413,39 +357,12 @@ def arpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None practical use. ASTIN Bulletin, 2025, 1-31. 
""" - y, weight_array, whittaker_system = self._setup_whittaker( - data, lam, diff_order, weights, num_eigens=num_eigens + return super()._arpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, num_eigens=num_eigens, return_dof=return_dof ) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = whittaker_system.solve(y, weight_array) - new_weights, exit_early = _weighting._arpls(y, baseline) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - params = { - 'tol_history': tol_history[:i + 1], - 'result': WhittakerResult2D(whittaker_system, weight_array) - } - if whittaker_system._using_svd: - params['weights'] = weight_array - if return_dof: - params['dof'] = whittaker_system._calc_dof(weight_array) - else: - baseline = baseline.reshape(self._shape) - params['weights'] = weight_array.reshape(self._shape) - - return baseline, params - - @_Algorithm2D._register( - sort_keys=('weights',), reshape_keys=('weights',), reshape_baseline=True - ) + @_Algorithm2D._handle_io(sort_keys=('weights',), reshape_keys=('weights',)) def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, diff_order=2): """ Doubly reweighted penalized least squares (drPLS) baseline. @@ -538,7 +455,6 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None, num_eigens=(10, 10), return_dof=False): """ @@ -576,9 +492,9 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. 
- params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -588,13 +504,13 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. + * 'result': WhittakerResult2D + An object that can use the results of the fit to perform additional + calculations. * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) Only if `return_dof` is True. The effective degrees of freedom associated with each eigenvector. Lower values signify that the eigenvector was less important for the fit. - * 'result': WhittakerResult2D - An object that can use the results of the fit to perform additional - calculations. References ---------- @@ -606,39 +522,12 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non practical use. ASTIN Bulletin, 2025, 1-31. 
""" - y, weight_array, whittaker_system = self._setup_whittaker( - data, lam, diff_order, weights, num_eigens=num_eigens + return super()._iarpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, num_eigens=num_eigens, return_dof=return_dof ) - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = whittaker_system.solve(y, weight_array) - new_weights, exit_early = _weighting._iarpls(y, baseline, i) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - params = { - 'tol_history': tol_history[:i], - 'result': WhittakerResult2D(whittaker_system, weight_array) - } - if whittaker_system._using_svd: - params['weights'] = weight_array - if return_dof: - params['dof'] = whittaker_system._calc_dof(weight_array) - else: - baseline = baseline.reshape(self._shape) - params['weights'] = weight_array.reshape(self._shape) - - return baseline, params - - @_Algorithm2D._register( - sort_keys=('weights', 'alpha'), reshape_keys=('weights', 'alpha'), reshape_baseline=True - ) + @_Algorithm2D._handle_io(sort_keys=('weights', 'alpha'), reshape_keys=('weights', 'alpha')) def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, weights=None, alpha=None, asymmetric_coef=2., alternate_weighting=True): """ @@ -765,12 +654,11 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, params = { 'weights': weight_array, 'alpha': alpha_array, 'tol_history': tol_history[:i + 1], - 'result': WhittakerResult2D(whittaker_system, weight_array, penalty=penalty) + 'result': WhittakerResult2D(whittaker_system, weight_array, lhs=penalty) } return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, 
weights=None, num_eigens=(10, 10), return_dof=False): """ @@ -822,9 +710,9 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -834,13 +722,13 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. + * 'result': WhittakerResult2D + An object that can use the results of the fit to perform additional + calculations. * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) Only if `return_dof` is True. The effective degrees of freedom associated with each eigenvector. Lower values signify that the eigenvector was less important for the fit. - * 'result': WhittakerResult2D - An object that can use the results of the fit to perform additional - calculations. Raises ------ @@ -865,42 +753,11 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e practical use. ASTIN Bulletin, 2025, 1-31. 
""" - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - y, weight_array, whittaker_system = self._setup_whittaker( - data, lam, diff_order, weights, num_eigens=num_eigens + return super()._psalsa( + data, lam=lam, p=p, k=k, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, num_eigens=num_eigens, return_dof=return_dof ) - if k is None: - k = np.std(y) / 10 - else: - k = _check_scalar_variable(k, variable_name='k') - - shape = self._shape if whittaker_system._using_svd else self._size - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = whittaker_system.solve(y, weight_array) - new_weights = _weighting._psalsa(y, baseline, p, k, shape) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'tol_history': tol_history[:i + 1], - 'result': WhittakerResult2D(whittaker_system, weight_array) - } - if whittaker_system._using_svd: - params['weights'] = weight_array - if return_dof: - params['dof'] = whittaker_system._calc_dof(weight_array) - else: - baseline = baseline.reshape(self._shape) - params['weights'] = weight_array.reshape(self._shape) - - return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) def brpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=50, tol_2=1e-3, weights=None, num_eigens=(10, 10), return_dof=False): """ @@ -922,7 +779,7 @@ def brpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - max_iter_2 : float, optional + max_iter_2 : int, optional The number of iterations for updating the proportion of data occupied by peaks. Default is 50. 
tol_2 : float, optional @@ -944,9 +801,9 @@ def brpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -960,13 +817,13 @@ def brpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 `max_iter_2`, `tol_2`), and shape K is the maximum of the number of iterations for the threshold and the maximum number of iterations for all of the fits of the various threshold values (related to `max_iter` and `tol`). + * 'result': WhittakerResult2D + An object that can use the results of the fit to perform additional + calculations. * 'dof' : numpy.ndarray, shape (`num_eigens[0]`, `num_eigens[1]`) Only if `return_dof` is True. The effective degrees of freedom associated with each eigenvector. Lower values signify that the eigenvector was less important for the fit. - * 'result': WhittakerResult2D - An object that can use the results of the fit to perform additional - calculations. References ---------- @@ -978,62 +835,12 @@ def brpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 practical use. ASTIN Bulletin, 2025, 1-31. 
""" - y, weight_array, whittaker_system = self._setup_whittaker( - data, lam, diff_order, weights, num_eigens=num_eigens + return super()._brpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + max_iter_2=max_iter_2, tol_2=tol_2, weights=weights, num_eigens=num_eigens, + return_dof=return_dof ) - beta = 0.5 - j_max = 0 - baseline = y - baseline_weights = weight_array - tol_history = np.zeros((max_iter_2 + 2, max(max_iter, max_iter_2) + 1)) - # implementation note: weight_array must always be updated since otherwise when - # reentering the inner loop, new_baseline and baseline would be the same; instead, - # use baseline_weights to track which weights produced the output baseline - for i in range(max_iter_2 + 1): - for j in range(max_iter + 1): - new_baseline = whittaker_system.solve(y, weight_array) - new_weights, exit_early = _weighting._brpls(y, new_baseline, beta) - if exit_early: - j -= 1 # reduce j so that output tol_history indexing is correct - tol_2 = np.inf # ensure it exits outer loop - break - # Paper used norm(old - new) / norm(new) rather than old in the denominator, - # but I use old in the denominator instead to be consistent with all other - # algorithms; does not make a major difference - calc_difference = relative_difference(baseline, new_baseline) - tol_history[i + 1, j] = calc_difference - if calc_difference < tol: - if i == 0 and j == 0: # for cases where tol == inf - baseline = new_baseline - break - baseline_weights = weight_array - weight_array = new_weights - baseline = new_baseline - j_max = max(j, j_max) - - weight_array = new_weights - weight_mean = weight_array.mean() - calc_difference_2 = abs(beta + weight_mean - 1) - tol_history[0, i] = calc_difference_2 - if calc_difference_2 < tol_2: - break - beta = 1 - weight_mean - - params = { - 'tol_history': tol_history[:i + 2, :max(i, j_max) + 1], - 'result': WhittakerResult2D(whittaker_system, weight_array) - } - if whittaker_system._using_svd: - params['weights'] = 
baseline_weights - if return_dof: - params['dof'] = whittaker_system._calc_dof(baseline_weights) - else: - baseline = baseline.reshape(self._shape) - params['weights'] = baseline_weights.reshape(self._shape) - - return baseline, params - @_Algorithm2D._register(sort_keys=('weights',)) def lsrpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=None, num_eigens=(10, 10), return_dof=False, alternate_weighting=False): """ @@ -1077,9 +884,9 @@ def lsrpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=Non Returns ------- - baseline : numpy.ndarray, shape (M, N) + numpy.ndarray, shape (M, N) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (M, N) @@ -1119,32 +926,8 @@ def lsrpls(self, data, lam=1e3, diff_order=2, max_iter=50, tol=1e-3, weights=Non for practical use. ASTIN Bulletin, 2025, 1-31. """ - y, weight_array, whittaker_system = self._setup_whittaker( - data, lam, diff_order, weights, num_eigens=num_eigens + return super()._lsrpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, num_eigens=num_eigens, return_dof=return_dof, + alternate_weighting=alternate_weighting ) - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = whittaker_system.solve(y, weight_array) - new_weights, exit_early = _weighting._lsrpls(y, baseline, i, alternate_weighting) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'tol_history': tol_history[:i], - 'result': WhittakerResult2D(whittaker_system, weight_array) - } - if whittaker_system._using_svd: - params['weights'] = weight_array - if return_dof: - params['dof'] = whittaker_system._calc_dof(weight_array) - else: - baseline = 
baseline.reshape(self._shape) - params['weights'] = weight_array.reshape(self._shape) - - return baseline, params diff --git a/pybaselines/utils.py b/pybaselines/utils.py index 80c2819..e05b8ab 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -9,7 +9,7 @@ from math import ceil import numpy as np -from scipy.ndimage import grey_opening +from scipy.ndimage import grey_dilation, grey_erosion, grey_opening from scipy.signal import convolve from scipy.special import binom from scipy.stats import skew @@ -872,6 +872,62 @@ def optimize_window(*args, **kwargs): return estimate_window(*args, **kwargs) +def _make_window(y, half_window): + """ + Converts a half-window to full window for use with ndimage filters. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) or shape (M, N) + The array of the measured data. Can be one or two dimensional. + half_window : int or numpy.ndarray([int, int]), optional + The half window size to use for the operations. + + Returns + ------- + window : numpy.ndarray, shape (2,) or list[int, ...] + The full window, with length matching the input y-dimensions for use + within SciPy's `ndimage` module. + + """ + window = 2 * half_window + 1 + if isinstance(window, int): + window = [window] * y.ndim + + return window + + +def _avg_opening(y, half_window, opening=None): + """ + Averages the dilation and erosion of a morphological opening on data. + + Parameters + ---------- + y : numpy.ndarray, shape (N,) or shape (M, N) + The array of the measured data. Can be one or two dimensional. + half_window : int or numpy.ndarray([int, int]), optional + The half window size to use for the operations. + opening : numpy.ndarray, shape (N,) or shape (M, N), optional + The output of ``scipy.ndimage.grey_opening(y, window_size)``. Default is + None, which will compute the value. + + Returns + ------- + numpy.ndarray, shape (N,) or shape (M, N) + The average of the dilation and erosion of the opening. 
+ + References + ---------- + Perez-Pueyo, R., et al. Morphology-Based Automated Baseline Removal for + Raman Spectra of Artistic Pigments. Applied Spectroscopy, 2010, 64 595-600. + + """ + window = _make_window(y, half_window) + if opening is None: + opening = grey_opening(y, window) + return 0.5 * (grey_dilation(opening, window) + grey_erosion(opening, window)) + + def _inverted_sort(sort_order): """ Finds the indices that invert a sorting. @@ -1063,10 +1119,7 @@ def whittaker_smooth(data, lam=1e6, diff_order=2, weights=None, check_finite=Tru penalized_system = PenalizedSystem(len_y, lam=lam, diff_order=diff_order) weight_array = _check_optional_array(len_y, weights, check_finite=check_finite) - y_smooth = penalized_system.solve( - penalized_system.add_diagonal(weight_array), - weight_array * y, overwrite_ab=True, overwrite_b=True - ) + y_smooth = penalized_system.solve(y, weight_array) return y_smooth @@ -1129,7 +1182,7 @@ def pspline_smooth(data, x_data=None, lam=1e1, num_knots=100, spline_degree=3, d weight_array = _check_optional_array( len(y), weights, dtype=float, order='C', check_finite=check_finite ) - y_smooth = pspline.solve_pspline(y, weight_array) + y_smooth = pspline.solve(y, weight_array) return y_smooth, pspline.tck diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index 46a02c4..0b33252 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -11,15 +11,15 @@ from . 
import _weighting from ._algorithm_setup import _Algorithm, _class_wrapper from ._banded_utils import _shift_rows, diff_penalty_diagonals +from ._nd.pls import _PLSNDMixin from ._validation import _check_lam, _check_optional_array, _check_scalar_variable from .results import WhittakerResult -from .utils import _mollifier_kernel, pad_edges, padded_convolve, relative_difference +from .utils import relative_difference -class _Whittaker(_Algorithm): +class _Whittaker(_Algorithm, _PLSNDMixin): """A base class for all Whittaker-smoothing-based algorithms.""" - @_Algorithm._register(sort_keys=('weights',)) def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None): r""" Fits the baseline using the asymmetric least squares (AsLS) algorithm. @@ -63,9 +63,9 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -140,30 +140,12 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh >>> plt.show() """ - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights = _weighting._asls(y, baseline, p) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': WhittakerResult(whittaker_system, weight_array) - } - - return baseline, params + return super()._asls( + data, lam=lam, p=p, 
diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights + ) - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, weights=None, diff_order=2): r""" @@ -344,10 +326,7 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, d1_y = lambda_1 * d1_y tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y + d1_y, - overwrite_b=True - ) + baseline = whittaker_system.solve(y, weight_array, rhs_extra=d1_y) new_weights = _weighting._iasls(y, baseline, p) calc_difference = relative_difference(weight_array, new_weights) tol_history[i] = calc_difference @@ -364,7 +343,6 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None, normalize_weights=False): r""" @@ -412,9 +390,9 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. 
- params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -472,34 +450,11 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non >>> plt.show() """ - y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) - y_l1_norm = np.abs(y).sum() - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights, residual_l1_norm, exit_early = _weighting._airpls( - y, baseline, i, normalize_weights - ) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = residual_l1_norm / y_l1_norm - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': WhittakerResult(whittaker_system, weight_array) - } - - return baseline, params + return super()._airpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, normalize_weights=normalize_weights + ) - @_Algorithm._register(sort_keys=('weights',)) def arpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None): r""" Asymmetrically reweighted penalized least squares smoothing (arPLS). @@ -544,9 +499,9 @@ def arpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. 
- params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -584,31 +539,11 @@ def arpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None >>> plt.show() """ - y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights, exit_early = _weighting._arpls(y, baseline) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': WhittakerResult(whittaker_system, weight_array) - } - - return baseline, params + return super()._arpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, weights=weights + ) - @_Algorithm._register(sort_keys=('weights',)) + @_Algorithm._handle_io(sort_keys=('weights',)) def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, diff_order=2): """ Doubly reweighted penalized least squares (drPLS) baseline. 
@@ -686,7 +621,7 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif diff_n_diagonals * weight_array, diff_order, diff_order ) lhs = whittaker_system.penalty + penalty_with_weights - baseline = whittaker_system.solve( + baseline = whittaker_system.direct_solve( lhs, weight_array * y, overwrite_b=True, l_and_u=lower_upper_bands ) new_weights, exit_early = _weighting._drpls(y, baseline, i) @@ -707,7 +642,6 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ Improved asymmetrically reweighted penalized least squares smoothing (IarPLS). @@ -732,9 +666,9 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -755,31 +689,11 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non 59, 10933-10943. 
""" - y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights, exit_early = _weighting._iarpls(y, baseline, i) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': WhittakerResult(whittaker_system, weight_array) - } - - return baseline, params + return super()._iarpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, weights=weights + ) - @_Algorithm._register(sort_keys=('weights', 'alpha')) + @_Algorithm._handle_io(sort_keys=('weights', 'alpha')) def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, weights=None, alpha=None, asymmetric_coef=2., alternate_weighting=True): """ @@ -878,14 +792,11 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, alpha_array = alpha_array[self._sort_order] asymmetric_coef = _check_scalar_variable(asymmetric_coef, variable_name='asymmetric_coef') - lower_upper_bands = (diff_order, diff_order) tol_history = np.empty(max_iter + 1) for i in range(max_iter + 1): lhs = whittaker_system.penalty * alpha_array - lhs[whittaker_system.main_diagonal_index] += weight_array baseline = whittaker_system.solve( - _shift_rows(lhs, diff_order, diff_order), weight_array * y, - overwrite_b=True, l_and_u=lower_upper_bands + y, weight_array, penalty=_shift_rows(lhs, diff_order, diff_order) ) new_weights, residual, exit_early = _weighting._aspls( y, baseline, asymmetric_coef, alternate_weighting @@ -908,7 +819,6 @@ def aspls(self, data, lam=1e5, diff_order=2, 
max_iter=100, tol=1e-3, return baseline, params - @_Algorithm._register(sort_keys=('weights',)) def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e-3, weights=None): """ @@ -948,9 +858,9 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -984,34 +894,11 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e Systems, Signals, and Devices, 2014, 1-5. """ - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) - if k is None: - k = np.std(y) / 10 - else: - k = _check_scalar_variable(k, variable_name='k') - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights = _weighting._psalsa(y, baseline, p, k, self._shape) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': WhittakerResult(whittaker_system, weight_array) - } - - return baseline, params + return super()._psalsa( + data, lam=lam, p=p, k=k, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights + ) - @_Algorithm._register(sort_keys=('weights',)) def derpsalsa(self, data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, tol=1e-3, weights=None, smooth_half_window=None, num_smooths=16, pad_kwargs=None, **kwargs): @@ -1062,9 +949,9 @@ def derpsalsa(self, data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, to Returns ------- - baseline : 
numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -1091,57 +978,12 @@ def derpsalsa(self, data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, to 51(10), 2061-2065. """ - if not 0 < p < 1: - raise ValueError('p must be between 0 and 1') - y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) - if k is None: - k = np.std(y) / 10 - else: - k = _check_scalar_variable(k, variable_name='k') - if smooth_half_window is None: - smooth_half_window = self._size // 200 - # could pad the data every iteration, but it is ~2-3 times slower and only affects - # the edges, so it's not worth it - self._deprecate_pad_kwargs(**kwargs) - pad_kwargs = pad_kwargs if pad_kwargs is not None else {} - y_smooth = pad_edges(y, smooth_half_window, **pad_kwargs, **kwargs) - if smooth_half_window > 0: - smooth_kernel = _mollifier_kernel(smooth_half_window) - for _ in range(num_smooths): - y_smooth = padded_convolve(y_smooth, smooth_kernel) - y_smooth = y_smooth[smooth_half_window:self._size + smooth_half_window] - - diff_y_1 = np.gradient(y_smooth) - diff_y_2 = np.gradient(diff_y_1) - # x.dot(x) is same as (x**2).sum() but faster - rms_diff_1 = np.sqrt(diff_y_1.dot(diff_y_1) / self._size) - rms_diff_2 = np.sqrt(diff_y_2.dot(diff_y_2) / self._size) - - diff_1_weights = np.exp(-((diff_y_1 / rms_diff_1)**2) / 2) - diff_2_weights = np.exp(-((diff_y_2 / rms_diff_2)**2) / 2) - partial_weights = diff_1_weights * diff_2_weights - - tol_history = np.empty(max_iter + 1) - for i in range(max_iter + 1): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights = _weighting._derpsalsa(y, baseline, p, k, self._shape, partial_weights) - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i] = calc_difference - if calc_difference < 
tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i + 1], - 'result': WhittakerResult(whittaker_system, weight_array) - } - - return baseline, params + return super()._derpsalsa( + data, lam=lam, p=p, k=k, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, smooth_half_window=smooth_half_window, num_smooths=num_smooths, + pad_kwargs=pad_kwargs, **kwargs + ) - @_Algorithm._register(sort_keys=('weights',)) def brpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=50, tol_2=1e-3, weights=None): """ @@ -1161,7 +1003,7 @@ def brpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 The max number of fit iterations. Default is 50. tol : float, optional The exit criteria. Default is 1e-3. - max_iter_2 : float, optional + max_iter_2 : int, optional The number of iterations for updating the proportion of data occupied by peaks. Default is 50. tol_2 : float, optional @@ -1173,9 +1015,9 @@ def brpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -1200,56 +1042,11 @@ def brpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 2022, 140, 250-257. 
""" - y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) - beta = 0.5 - j_max = 0 - baseline = y - baseline_weights = weight_array - tol_history = np.zeros((max_iter_2 + 2, max(max_iter, max_iter_2) + 1)) - # implementation note: weight_array must always be updated since otherwise when - # reentering the inner loop, new_baseline and baseline would be the same; instead, - # use baseline_weights to track which weights produced the output baseline - for i in range(max_iter_2 + 1): - for j in range(max_iter + 1): - new_baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights, exit_early = _weighting._brpls(y, new_baseline, beta) - if exit_early: - j -= 1 # reduce j so that output tol_history indexing is correct - tol_2 = np.inf # ensure it exits outer loop - break - # Paper used norm(old - new) / norm(new) rather than old in the denominator, - # but I use old in the denominator instead to be consistent with all other - # algorithms; does not make a major difference - calc_difference = relative_difference(baseline, new_baseline) - tol_history[i + 1, j] = calc_difference - if calc_difference < tol: - if i == 0 and j == 0: # for cases where tol == inf - baseline = new_baseline - break - baseline_weights = weight_array - weight_array = new_weights - baseline = new_baseline - j_max = max(j, j_max) - - weight_array = new_weights - weight_mean = weight_array.mean() - calc_difference_2 = abs(beta + weight_mean - 1) - tol_history[0, i] = calc_difference_2 - if calc_difference_2 < tol_2: - break - beta = 1 - weight_mean - - params = { - 'weights': baseline_weights, 'tol_history': tol_history[:i + 2, :max(i, j_max) + 1], - 'result': WhittakerResult(whittaker_system, weight_array) - } - - return baseline, params + return super()._brpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + max_iter_2=max_iter_2, tol_2=tol_2, weights=weights + ) - 
@_Algorithm._register(sort_keys=('weights',)) def lsrpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None, alternate_weighting=False): """ @@ -1281,9 +1078,9 @@ def lsrpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non Returns ------- - baseline : numpy.ndarray, shape (N,) + numpy.ndarray, shape (N,) The calculated baseline. - params : dict + dict A dictionary with the following items: * 'weights': numpy.ndarray, shape (N,) @@ -1317,29 +1114,10 @@ def lsrpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non penalized least squares, Applied Optics, 2019, 58, 3913-3920. """ - y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) - tol_history = np.empty(max_iter + 1) - for i in range(1, max_iter + 2): - baseline = whittaker_system.solve( - whittaker_system.add_diagonal(weight_array), weight_array * y, - overwrite_b=True - ) - new_weights, exit_early = _weighting._lsrpls(y, baseline, i, alternate_weighting) - if exit_early: - i -= 1 # reduce i so that output tol_history indexing is correct - break - calc_difference = relative_difference(weight_array, new_weights) - tol_history[i - 1] = calc_difference - if calc_difference < tol: - break - weight_array = new_weights - - params = { - 'weights': weight_array, 'tol_history': tol_history[:i], - 'result': WhittakerResult(whittaker_system, weight_array) - } - - return baseline, params + return super()._lsrpls( + data, lam=lam, diff_order=diff_order, max_iter=max_iter, tol=tol, + weights=weights, alternate_weighting=alternate_weighting + ) _whittaker_wrapper = _class_wrapper(_Whittaker) diff --git a/tests/base_tests.py b/tests/base_tests.py index ff9f327..d73d1c3 100644 --- a/tests/base_tests.py +++ b/tests/base_tests.py @@ -8,14 +8,14 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial, wraps -from inspect import signature +import inspect import numpy as np from numpy.testing import 
assert_allclose, assert_array_equal import pytest import pybaselines -from pybaselines import Baseline, Baseline2D +from pybaselines import Baseline, Baseline2D, _nd from pybaselines.two_d._algorithm_setup import _Algorithm2D @@ -129,7 +129,7 @@ def gaussian2d(x, z, height=1.0, center_x=0.0, center_z=0.0, sigma_x=1.0, sigma_ class _Poly2D(_Algorithm2D): """A class that provides a 2D polynomial method for testing purposes.""" - @_Algorithm2D._register(reshape_baseline=True) + @_Algorithm2D._handle_io def poly(self, data, poly_order=2, weights=None, max_cross=None): """ Computes a polynomial fit to the data. @@ -158,7 +158,7 @@ def poly(self, data, poly_order=2, weights=None, max_cross=None): baseline : numpy.ndarray, shape (M, N) The calculated baseline. dict - An empty dictionary so that `_Algorithm2D._register` does not throw an error. No + An empty dictionary so that `_Algorithm2D._handle_io` does not throw an error. No parameters are of concern for this testing class. """ @@ -463,7 +463,25 @@ def teardown_class(cls): cls.param_keys = None def test_ensure_wrapped(self): - """Ensures the class method was wrapped using _Algorithm._register to control inputs.""" + """Ensures the class method was wrapped using _Algorithm._handle_io to control inputs.""" + mod_name = self.module.__name__.split('.')[-1] + pls_module = mod_name in ('whittaker', 'spline') + if hasattr(_nd, mod_name) or pls_module: + nd_module = 'pls' if pls_module else mod_name + cls_name = '_PLSNDMixin' if pls_module else f'_{mod_name.capitalize()}NDMixin' + nd_mixin = getattr(getattr(_nd, nd_module), cls_name) + if mod_name == 'spline': + method_name = f'_{self.func_name.removeprefix("pspline_")}' + elif pls_module: + method_name = f'_{self.func_name}' + else: + method_name = self.func_name + if hasattr(nd_mixin, method_name): + # method should not be wrapped, only the ND method + assert not hasattr(self.class_func, '__wrapped__') + assert hasattr(getattr(nd_mixin, method_name), '__wrapped__') + return 
+ assert hasattr(self.class_func, '__wrapped__') @pytest.mark.parametrize('use_class', (True, False)) @@ -525,8 +543,8 @@ def test_functional_vs_class_parameters(self): the two signatures should be that the functional api has an `x_data` keyword. """ - class_parameters = signature(self.class_func).parameters - functional_parameters = signature( + class_parameters = inspect.signature(self.class_func).parameters + functional_parameters = inspect.signature( getattr(self.module, self.func_name) ).parameters @@ -833,7 +851,32 @@ def teardown_class(cls): cls.param_keys = None def test_ensure_wrapped(self): - """Ensures the class method was wrapped using _Algorithm._register to control inputs.""" + """Ensures the class method was wrapped using _Algorithm2D._handle_io to control inputs.""" + mod_name = self.module.__name__.split('.')[-1] + pls_module = mod_name in ('whittaker', 'spline') + if hasattr(_nd, mod_name) or pls_module: + nd_module = 'pls' if pls_module else mod_name + cls_name_nd = '_PLSNDMixin' if pls_module else f'_{mod_name.capitalize()}NDMixin' + nd_mixin = getattr(getattr(_nd, nd_module), cls_name_nd) + if mod_name == 'spline': + method_name = f'_{self.func_name.removeprefix("pspline_")}' + elif pls_module: + method_name = f'_{self.func_name}' + else: + method_name = self.func_name + if hasattr(nd_mixin, method_name): + assert hasattr(getattr(nd_mixin, method_name), '__wrapped__') + # some 2D methods are directly inherited without subclassing + cls_name_2d = f'_{mod_name.capitalize()}' + class_2d = getattr(self.module, cls_name_2d) + if ( + hasattr(class_2d, method_name) + and inspect.getmodule(getattr(class_2d, method_name)) is self.module + ): + # method should not be wrapped, only the ND method + assert not hasattr(self.class_func, '__wrapped__') + return + assert hasattr(self.class_func, '__wrapped__') @pytest.mark.parametrize('new_instance', (True, False)) @@ -1120,7 +1163,7 @@ def test_recreation(self): first_baseline, params = self.class_func(self.y, 
**self.kwargs) + kwargs = {'weights': params['weights'], **self.kwargs} - class_parameters = signature(self.class_func).parameters + class_parameters = inspect.signature(self.class_func).parameters if 'tol' in class_parameters: kwargs['tol'] = np.inf if 'tol_2' in class_parameters: diff --git a/tests/nd/__init__.py b/tests/nd/__init__.py new file mode 100644 index 0000000..fafea14 --- /dev/null +++ b/tests/nd/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines._nd.""" diff --git a/tests/nd/test_algorithm_setup.py b/tests/nd/test_algorithm_setup.py new file mode 100644 index 0000000..ddc4f92 --- /dev/null +++ b/tests/nd/test_algorithm_setup.py @@ -0,0 +1,544 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines._nd._algorithm_setup. + +@author: Donald Erb +Created on March 25, 2026 + +""" + +from inspect import signature + +import numpy as np +from numpy.testing import assert_allclose, assert_array_equal +import pytest + +from pybaselines import _algorithm_setup as _algorithm_setup1d +from pybaselines.utils import SortingWarning +from pybaselines.two_d import _algorithm_setup as _algorithm_setup2d +from pybaselines._nd import _algorithm_setup + +from ..base_tests import get_data, get_data2d + + +def test_handle_io_signature(): + """Ensures _handle_io has the same signature and defaults as _Algorithm and _Algorithm2D.""" + wrapper_parameters = signature(_algorithm_setup._handle_io).parameters + + algorithm_parameters = signature(_algorithm_setup1d._Algorithm._handle_io).parameters + algorithm2d_parameters = signature(_algorithm_setup2d._Algorithm2D._handle_io).parameters + + for alg_parameters in (algorithm_parameters, algorithm2d_parameters): + assert len(wrapper_parameters) == len(alg_parameters) + # ensure key and values for all parameters match for both signatures + for key in wrapper_parameters: + assert key in alg_parameters + wrapper_value = wrapper_parameters[key].default + algorithm_value = alg_parameters[key].default + # all the
defaults should just be booleans and empty tuples + assert wrapper_value == algorithm_value, f'Parameter mismatch for key "{key}"' + + +@pytest.mark.parametrize('assume_sorted', (True, False)) +@pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) +@pytest.mark.parametrize('change_order', (True, False)) +@pytest.mark.parametrize('list_input', (True, False)) +@pytest.mark.parametrize('skip_sorting', (True, False)) +def test_handle_io_1d(assume_sorted, output_dtype, change_order, list_input, skip_sorting): + """Ensures the _handle_io wrapper passes all tests expected by _Algorithm2D._handle_io.""" + x = np.arange(20) + y = 5 * x + sort_indices = slice(0, 10) + + class SubClass(_algorithm_setup1d._Algorithm): + # 'a' values will be sorted and 'b' values will be kept the same + @_algorithm_setup._handle_io(sort_keys=('a',)) + def func(self, data, *args, **kwargs): + """For checking sorting of output parameters.""" + expected_x = np.arange(20) + expected_input = 5 * expected_x + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_input, 1e-16, 1e-16) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-16, 1e-16) + + params = { + 'a': np.arange(len(x)), + 'b': np.arange(len(x)) + } + return 1 * data, params + + @_algorithm_setup._handle_io(sort_keys=('a',), skip_sorting=skip_sorting) + def func2(self, data, *args, **kwargs): + """For checking skip_sorting.""" + expected_x = np.arange(20) + expected_input = 5 * expected_x + if change_order and skip_sorting: + expected_input[sort_indices] = expected_input[sort_indices][::-1] + + assert_allclose(data, expected_input, 1e-14, 1e-14) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + + params = { + 'a': np.arange(len(x)), + 'b': np.arange(len(x)) + } + return 1 * data, params + + @_algorithm_setup._handle_io(require_unique=False) + def func3(self, data, *args, **kwargs): + """For ensuring require_unique works as intended.""" + return 1 * data, {} + 
@_algorithm_setup._handle_io(require_unique=True) + def func4(self, data, *args, **kwargs): + """For ensuring require_unique works as intended.""" + return 1 * data, {} + + if change_order: + x[sort_indices] = x[sort_indices][::-1] + y[sort_indices] = y[sort_indices][::-1] + expected_baseline = (1 * y).astype(output_dtype) + if output_dtype is None: + expected_dtype = y.dtype + else: + expected_dtype = expected_baseline.dtype + if list_input: + x = x.tolist() + y = y.tolist() + + expected_params = { + 'a': np.arange(len(x)), + 'b': np.arange(len(x)) + } + if change_order: + expected_params['a'][sort_indices] = expected_params['a'][sort_indices][::-1] + + if change_order and assume_sorted: + with pytest.warns(SortingWarning): + algorithm = SubClass( + x, assume_sorted=assume_sorted, output_dtype=output_dtype, check_finite=False + ) + else: + algorithm = SubClass( + x, assume_sorted=assume_sorted, output_dtype=output_dtype, check_finite=False + ) + output, output_params = algorithm.func(y) + + # baseline should always match y-order on the output; only sorted within the + # function + assert_allclose(output, expected_baseline, 1e-16, 1e-16) + assert isinstance(output, np.ndarray) + assert output.dtype == expected_dtype + for key, value in expected_params.items(): + assert_array_equal(value, output_params[key]) + + output2, output_params2 = algorithm.func2(y) + + # baseline should always match y-order on the output; only sorted within the + # function + assert_allclose(output2, expected_baseline, 1e-16, 1e-16) + assert isinstance(output2, np.ndarray) + for key, value in expected_params.items(): + assert_array_equal(value, output_params2[key]) + + assert not algorithm._validated_x # has not had a need to validate x yet + output = algorithm.func4(y) + assert algorithm._validated_x + + new_x = np.arange(20) + new_x[0] = new_x[1] + new_algorithm = SubClass(new_x) + # ensure calling a method that does not require unique x does not validate or raise an error + out = 
new_algorithm.func3(y) + assert not new_algorithm._validated_x + with pytest.raises(ValueError): + out = new_algorithm.func4(y) + + +@pytest.mark.parametrize('input_x', (True, False)) +def test_algorithm_handle_io_1d_2d(data_fixture, input_x): + """Ensures 2D data is allowed for 1D algorithms only when specified. + + Also checks _Algorithm setup when given 2D data as the first call. + + """ + _, expected_y = get_data() + + class SubClass(_algorithm_setup1d._Algorithm): + + @_algorithm_setup._handle_io + def func(self, data, *args, **kwargs): + """Errors if input is not 1D.""" + assert data.ndim == 1 + assert data.shape == expected_y.shape + return data, {} + + @_algorithm_setup._handle_io(ensure_dims=False) + def func2(self, data, *args, **kwargs): + """Allows 2D data.""" + assert data.ndim == 2 + assert data.shape[1:] == expected_y.shape + return data, {} + + x_, y_1d = data_fixture + x = None + if input_x: + x = x_ + initial_size = len(x) + initial_shape = (len(x),) + else: + initial_size = None + initial_shape = (None,) + + input_y = np.stack((y_1d, y_1d), axis=0) + assert input_y.shape == (2, *y_1d.shape) # sanity check for correct setup + + algorithm = SubClass(x) + assert algorithm._shape == initial_shape + assert algorithm._size == initial_size + + with pytest.raises(ValueError, match='input data must be a one dimensional'): + algorithm.func(input_y) + assert algorithm._shape == initial_shape + + # should run without issues and set stored shape correctly + output, _ = algorithm.func2(input_y) + assert algorithm._shape == y_1d.shape + assert algorithm._size == y_1d.size + assert output.shape == input_y.shape + + +@pytest.mark.parametrize('assume_sorted', (True, False)) +@pytest.mark.parametrize('output_dtype', (None, int, float, np.float64)) +@pytest.mark.parametrize('change_order', (True, False)) +@pytest.mark.parametrize('skip_sorting', (True, False)) +@pytest.mark.parametrize('list_input', (True, False)) +def test_handle_io_2d(assume_sorted, output_dtype, 
change_order, skip_sorting, list_input): + """Ensures the _handle_io wrapper passes all tests expected by _Algorithm2D._handle_io.""" + x, z, y = get_data2d() + + class SubClass(_algorithm_setup2d._Algorithm2D): + # 'a' values will be sorted and 'b' values will be kept the same + @_algorithm_setup._handle_io(sort_keys=('a', 'd'), reshape_keys=('c', 'd')) + def func(self, data, *args, **kwargs): + """For checking sorting and reshaping output parameters.""" + expected_x, expected_z, expected_y = get_data2d() + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + params = { + 'a': np.arange(data.size).reshape(data.shape), + 'b': np.arange(len(self.x)), + 'c': np.arange(data.size), + 'd': np.arange(data.size) + } + return 1 * data, params + + @_algorithm_setup._handle_io + def func2(self, data, *args, **kwargs): + """For checking reshaping output baseline.""" + expected_x, expected_z, expected_y = get_data2d() + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + return 1 * data.flatten(), {} + + @_algorithm_setup._handle_io + def func3(self, data, *args, **kwargs): + """For checking empty decorator.""" + expected_x, expected_z, expected_y = get_data2d() + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + return 1 * data, {} + + @_algorithm_setup._handle_io( + sort_keys=('a', 'd'), reshape_keys=('c', 'd'), 
skip_sorting=skip_sorting + ) + def func4(self, data, *args, **kwargs): + """For checking skip_sorting key.""" + expected_x, expected_z, expected_y = get_data2d() + if change_order and skip_sorting: + expected_y = expected_y[::-1, ::-1] + + assert isinstance(data, np.ndarray) + assert_allclose(data, expected_y, 1e-14, 1e-14) + assert isinstance(self.x, np.ndarray) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert isinstance(self.z, np.ndarray) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + params = { + 'a': np.arange(data.size).reshape(data.shape), + 'b': np.arange(len(self.x)), + 'c': np.arange(data.size), + 'd': np.arange(data.size) + } + + return 1 * data, params + + @_algorithm_setup._handle_io(require_unique=False) + def func5(self, data, *args, **kwargs): + """For ensuring require_unique works as intended.""" + return 1 * data, {} + + @_algorithm_setup._handle_io(require_unique=True) + def func6(self, data, *args, **kwargs): + """For ensuring require_unique works as intended.""" + return 1 * data, {} + + if change_order: + x = x[::-1] + z = z[::-1] + y = y[::-1, ::-1] + expected_params = { + 'a': np.arange(y.size).reshape(y.shape), + 'b': np.arange(len(x)), + 'c': np.arange(y.size).reshape(y.shape), + 'd': np.arange(y.size).reshape(y.shape), + } + expected_baseline = (1 * y).astype(output_dtype) + if output_dtype is None: + expected_dtype = y.dtype + else: + expected_dtype = expected_baseline.dtype + if list_input: + x = x.tolist() + z = z.tolist() + y = y.tolist() + + if change_order: + expected_params['a'] = expected_params['a'][::-1, ::-1] + expected_params['d'] = expected_params['d'][::-1, ::-1] + + if assume_sorted and change_order: + with pytest.warns(SortingWarning): + algorithm = SubClass( + x, z, check_finite=False, assume_sorted=assume_sorted, + output_dtype=output_dtype + ) + else: + algorithm = SubClass( + x, z, check_finite=False, assume_sorted=assume_sorted, output_dtype=output_dtype + ) + + output, output_params = 
algorithm.func(y) + + # baseline should always match y-order on the output; only sorted within the + # function + assert_allclose(output, expected_baseline, 1e-14, 1e-14) + assert isinstance(output, np.ndarray) + assert output.dtype == expected_dtype + for key, value in expected_params.items(): + assert_array_equal(value, output_params[key], err_msg=f'{key} failed') + + output2, _ = algorithm.func2(y) + assert_allclose(output2, expected_baseline, 1e-14, 1e-14) + assert isinstance(output2, np.ndarray) + assert output2.dtype == expected_dtype + + output3, _ = algorithm.func3(y) + assert_allclose(output3, expected_baseline, 1e-14, 1e-14) + assert isinstance(output3, np.ndarray) + assert output3.dtype == expected_dtype + + output4, output_params4 = algorithm.func4(y) + assert_allclose(output4, expected_baseline, 1e-14, 1e-14) + assert isinstance(output4, np.ndarray) + assert output4.dtype == expected_dtype + for key, value in expected_params.items(): + assert_array_equal(value, output_params4[key], err_msg=f'{key} failed') + + assert not algorithm._validated_x # has not had a need to validate x or z yet + assert not algorithm._validated_z + output = algorithm.func6(y) + assert algorithm._validated_x + assert algorithm._validated_z + + x[5] = x[4] + new_algorithm = SubClass(x) + # ensure calling a method that does not require unique x does not validate or raise an error + out = new_algorithm.func5(y) + assert not new_algorithm._validated_x + assert new_algorithm._validated_z # not given z + with pytest.raises(ValueError): + out = new_algorithm.func6(y) + + z[5] = z[4] + new_algorithm = SubClass(z_data=z) + # ensure calling a method that does not require unique z does not validate or raise an error + out = new_algorithm.func5(y) + assert new_algorithm._validated_x # not given x + assert not new_algorithm._validated_z + with pytest.raises(ValueError): + out = new_algorithm.func6(y) + + new_algorithm = SubClass(x, z) + out = new_algorithm.func5(y) + assert not 
new_algorithm._validated_x + assert not new_algorithm._validated_z + with pytest.raises(ValueError): + out = new_algorithm.func6(y) + + +def test_algorithm_handle_io_2d_no_data_fails(): + """Ensures an error is raised if the input data is None.""" + + class SubClass(_algorithm_setup2d._Algorithm2D): + + @_algorithm_setup._handle_io + def func(self, data, *args, **kwargs): + """For checking empty decorator.""" + return data, {} + + @_algorithm_setup._handle_io + def func2(self, data, *args, **kwargs): + """For checking closed decorator.""" + return data, {} + + with pytest.raises(TypeError, match='"data" cannot be None'): + SubClass().func() + with pytest.raises(TypeError, match='"data" cannot be None'): + SubClass().func2() + + +def test_algorithm_handle_io_2d_1d_fails(data_fixture): + """Ensures an error is raised if 1D data is used for 2D algorithms.""" + + class SubClass(_algorithm_setup2d._Algorithm2D): + + @_algorithm_setup._handle_io + def func(self, data, *args, **kwargs): + """For checking empty decorator.""" + return data, {} + + @_algorithm_setup._handle_io + def func2(self, data, *args, **kwargs): + """For checking closed decorator.""" + return data, {} + + x, y = data_fixture + algorithm = SubClass() + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y) + + # also test when given x values + algorithm = SubClass(None, x) # x would correspond to the columns in 2D y + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y) + + # and when y is 2D but only has one row + y_2d = np.atleast_2d(y) + algorithm = SubClass() + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d) + with pytest.raises(ValueError, match='input data must 
be a two dimensional'): + algorithm.func2(y_2d) + + algorithm = SubClass(None, x) # x would correspond to the columns in 2D y + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y_2d) + + # and when y is 2D but only has one column + y_2d_transposed = np.atleast_2d(y).T + algorithm = SubClass() + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d_transposed) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y_2d_transposed) + + algorithm = SubClass(x) # x now correspond to the rows in 2D y + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(y_2d_transposed) + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func2(y_2d_transposed) + + +@pytest.mark.parametrize('input_x', (True, False)) +@pytest.mark.parametrize('input_z', (True, False)) +def test_algorithm_handle_io_2d_3d(data_fixture2d, input_x, input_z): + """Ensures 3D data is allowed for 2D algorithms only when specified. + + Also checks _Algorithm2D setup when given 3D data as the first call. 
+ + """ + _, _, expected_y = get_data2d() + + class SubClass(_algorithm_setup2d._Algorithm2D): + + @_algorithm_setup._handle_io + def func(self, data, *args, **kwargs): + """Errors if input is not 2D.""" + assert data.ndim == 2 + assert data.shape == expected_y.shape + return data, {} + + @_algorithm_setup._handle_io(ensure_dims=False) + def func2(self, data, *args, **kwargs): + """Allows 3D data.""" + assert data.ndim == 3 + assert data.shape[1:] == expected_y.shape + return data, {} + + @_algorithm_setup._handle_io(ensure_dims=False) + def func3(self, data, *args, **kwargs): + """For checking reshaping output baseline for 3D input raveled on last axis.""" + assert data.ndim == 3 + assert data.shape[1:] == expected_y.shape + + return 1 * data.reshape(data.shape[0], -1), {} + + x_, z_, y_2d = data_fixture2d + x = None + z = None + initial_shape = [None, None] + if input_x: + x = x_ + initial_shape[0] = len(x) + if input_z: + z = z_ + initial_shape[1] = len(z) + initial_shape = tuple(initial_shape) + initial_size = None if None in initial_shape else y_2d.size + + input_y = np.stack((y_2d, y_2d), axis=0) + assert input_y.shape == (2, *y_2d.shape) # sanity check for correct setup + + algorithm = SubClass(x, z) + assert algorithm._shape == initial_shape + assert algorithm._size == initial_size + + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(input_y) + assert algorithm._shape == initial_shape + + # should run without issues and set stored shape correctly + output, _ = algorithm.func2(input_y) + assert algorithm._shape == y_2d.shape + assert algorithm._size == y_2d.size + assert output.shape == input_y.shape + + output2, _ = algorithm.func3(input_y) + assert output2.shape == input_y.shape diff --git a/tests/nd/test_pls.py b/tests/nd/test_pls.py new file mode 100644 index 0000000..d546c90 --- /dev/null +++ b/tests/nd/test_pls.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +"""Tests for pybaselines._nd.pls. 
+ +@author: Donald Erb +Created on March 30, 2026 + +""" + +import inspect + +import pytest + +from pybaselines._nd import pls + + +def get_module_methods(klass): + """Gets all methods of a class defined in the same module as the class.""" + methods = [] + class_module = inspect.getmodule(klass) + for (method_name, method) in inspect.getmembers(klass): + if ( + inspect.isfunction(method) + and inspect.getmodule(method) is class_module + ): + methods.append(method_name) + + return methods + + +@pytest.mark.parametrize('method', get_module_methods(pls._PLSNDMixin)) +def test_spline_degree_none(method): + """Ensures the default `spline_degree` is None for all PLS methods to ensure logic flow. + + Penalized least squares methods should have default `spline_degree=None` to + do Whittaker smoothing as the default behavior. + + """ + params = inspect.signature(getattr(pls._PLSNDMixin, method)).parameters + assert params['spline_degree'].default is None diff --git a/tests/test_algorithm_setup.py b/tests/test_algorithm_setup.py index 333228f..e0613fd 100644 --- a/tests/test_algorithm_setup.py +++ b/tests/test_algorithm_setup.py @@ -11,7 +11,10 @@ import pytest from pybaselines import Baseline, _algorithm_setup, optimizers, polynomial, whittaker +from pybaselines._banded_utils import PenalizedSystem from pybaselines._compat import dia_object +from pybaselines._spline_utils import PSpline +from pybaselines.results import PSplineResult, WhittakerResult from pybaselines.utils import ParameterWarning, SortingWarning, estimate_window from .base_tests import ensure_deprecation, get_data @@ -791,9 +794,9 @@ def test_algorithm_return_results(assume_sorted, output_dtype, change_order): @pytest.mark.parametrize('change_order', (True, False)) @pytest.mark.parametrize('list_input', (True, False)) @pytest.mark.parametrize('skip_sorting', (True, False)) -def test_algorithm_register(assume_sorted, output_dtype, change_order, list_input, skip_sorting): +def 
test_algorithm_handle_io(assume_sorted, output_dtype, change_order, list_input, skip_sorting): """ - Ensures the _register wrapper method returns the correctly sorted outputs. + Ensures the _handle_io wrapper method returns the correctly sorted outputs. The input y-values within the wrapped function should be correctly sorted if `assume_sorted` is False, while the output baseline should always match @@ -808,7 +811,7 @@ def test_algorithm_register(assume_sorted, output_dtype, change_order, list_inpu class SubClass(_algorithm_setup._Algorithm): # 'a' values will be sorted and 'b' values will be kept the same - @_algorithm_setup._Algorithm._register(sort_keys=('a',)) + @_algorithm_setup._Algorithm._handle_io(sort_keys=('a',)) def func(self, data, *args, **kwargs): """For checking sorting of output parameters.""" expected_x = np.arange(20) @@ -825,7 +828,7 @@ def func(self, data, *args, **kwargs): } return 1 * data, params - @_algorithm_setup._Algorithm._register(sort_keys=('a',), skip_sorting=skip_sorting) + @_algorithm_setup._Algorithm._handle_io(sort_keys=('a',), skip_sorting=skip_sorting) def func2(self, data, *args, **kwargs): """For checking skip_sorting.""" expected_x = np.arange(20) @@ -842,14 +845,14 @@ def func2(self, data, *args, **kwargs): } return 1 * data, params - @_algorithm_setup._Algorithm._register(require_unique_x=False) + @_algorithm_setup._Algorithm._handle_io(require_unique=False) def func3(self, data, *args, **kwargs): - """For ensuring require_unique_x works as intedended.""" + """For ensuring require_unique works as intedended.""" return 1 * data, {} - @_algorithm_setup._Algorithm._register(require_unique_x=True) + @_algorithm_setup._Algorithm._handle_io(require_unique=True) def func4(self, data, *args, **kwargs): - """For ensuring require_unique_x works as intedended.""" + """For ensuring require_unique works as intedended.""" return 1 * data, {} if change_order: @@ -913,6 +916,99 @@ def func4(self, data, *args, **kwargs): out = 
new_algorithm.func4(y) +@pytest.mark.parametrize('input_x', (True, False)) +@pytest.mark.parametrize('change_order', (True, False)) +def test_algorithm_handle_io_2d(data_fixture, input_x, change_order): + """Ensures 2D data is allowed for 1D algorithms only when specified. + + Also checks _Algorithm setup when given 2D data as the first call. + + """ + x_vals, input_y_1d = get_data() + if input_x: + expected_x = x_vals + else: + expected_x = np.linspace(-1, 1, input_y_1d.size) + stacks = 2 + expected_y = np.repeat(input_y_1d[None, :], stacks, axis=0) + sort_indices = slice(0, 10) + + class SubClass(_algorithm_setup._Algorithm): + + @_algorithm_setup._Algorithm._handle_io + def func(self, data, *args, **kwargs): + """Errors if input is not 1D.""" + assert data.ndim == 1 + assert data.shape == expected_y.shape + return data * 1, {} + + @_algorithm_setup._Algorithm._handle_io(ensure_dims=False) + def func2(self, data, *args, **kwargs): + """Allows 2D data.""" + assert data.ndim == 2 + assert data.shape == expected_y.shape + + expected = expected_y.copy() + if change_order and not input_x: + expected[:, sort_indices] = expected[:, sort_indices][:, ::-1] + + assert_allclose(data, expected, 1e-14, 1e-14) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + + return data * 1, {} + + @_algorithm_setup._Algorithm._handle_io(ensure_dims=False, skip_sorting=True) + def func3(self, data, *args, **kwargs): + """Allows 2D data and skips sorting.""" + assert data.ndim == 2 + assert data.shape == expected_y.shape + + expected = expected_y.copy() + if change_order: + expected[:, sort_indices] = expected[:, sort_indices][:, ::-1] + + assert_allclose(data, expected, 1e-14, 1e-14) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + + return data * 1, {} + + x_, y_1d = data_fixture + if change_order: + x_[sort_indices] = x_[sort_indices][::-1] + y_1d[sort_indices] = y_1d[sort_indices][::-1] + + x = None + if input_x: + x = x_ + initial_size = len(x) + initial_shape = (len(x),) + 
else: + initial_size = None + initial_shape = (None,) + + input_y = np.repeat(y_1d[None, :], stacks, axis=0) + assert input_y.shape == (stacks, *y_1d.shape) # sanity check for correct setup + + algorithm = SubClass(x) + assert algorithm._shape == initial_shape + assert algorithm._size == initial_size + + with pytest.raises(ValueError, match='input data must be a one dimensional'): + algorithm.func(input_y) + assert algorithm._shape == initial_shape + + # should run without issues and set stored shape correctly + output, _ = algorithm.func2(input_y) + assert algorithm._shape == y_1d.shape + assert algorithm._size == y_1d.size + assert output.shape == input_y.shape + assert_allclose(output, input_y, 1e-14, 1e-14) + + output2, _ = algorithm.func3(input_y) + assert output2.shape == input_y.shape + assert_allclose(output2, input_y, 1e-14, 1e-14) + + def test_class_wrapper(): """Ensures the class wrapper function correctly processes inputs for _Algorithm classes.""" default_b = 2 @@ -1132,3 +1228,119 @@ def test_wrong_banded_solver_fails(algorithm, banded_solver): """Ensures only valid integers between 0 and 4 are allowed as banded_solver inputs.""" with pytest.raises(ValueError): algorithm.banded_solver = banded_solver + + +@pytest.mark.parametrize('diff_order', (1, 2, 3)) +@pytest.mark.parametrize('lam', (1, 20)) +@pytest.mark.parametrize('allow_lower', (True, False)) +@pytest.mark.parametrize('reverse_diags', (True, False)) +def test_setup_pls_whittaker_diff_matrix(small_data, algorithm, lam, diff_order, + allow_lower, reverse_diags): + """Ensures output difference matrix diagonal data is in desired format for _setup_pls.""" + if reverse_diags and allow_lower: + # this configuration is never used + return + + # intentionally do not input spline_degree here to ensure default behavior is + # spline_degree=None -> Whittaker smoothing + _, _, whittaker_system, result_class = algorithm._setup_pls( + small_data, lam=lam, diff_order=diff_order, allow_lower=allow_lower, + 
reverse_diags=reverse_diags + ) + _, _, expected_system = algorithm._setup_whittaker( + small_data, lam=lam, diff_order=diff_order, allow_lower=allow_lower, + reverse_diags=reverse_diags + ) + + numpy_diff = np.diff(np.eye(small_data.shape[0]), diff_order, 0) + desired_diagonals = dia_object(lam * (numpy_diff.T @ numpy_diff)).data[::-1] + if allow_lower and not whittaker_system.using_penta: + # only include the lower diagonals + desired_diagonals = desired_diagonals[diff_order:] + + # the diagonals should be in the opposite order as the diagonal matrix's data + # if reverse_diags is False + if reverse_diags or (whittaker_system.using_penta and reverse_diags is not False): + desired_diagonals = desired_diagonals[::-1] + + assert_allclose(whittaker_system.penalty, desired_diagonals, 1e-10) + assert_allclose(whittaker_system.penalty, expected_system.penalty, 1e-10) + assert isinstance(whittaker_system, PenalizedSystem) + assert result_class is WhittakerResult + + +@pytest.mark.parametrize('spline_degree', (None, 3)) +@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) +def test_setup_pls_weights(small_data, algorithm, spline_degree, weight_enum): + """Ensures output weight array is correct when using _setup_pls.""" + if weight_enum == 0: + # no weights specified + weights = None + desired_weights = np.ones_like(small_data) + elif weight_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data) + desired_weights = weights.copy() + elif weight_enum == 2: + # different weights for all points + weights = np.arange(small_data.shape[0]) + desired_weights = np.arange(small_data.shape[0]) + elif weight_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data.shape[0]).tolist() + desired_weights = np.arange(small_data.shape[0]) + + _, weight_array, penalized_system, result_class = algorithm._setup_pls( + small_data, lam=1, diff_order=2, weights=weights, spline_degree=spline_degree + ) + + assert 
isinstance(weight_array, np.ndarray) + assert_array_equal(weight_array, desired_weights) + assert weight_array.dtype == float + assert isinstance(penalized_system, PenalizedSystem if spline_degree is None else PSpline) + assert result_class is WhittakerResult if spline_degree is None else PSplineResult + + +@pytest.mark.parametrize('num_knots', (5, 15, 100)) +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4, None)) +def test_setup_pls_spline_basis(small_data, num_knots, spline_degree): + """Ensures the spline basis function is correctly created through _setup_pls.""" + fitter = _algorithm_setup._Algorithm(np.arange(len(small_data))) + fitter._setup_pls( + small_data, weights=None, spline_degree=spline_degree, num_knots=num_knots, + ) + if spline_degree is None: + assert fitter._spline_basis is None + else: + assert fitter._spline_basis.basis.shape[0] == len(small_data) + assert fitter._spline_basis.basis.shape[1] == num_knots + spline_degree - 1 + + +@pytest.mark.parametrize('lam', (1, 20)) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4)) +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4)) +@pytest.mark.parametrize('num_knots', (5, 50, 100)) +def test_setup_pls_pspline_diff_matrix(small_data, lam, diff_order, spline_degree, num_knots): + """Ensures output difference matrix diagonal data is in desired format for setup_pls.""" + fitter = _algorithm_setup._Algorithm(np.arange(len(small_data))) + _, _, pspline, result_class = fitter._setup_pls( + small_data, weights=None, spline_degree=spline_degree, num_knots=num_knots, + diff_order=diff_order, lam=lam + ) + + num_bases = num_knots + spline_degree - 1 + numpy_diff = np.diff(np.eye(num_bases), diff_order, axis=0) + desired_diagonals = lam * dia_object(numpy_diff.T @ numpy_diff).data[::-1][diff_order:] + if diff_order < spline_degree: + padding = np.zeros((spline_degree - diff_order, desired_diagonals.shape[1])) + desired_diagonals = np.concatenate((desired_diagonals, padding)) + + 
assert_allclose(pspline.penalty, desired_diagonals, 1e-10, 1e-12) + assert isinstance(pspline, PSpline) + assert result_class is PSplineResult + + _, _, expected_system = fitter._setup_spline( + small_data, weights=None, spline_degree=spline_degree, num_knots=num_knots, + diff_order=diff_order, lam=lam + ) + assert_allclose(pspline.penalty, expected_system.penalty, 1e-10, 1e-12) diff --git a/tests/test_banded_utils.py b/tests/test_banded_utils.py index 383d2b1..7d8e47e 100644 --- a/tests/test_banded_utils.py +++ b/tests/test_banded_utils.py @@ -471,6 +471,8 @@ def check_penalized_system(penalized_system, expected_penalty, lam, diff_order, ) assert penalized_system._num_bases == data_size + assert penalized_system.shape == (data_size,) + assert penalized_system.tot_bases == data_size assert_array_equal(penalized_system.original_diagonals, expected_penalty) assert_array_equal(penalized_system.penalty, expected_padded_penalty) assert penalized_system.reversed == reverse_diags @@ -532,6 +534,8 @@ def test_penalized_system_setup(diff_order, allow_lower, reverse_diags): data_size, lam=1, diff_order=0, allow_penta=False ) assert initial_system._num_bases == data_size + assert initial_system.shape == (data_size,) + assert initial_system.tot_bases == data_size for padding in range(-1, 3): penalized_system = _banded_utils.PenalizedSystem( @@ -686,12 +690,55 @@ def test_penalized_system_solve(data_fixture, diff_order, allow_lower, allow_pen ).tocsr() expected_solution = spsolve(diags(weights, format='csr') + sparse_penalty, weights * y) + penalized_system = _banded_utils.PenalizedSystem( + data_size, lam=lam, diff_order=diff_order, allow_lower=allow_lower, + reverse_diags=False, allow_penta=allow_penta + ) + output = penalized_system.solve(y, weights) + assert_allclose(output, expected_solution, rtol=1e-6, atol=1e-10) + + # need to reset diagonal for the next test since it directly adds weights + # to ihe input penalty's existing diagonal + penalized_system.add_diagonal(0) + 
+ # also test inputting a penalty; penalty may potentially overwritten by the + # solver here, so needs to be last check + output2 = penalized_system.solve(y, weights, penalty=penalized_system.penalty) + assert_allclose(output2, expected_solution, rtol=1e-6, atol=1e-10) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3)) +@pytest.mark.parametrize('allow_lower', (True, False)) +@pytest.mark.parametrize('allow_penta', (True, False)) +def test_penalized_system_direct_solve(data_fixture, diff_order, allow_lower, allow_penta): + """ + Tests the direct_solve method of a PenalizedSystem object. + + Solves the equation ``(W + lam * D.T @ D) x = W @ y``, where `W` is the weight + matrix, and ``D.T @ D`` is the penalty. + + """ + x, y = data_fixture + data_size = len(y) + weights = np.random.default_rng(0).normal(0.8, 0.05, x.size) + weights = np.clip(weights, 0, 1).astype(float) + + lam = {1: 1e2, 2: 1e5, 3: 1e8}[diff_order] + expected_penalty = _banded_utils.diff_penalty_diagonals( + data_size, diff_order=diff_order, lower_only=False + ) + sparse_penalty = dia_object( + (lam * expected_penalty, np.arange(diff_order, -(diff_order + 1), -1)), + shape=(data_size, data_size) + ).tocsr() + expected_solution = spsolve(diags(weights, format='csr') + sparse_penalty, weights * y) + penalized_system = _banded_utils.PenalizedSystem( data_size, lam=lam, diff_order=diff_order, allow_lower=allow_lower, reverse_diags=False, allow_penta=allow_penta ) penalized_system.add_diagonal(weights) - output = penalized_system.solve(penalized_system.penalty, weights * y) + output = penalized_system.direct_solve(penalized_system.penalty, weights * y) assert_allclose(output, expected_solution, rtol=1e-6, atol=1e-10) @@ -720,7 +767,7 @@ def test_whittaker_lam_extremes(data_fixture, diff_order, allow_lower, allow_pen data_size, lam=1e13, diff_order=diff_order, allow_lower=allow_lower, allow_penta=allow_penta ) - output = penalized_system.solve(penalized_system.add_diagonal(1.), y) + output = 
penalized_system.solve(y, weights=1) polynomial_fit = np.polynomial.Polynomial.fit(x, y, deg=diff_order - 1)(x) # limited by how close to infinity lam can get before it causes numerical instability, @@ -730,11 +777,8 @@ def test_whittaker_lam_extremes(data_fixture, diff_order, allow_lower, allow_pen assert_allclose(output, polynomial_fit, rtol=rtol, atol=1e-10) # for lam ~ 0, should just approximate the input - penalized_system2 = _banded_utils.PenalizedSystem( - data_size, lam=1e-8, diff_order=diff_order, allow_lower=allow_lower, - reverse_diags=None, allow_penta=allow_penta - ) - output2 = penalized_system.solve(penalized_system2.add_diagonal(1.), y) + penalized_system.update_lam(1e-8) + output2 = penalized_system.solve(y, weights=1) assert_allclose(output2, y, rtol=1e-8, atol=1e-10) diff --git a/tests/test_results.py b/tests/test_results.py index d0c647b..d753915 100644 --- a/tests/test_results.py +++ b/tests/test_results.py @@ -7,7 +7,7 @@ """ import numpy as np -from numpy.testing import assert_allclose +from numpy.testing import assert_allclose, assert_array_equal import pytest from scipy.sparse import kron from scipy.sparse.linalg import factorized @@ -20,6 +20,24 @@ from .base_tests import get_2dspline_inputs +@pytest.mark.parametrize('shape', (100, (100, 11))) +@pytest.mark.parametrize('use_generator', (True, False)) +def test_rademacher(shape, use_generator): + """Ensures _rademacher returns only -1 or 1.""" + seed = 9 + if use_generator: + rng = np.random.default_rng(seed) + else: + rng = seed + + output = results._rademacher(shape, rng) + assert np.all((output == -1.) 
| (output == 1.)) + # call order matters, so create new generator within accuracy test + assert_array_equal( + output, np.random.default_rng(seed).choice([-1., 1.], size=shape) + ) + + @pytest.mark.parametrize('diff_order', (1, 2)) @pytest.mark.parametrize('allow_lower', (True, False)) @pytest.mark.parametrize('allow_penta', (True, False)) @@ -645,21 +663,6 @@ def test_whittaker_result_two_d_no_weights(data_fixture2d, num_eigens): assert_allclose(result_obj._weights, np.ones(expected_shape), rtol=1e-16, atol=0) -def test_whittaker_result_two_d_lhs_penalty_raises(data_fixture2d): - """Ensures an exception is raised if both `lhs` and `penalty` are supplied.""" - x, z, y = data_fixture2d - weights = np.random.default_rng(0).normal(0.8, 0.05, y.shape) - weights = np.clip(weights, 0, 1, dtype=float) - - penalized_system = WhittakerSystem2D(y.shape) - - with pytest.raises(ValueError, match='both `lhs` and `penalty` cannot'): - results.WhittakerResult2D( - penalized_system, weights, lhs=penalized_system.penalty, - penalty=penalized_system.penalty - ) - - @pytest.mark.parametrize('shape', ((30, 21), (15, 40))) @pytest.mark.parametrize('diff_order', (1, 2, 3, (1, 2))) @pytest.mark.parametrize('large_lam', (True, False)) diff --git a/tests/test_spline.py b/tests/test_spline.py index ef4b021..69342a0 100644 --- a/tests/test_spline.py +++ b/tests/test_spline.py @@ -13,6 +13,7 @@ import pytest from pybaselines import _spline_utils, classification, morphological, spline, Baseline +from pybaselines.results import PSplineResult from .base_tests import BaseTester, InputWeightsMixin, RecreationMixin, ensure_deprecation @@ -63,6 +64,28 @@ def test_numba_implementation(self): assert_allclose(numba_output, normal_output, rtol=1e-10, atol=1e-10) + def test_result_obj(self): + """Ensures the `result` item in the output params is a PSplineResult.""" + _, params = self.class_func(self.y, **self.kwargs) + # don't use isinstance since don't want to allow subclasses + assert 
type(params['result']) is PSplineResult + + def test_check_spline_degree(self): + """ + Ensures an exception is raised if the input spline_degree is None. + + For methods that are implemented as ND penalized least square mixins, the same + code path is used for both penalized spline and Whittaker smoothing, whose logic + is controlled by the input `spline_degree`, so need to ensure that the input for + penalized spline methods is not None. + + For methods that are not PLS mixins, a TypeError should still occur during basis + creation. + + """ + with pytest.raises(TypeError): + self.class_func(self.y, spline_degree=None) + class IterativeSplineTester(SplineTester, InputWeightsMixin, RecreationMixin): """Base testing class for iterative spline functions.""" diff --git a/tests/test_spline_utils.py b/tests/test_spline_utils.py index ada2fe0..23011d4 100644 --- a/tests/test_spline_utils.py +++ b/tests/test_spline_utils.py @@ -254,7 +254,7 @@ def test_pspline_solve(data_fixture, num_knots, spline_degree, diff_order, lower spline_basis, lam=1, diff_order=diff_order, allow_lower=lower_only ) assert_allclose( - pspline.solve_pspline(y, weights=weights, penalty=penalty), + pspline.solve(y, weights=weights, penalty=penalty), expected_spline, 1e-10, 1e-12 ) assert_allclose( @@ -307,6 +307,49 @@ def test_pspline_factorize_solve(data_fixture, num_knots, spline_degree, diff_or output = pspline.factorized_solve(output_factorization, rhs) assert_allclose(output, expected_coeffs, rtol=1e-10, atol=1e-12) + # going through factorized_solve should not set coefficients + assert pspline.coef is None + + +@pytest.mark.parametrize('num_knots', (20, 101)) +@pytest.mark.parametrize('spline_degree', (0, 1, 2, 3, 4, 5)) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4)) +@pytest.mark.parametrize('lower_only', (True, False)) +def test_pspline_direct_solve(data_fixture, num_knots, spline_degree, diff_order, lower_only): + """Tests the direct_solve method of a PSpline object.""" + x, y = 
data_fixture + # ensure x and y are floats + x = x.astype(float) + y = y.astype(float) + weights = np.random.default_rng(0).normal(0.8, 0.05, x.size) + weights = np.clip(weights, 0, 1).astype(float) + + knots = _spline_utils._spline_knots(x, num_knots, spline_degree, True) + basis = _spline_utils._spline_basis(x, knots, spline_degree) + num_bases = basis.shape[1] + penalty_matrix = _banded_utils.diff_penalty_matrix(num_bases, diff_order=diff_order) + + lhs_sparse = basis.T @ diags(weights, format='csr') @ basis + penalty_matrix + rhs = basis.T @ (weights * y) + expected_coeffs = spsolve(lhs_sparse, rhs) + + lhs_banded = _banded_utils._sparse_to_banded(lhs_sparse)[0] + if lower_only: + lhs_banded = lhs_banded[len(lhs_banded) // 2:] + + spline_basis = _spline_utils.SplineBasis( + x, num_knots=num_knots, spline_degree=spline_degree + ) + pspline = _spline_utils.PSpline( + spline_basis, lam=1, diff_order=diff_order, allow_lower=lower_only + ) + + output = pspline.direct_solve(lhs_banded, rhs) + assert_allclose(output, expected_coeffs, rtol=1e-10, atol=1e-12) + + # going through direct_solve should not set coefficients + assert pspline.coef is None + @pytest.mark.parametrize('num_knots', (20, 101)) @pytest.mark.parametrize('spline_degree', (0, 1, 2, 3, 4, 5)) @@ -370,6 +413,8 @@ def check_penalized_spline(penalized_system, expected_penalty, lam, diff_order, expected_penalty, padding, lower_only=allow_lower ) + num_bases = num_knots + spline_degree - 1 + assert_array_equal(penalized_system.original_diagonals, expected_penalty) assert_array_equal(penalized_system.penalty, expected_padded_penalty) assert penalized_system.reversed == reverse_diags @@ -379,8 +424,10 @@ def check_penalized_spline(penalized_system, expected_penalty, lam, diff_order, assert penalized_system.basis.num_knots == num_knots assert penalized_system.basis.spline_degree == spline_degree assert penalized_system.coef is None # None since the solve method has not been called - assert 
penalized_system.basis.basis.shape == (data_size, num_knots + spline_degree - 1) - assert penalized_system.basis._num_bases == num_knots + spline_degree - 1 + assert penalized_system.basis.basis.shape == (data_size, num_bases) + assert penalized_system.basis._num_bases == num_bases + assert penalized_system.shape == (data_size,) + assert penalized_system.tot_bases == num_bases assert penalized_system.basis.knots.shape == (num_knots + 2 * spline_degree,) assert isinstance(penalized_system.basis.x, np.ndarray) assert penalized_system.basis._x_len == len(penalized_system.basis.x) @@ -439,23 +486,24 @@ def test_pspline_setup(data_fixture, num_knots, spline_degree, diff_order, spline_basis, lam=lam, diff_order=diff_order, allow_lower=allow_lower, reverse_diags=reverse_diags ) - else: - pspline = _spline_utils.PSpline( - spline_basis, lam=lam, diff_order=diff_order, allow_lower=allow_lower, - reverse_diags=reverse_diags - ) - check_penalized_spline( - pspline, expected_penalty, lam, diff_order, allow_lower, - bool(reverse_diags), spline_degree, num_knots, data_size - ) - # also check that the reset_diagonal method performs similarly - pspline.reset_penalty_diagonals( - lam=lam, diff_order=diff_order, allow_lower=allow_lower, reverse_diags=reverse_diags - ) - check_penalized_spline( - pspline, expected_penalty, lam, diff_order, allow_lower, - bool(reverse_diags), spline_degree, num_knots, data_size - ) + return + + pspline = _spline_utils.PSpline( + spline_basis, lam=lam, diff_order=diff_order, allow_lower=allow_lower, + reverse_diags=reverse_diags + ) + check_penalized_spline( + pspline, expected_penalty, lam, diff_order, allow_lower, + bool(reverse_diags), spline_degree, num_knots, data_size + ) + # also check that the reset_diagonal method performs similarly + pspline.reset_penalty_diagonals( + lam=lam, diff_order=diff_order, allow_lower=allow_lower, reverse_diags=reverse_diags + ) + check_penalized_spline( + pspline, expected_penalty, lam, diff_order, allow_lower, + 
bool(reverse_diags), spline_degree, num_knots, data_size + ) def test_spline_basis_non_finite_fails(): @@ -495,7 +543,7 @@ def test_pspline_tck(data_fixture, num_knots, spline_degree, diff_order, lam): x, y = data_fixture basis = _spline_utils.SplineBasis(x, num_knots=num_knots, spline_degree=spline_degree) pspline = _spline_utils.PSpline(basis, diff_order=diff_order, lam=lam) - fit_spline = pspline.solve_pspline(y, weights=np.ones_like(y)) + fit_spline = pspline.solve(y, weights=np.ones_like(y)) # ensure tck is the knots, coefficients, and spline degree assert len(pspline.tck) == 3 @@ -527,7 +575,7 @@ def test_pspline_tck_readonly(data_fixture): x, y = data_fixture basis = _spline_utils.SplineBasis(x) pspline = _spline_utils.PSpline(basis) - pspline.solve_pspline(y, np.ones_like(y)) + pspline.solve(y, np.ones_like(y)) with pytest.raises(AttributeError): pspline.tck = (1, 2, 3) @@ -656,15 +704,9 @@ def test_compare_to_whittaker(data_fixture, lam, diff_order): weights = np.random.default_rng(0).normal(0.8, 0.05, len(y)) weights = np.clip(weights, 0, 1).astype(float, copy=False) - main_diag_idx = whittaker_system.main_diagonal_index - main_diagonal = whittaker_system.penalty[main_diag_idx] - whittaker_system.penalty[main_diag_idx] = main_diagonal + weights - whittaker_output = whittaker_system.solve( - whittaker_system.penalty, weights * y, overwrite_b=True - ) + whittaker_output = whittaker_system.solve(y, weights=weights) - spline_output = pspline.solve_pspline(y, weights=weights) - whittaker_output = whittaker_system.solve(whittaker_system.penalty, weights.ravel() * y.ravel()) + spline_output = pspline.solve(y, weights=weights) assert_allclose(spline_output, whittaker_output, rtol=1e-12, atol=1e-12) @@ -747,7 +789,7 @@ def test_pspline_lam_extremes(data_fixture, diff_order, allow_lower, spline_degr pspline = _spline_utils.PSpline( spline_basis, lam=1e13, diff_order=diff_order, allow_lower=allow_lower ) - output = pspline.solve_pspline(y, weights) + output = 
pspline.solve(y, weights) polynomial_fit = np.polynomial.Polynomial.fit(x, y, deg=diff_order - 1)(x) # limited by how close to infinity lam can get before it causes numerical instability, @@ -761,7 +803,7 @@ def test_pspline_lam_extremes(data_fixture, diff_order, allow_lower, spline_degr pspline2 = _spline_utils.PSpline( spline_basis, lam=1e-10, diff_order=diff_order, allow_lower=allow_lower ) - output2 = pspline2.solve_pspline(y, weights) + output2 = pspline2.solve(y, weights) # cannot use interpolation from SciPy since the knot arrangement is going to be different expected_coeffs = spsolve(spline_basis.basis.T @ spline_basis.basis, spline_basis.basis.T @ y) expected = spline_basis.basis @ expected_coeffs diff --git a/tests/test_utils.py b/tests/test_utils.py index 3910b1a..8d349cb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -10,6 +10,7 @@ from numpy.testing import assert_allclose, assert_array_equal import pytest from scipy.interpolate import BSpline +from scipy.ndimage import grey_dilation, grey_erosion, grey_opening from scipy.sparse.linalg import spsolve from pybaselines import _banded_utils, _spline_utils, utils @@ -839,6 +840,67 @@ def test_estimate_window(small_data2d, two_d): assert_allclose(output_opt, output, rtol=1e-12, atol=0) +@pytest.mark.parametrize('two_d', (True, False)) +def test_make_window(small_data2d, two_d): + """Ensures _make_window has the correct outputs for the dimensions of the input.""" + data = small_data2d + if not two_d: + data = data.flatten() + + # _make_window is always called after _setup_morphological, so + # the half-window will always be set correctly as an int in 1D and + # np.array([int, int]) for 2D + half_window = 5 + if two_d: + half_window = np.array([half_window, half_window]) + expected_window = 2 * half_window + 1 + else: + expected_window = [2 * half_window + 1] + + output = utils._make_window(data, half_window) + + assert len(output) == data.ndim + assert isinstance(output, np.ndarray if two_d 
else list) + assert_array_equal(output, expected_window) + + # can also check that ints are correctly converted to 2d, even though it + # shouldn't be used this way internally + if two_d: + half_window_int = 5 + expected_window_int = [2 * half_window_int + 1] * 2 + output = utils._make_window(data, half_window_int) + assert isinstance(output, list) + assert_array_equal(output, expected_window_int) + + +@pytest.mark.parametrize('two_d', (True, False)) +@pytest.mark.parametrize('input_opening', (True, False)) +def test_average_opening(small_data2d, two_d, input_opening): + """Ensures _average_opening has the correct outputs for the dimensions of the input.""" + data = small_data2d + if not two_d: + data = data.flatten() + + # _average_opening is always called after _setup_morphological, so + # the half-window will always be set correctly as an int in 1D and + # np.array([int, int]) for 2D + half_window = 5 + if two_d: + half_window = np.array([half_window, half_window]) + window = 2 * half_window + 1 + else: + window = [2 * half_window + 1] + opening_ = grey_opening(data, window) + + expected_output = 0.5 * (grey_dilation(opening_, window) + grey_erosion(opening_, window)) + + output = utils._avg_opening(data, half_window, opening=opening_ if input_opening else None) + + assert output.shape == data.shape + assert isinstance(output, np.ndarray) + assert_allclose(output, expected_output, rtol=1e-14, atol=1e-14) + + @pytest.mark.parametrize('data_size', (500, 1000, 1001)) @pytest.mark.parametrize( 'baseline_type', ('exponential', 'gaussian', 'linear', 'sine', 'gaussian_small') diff --git a/tests/test_validation.py b/tests/test_validation.py index 05d6c24..60544c5 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -706,3 +706,14 @@ def test_get_row_col_values_fails(values): """Ensures _get_row_col_values raises an exception with incorrectly sized inputs..""" with pytest.raises(ValueError): _validation._get_row_col_values(values) + + +def 
test_check_spline_degree(): + """Ensures _check_spline_degree raises if the input has None.""" + for degree in (None, [None, None], [None, 3], [1, None]): + with pytest.raises(TypeError): + _validation._check_spline_degree(degree) + + # ensure valid inputs don't raise + for degree in (3, [1, 2], [0, 3], [0, 0]): + _validation._check_spline_degree(degree) diff --git a/tests/test_weighting.py b/tests/test_weighting.py index 7b5758f..23461de 100644 --- a/tests/test_weighting.py +++ b/tests/test_weighting.py @@ -978,7 +978,7 @@ def test_aspls_overflow(one_d, asymmetric_coef, alternate_weighting): assert_allclose(residual, expected_residual, rtol=1e-12, atol=1e-12) -def expected_psalsa(y, baseline, p, k, shape_y): +def expected_psalsa(y, baseline, p, k): """ Weighting for the peaked signal's asymmetric least squares algorithm (psalsa). @@ -998,8 +998,6 @@ def expected_psalsa(y, baseline, p, k, shape_y): A factor that controls the exponential decay of the weights for baseline values greater than the data. Should be approximately the height at which a value could be considered a peak. - shape_y : int or (int,) or (int, int) - The length of `y`, `N`. Precomputed to avoid repeated calculations. 
Returns ------- @@ -1028,8 +1026,8 @@ def test_psalsa_normal(one_d, k, p): else: y_data, baseline = baseline_2d_normal() - weights = _weighting._psalsa(y_data, baseline, p, k, y_data.shape) - expected_weights = expected_psalsa(y_data, baseline, p, k, y_data.shape) + weights = _weighting._psalsa(y_data, baseline, p, k) + expected_weights = expected_psalsa(y_data, baseline, p, k) assert isinstance(weights, np.ndarray) assert weights.shape == y_data.shape @@ -1048,7 +1046,7 @@ def test_psalsa_all_above(one_d, k, p): else: y_data, baseline = baseline_2d_all_above() - weights = _weighting._psalsa(y_data, baseline, p, k, y_data.shape) + weights = _weighting._psalsa(y_data, baseline, p, k) expected_weights = np.full_like(y_data, 1 - p) assert isinstance(weights, np.ndarray) @@ -1066,7 +1064,7 @@ def test_psalsa_all_below(one_d, k, p): else: y_data, baseline = baseline_2d_all_below() - weights = _weighting._psalsa(y_data, baseline, p, k, y_data.shape) + weights = _weighting._psalsa(y_data, baseline, p, k) expected_weights = p * np.exp(-(y_data - baseline) / k) assert isinstance(weights, np.ndarray) @@ -1097,13 +1095,13 @@ def test_psalsa_overflow(one_d, k, p): # sanity check to ensure overflow actually should occur with pytest.warns(RuntimeWarning): - expected_weights = expected_psalsa(y_data, baseline, p, k, y_data.shape) + expected_weights = expected_psalsa(y_data, baseline, p, k) # weights in naive approach should still be finite since overflow only occurs in regions # where the exponential value is not actually used assert np.isfinite(expected_weights).all() with np.errstate(over='raise'): - weights = _weighting._psalsa(y_data, baseline, p, k, y_data.shape) + weights = _weighting._psalsa(y_data, baseline, p, k) assert np.isfinite(weights).all() @@ -1116,7 +1114,7 @@ def test_psalsa_overflow(one_d, k, p): assert_allclose(weights, expected_weights, rtol=1e-12, atol=1e-12) -def expected_derpsalsa(y, baseline, p, k, shape_y, partial_weights): +def expected_derpsalsa(y, 
baseline, p, k, partial_weights): """ Weights for derivative peak-screening asymmetric least squares algorithm (derpsalsa). @@ -1134,8 +1132,6 @@ def expected_derpsalsa(y, baseline, p, k, shape_y, partial_weights): A factor that controls the exponential decay of the weights for baseline values greater than the data. Should be approximately the height at which a value could be considered a peak. - shape_y : int or (int,) or (int, int) - The length of `y`, `N`. Precomputed to avoid repeated calculations. partial_weights : numpy.ndarray, shape (N,) The weights associated with the first and second derivatives of the data. @@ -1180,8 +1176,8 @@ def test_derpsalsa_normal(k, p): diff_2_weights = np.exp(-((diff_y_2 / rms_diff_2)**2) / 2) partial_weights = diff_1_weights * diff_2_weights - weights = _weighting._derpsalsa(y_data, baseline, p, k, y_data.shape, partial_weights) - expected_weights = expected_derpsalsa(y_data, baseline, p, k, y_data.shape, partial_weights) + weights = _weighting._derpsalsa(y_data, baseline, p, k, partial_weights) + expected_weights = expected_derpsalsa(y_data, baseline, p, k, partial_weights) assert isinstance(weights, np.ndarray) assert weights.shape == y_data.shape @@ -1205,7 +1201,7 @@ def test_derpsalsa_all_above(k, p): diff_2_weights = np.exp(-((diff_y_2 / rms_diff_2)**2) / 2) partial_weights = diff_1_weights * diff_2_weights - weights = _weighting._derpsalsa(y_data, baseline, p, k, y_data.shape, partial_weights) + weights = _weighting._derpsalsa(y_data, baseline, p, k, partial_weights) expected_weights = np.full_like(y_data, partial_weights * (1 - p)) assert isinstance(weights, np.ndarray) @@ -1228,7 +1224,7 @@ def test_derpsalsa_all_below(k, p): diff_2_weights = np.exp(-((diff_y_2 / rms_diff_2)**2) / 2) partial_weights = diff_1_weights * diff_2_weights - weights = _weighting._derpsalsa(y_data, baseline, p, k, y_data.shape, partial_weights) + weights = _weighting._derpsalsa(y_data, baseline, p, k, partial_weights) expected_weights = 
partial_weights * p * np.exp(-0.5 * ((y_data - baseline) / k)**2) assert isinstance(weights, np.ndarray) diff --git a/tests/test_whittaker.py b/tests/test_whittaker.py index 35493d2..6661f24 100644 --- a/tests/test_whittaker.py +++ b/tests/test_whittaker.py @@ -14,6 +14,7 @@ from scipy.sparse.linalg import spsolve from pybaselines import _banded_utils, _weighting, whittaker +from pybaselines.results import WhittakerResult from pybaselines.utils import relative_difference from pybaselines._compat import diags, identity @@ -136,6 +137,12 @@ def test_tol_history(self): assert params['tol_history'].size == max_iter + 1 + def test_result_obj(self): + """Ensures the `result` item in the output params is a WhittakerResult.""" + _, params = self.class_func(self.y, **self.kwargs) + # don't use isinstance since don't want to allow subclasses + assert type(params['result']) is WhittakerResult + class TestAsLS(WhittakerTester): """Class for testing asls baseline.""" diff --git a/tests/two_d/__init__.py b/tests/two_d/__init__.py index 0c8cac4..4326c98 100644 --- a/tests/two_d/__init__.py +++ b/tests/two_d/__init__.py @@ -1,2 +1,2 @@ # -*- coding: utf-8 -*- -"""Tests for pybaselines.""" +"""Tests for pybaselines.two_d.""" diff --git a/tests/two_d/test_algorithm_setup.py b/tests/two_d/test_algorithm_setup.py index 9a720c1..a883e3f 100644 --- a/tests/two_d/test_algorithm_setup.py +++ b/tests/two_d/test_algorithm_setup.py @@ -12,7 +12,11 @@ from scipy.sparse import kron from pybaselines._compat import identity -from pybaselines.two_d import Baseline2D, _algorithm_setup, optimizers, polynomial, whittaker +from pybaselines.two_d import ( + Baseline2D, _algorithm_setup, optimizers, polynomial, whittaker, + _spline_utils, _whittaker_utils +) +from pybaselines.results import PSplineResult2D, WhittakerResult2D from pybaselines.utils import ParameterWarning, SortingWarning, difference_matrix, estimate_window from pybaselines._validation import _check_scalar @@ -63,8 +67,9 @@ def 
test_setup_whittaker_diff_matrix(data_fixture2d, lam, diff_order): ) +@pytest.mark.parametrize('num_eigens', (None, 3)) @pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) -def test_setup_whittaker_weights(small_data2d, algorithm, weight_enum): +def test_setup_whittaker_weights(small_data2d, algorithm, num_eigens, weight_enum): """Ensures output weight array is correct.""" if weight_enum == 0: # no weights specified @@ -83,12 +88,19 @@ def test_setup_whittaker_weights(small_data2d, algorithm, weight_enum): weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() desired_weights = np.arange(small_data2d.size) - _, weight_array, _ = algorithm._setup_whittaker( - small_data2d, lam=1, diff_order=2, weights=weights + if num_eigens is not None: + desired_weights = desired_weights.reshape(small_data2d.shape) + expected_y = small_data2d + else: + expected_y = small_data2d.ravel() + + y, weight_array, _ = algorithm._setup_whittaker( + small_data2d, lam=1, diff_order=2, weights=weights, num_eigens=num_eigens ) assert isinstance(weight_array, np.ndarray) assert_array_equal(weight_array, desired_weights) + assert_allclose(y, expected_y, rtol=1e-14, atol=1e-14) assert weight_array.dtype == float @@ -154,10 +166,11 @@ def test_setup_polynomial_weights(small_data2d, algorithm, weight_enum): weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() desired_weights = np.arange(small_data2d.size) - _, weight_array = algorithm._setup_polynomial(small_data2d, weights=weights) + y, weight_array = algorithm._setup_polynomial(small_data2d, weights=weights) assert isinstance(weight_array, np.ndarray) assert_array_equal(weight_array, desired_weights) + assert_allclose(y, small_data2d.ravel(), rtol=1e-14, atol=1e-14) assert weight_array.dtype == float @@ -544,12 +557,13 @@ def test_setup_spline_weights(small_data2d, algorithm, weight_enum): weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() desired_weights = 
np.arange(small_data2d.size).reshape(small_data2d.shape) - _, weight_array, _ = algorithm._setup_spline( + y, weight_array, _ = algorithm._setup_spline( small_data2d, lam=1, diff_order=2, weights=weights ) assert isinstance(weight_array, np.ndarray) assert_array_equal(weight_array, desired_weights) + assert_allclose(y, small_data2d, rtol=1e-14, atol=1e-14) assert weight_array.dtype == float @@ -711,8 +725,7 @@ def test_algorithm_return_results(assume_sorted, output_dtype, change_order, res ) output, output_params = algorithm._return_results( baseline, params, dtype=output_dtype, sort_keys=('a', 'd'), - reshape_baseline=reshape_baseline, reshape_keys=('c', 'd'), - ensure_2d=not three_d + reshape_keys=('c', 'd'), ensure_dims=not three_d ) if not change_order and (output_dtype is None or baseline.dtype == output_dtype): @@ -735,9 +748,9 @@ def test_algorithm_return_results(assume_sorted, output_dtype, change_order, res @pytest.mark.parametrize('change_order', (True, False)) @pytest.mark.parametrize('skip_sorting', (True, False)) @pytest.mark.parametrize('list_input', (True, False)) -def test_algorithm_register(assume_sorted, output_dtype, change_order, skip_sorting, list_input): +def test_algorithm_handle_io(assume_sorted, output_dtype, change_order, skip_sorting, list_input): """ - Ensures the _register wrapper method returns the correctly sorted and shaped outputs. + Ensures the _handle_io wrapper method returns the correctly sorted and shaped outputs. 
The input y-values within the wrapped function should be correctly sorted if `assume_sorted` is False, while the output baseline should always match @@ -750,7 +763,7 @@ def test_algorithm_register(assume_sorted, output_dtype, change_order, skip_sort class SubClass(_algorithm_setup._Algorithm2D): # 'a' values will be sorted and 'b' values will be kept the same - @_algorithm_setup._Algorithm2D._register(sort_keys=('a', 'd'), reshape_keys=('c', 'd')) + @_algorithm_setup._Algorithm2D._handle_io(sort_keys=('a', 'd'), reshape_keys=('c', 'd')) def func(self, data, *args, **kwargs): """For checking sorting and reshaping output parameters.""" expected_x, expected_z, expected_y = get_data2d() @@ -770,7 +783,7 @@ def func(self, data, *args, **kwargs): } return 1 * data, params - @_algorithm_setup._Algorithm2D._register(reshape_baseline=True) + @_algorithm_setup._Algorithm2D._handle_io def func2(self, data, *args, **kwargs): """For checking reshaping output baseline.""" expected_x, expected_z, expected_y = get_data2d() @@ -782,9 +795,9 @@ def func2(self, data, *args, **kwargs): assert isinstance(self.z, np.ndarray) assert_allclose(self.z, expected_z, 1e-14, 1e-14) - return 1 * data.flatten(), {} + return 1 * data.ravel(), {} - @_algorithm_setup._Algorithm2D._register + @_algorithm_setup._Algorithm2D._handle_io def func3(self, data, *args, **kwargs): """For checking empty decorator.""" expected_x, expected_z, expected_y = get_data2d() @@ -798,7 +811,7 @@ def func3(self, data, *args, **kwargs): return 1 * data, {} - @_algorithm_setup._Algorithm2D._register( + @_algorithm_setup._Algorithm2D._handle_io( sort_keys=('a', 'd'), reshape_keys=('c', 'd'), skip_sorting=skip_sorting ) def func4(self, data, *args, **kwargs): @@ -823,14 +836,14 @@ def func4(self, data, *args, **kwargs): return 1 * data, params - @_algorithm_setup._Algorithm2D._register(require_unique_xz=False) + @_algorithm_setup._Algorithm2D._handle_io(require_unique=False) def func5(self, data, *args, **kwargs): - """For 
ensuring require_unique_xz works as intedended.""" + """For ensuring require_unique works as intended.""" return 1 * data, {} - @_algorithm_setup._Algorithm2D._register(require_unique_xz=True) + @_algorithm_setup._Algorithm2D._handle_io(require_unique=True) def func6(self, data, *args, **kwargs): - """For ensuring require_unique_xz works as intedended.""" + """For ensuring require_unique works as intended.""" return 1 * data, {} if change_order: @@ -927,17 +940,17 @@ def func6(self, data, *args, **kwargs): out = new_algorithm.func6(y) -def test_algorithm_register_no_data_fails(): +def test_algorithm_handle_io_no_data_fails(): """Ensures an error is raised if the input data is None.""" class SubClass(_algorithm_setup._Algorithm2D): - @_algorithm_setup._Algorithm2D._register + @_algorithm_setup._Algorithm2D._handle_io def func(self, data, *args, **kwargs): """For checking empty decorator.""" return data, {} - @_algorithm_setup._Algorithm2D._register() + @_algorithm_setup._Algorithm2D._handle_io() def func2(self, data, *args, **kwargs): """For checking closed decorator.""" return data, {} @@ -948,17 +961,17 @@ def func2(self, data, *args, **kwargs): SubClass().func2() -def test_algorithm_register_1d_fails(data_fixture): +def test_algorithm_handle_io_1d_fails(data_fixture): """Ensures an error is raised if 1D data is used for 2D algorithms.""" class SubClass(_algorithm_setup._Algorithm2D): - @_algorithm_setup._Algorithm2D._register + @_algorithm_setup._Algorithm2D._handle_io def func(self, data, *args, **kwargs): """For checking empty decorator.""" return data, {} - @_algorithm_setup._Algorithm2D._register() + @_algorithm_setup._Algorithm2D._handle_io() def func2(self, data, *args, **kwargs): """For checking closed decorator.""" return data, {} @@ -1006,6 +1019,126 @@ def func2(self, data, *args, **kwargs): algorithm.func2(y_2d_transposed) +@pytest.mark.parametrize('input_x', (True, False)) +@pytest.mark.parametrize('input_z', (True, False)) 
+@pytest.mark.parametrize('change_order', (True, False)) +def test_algorithm_handle_io_3d(data_fixture2d, input_x, input_z, change_order): + """Ensures 3D data is allowed for 2D algorithms only when specified. + + Also checks _Algorithm2D setup when given 3D data as the first call. + + """ + x_vals, z_vals, input_y_2d = get_data2d() + x_slice = slice(None) + z_slice = slice(None) + if input_x: + expected_x = x_vals + else: + expected_x = np.linspace(-1, 1, input_y_2d.shape[0]) + if change_order: + x_slice = slice(None, None, -1) + if input_z: + expected_z = z_vals + else: + expected_z = np.linspace(-1, 1, input_y_2d.shape[1]) + if change_order: + z_slice = slice(None, None, -1) + stacks = 2 + expected_y = np.repeat(input_y_2d[None, :], stacks, axis=0) + + class SubClass(_algorithm_setup._Algorithm2D): + + @_algorithm_setup._Algorithm2D._handle_io + def func(self, data, *args, **kwargs): + """Errors if input is not 2D.""" + assert data.ndim == 2 + assert data.shape == expected_y.shape + return data, {} + + @_algorithm_setup._Algorithm2D._handle_io(ensure_dims=False) + def func2(self, data, *args, **kwargs): + """Allows 3D data.""" + assert data.ndim == 3 + assert data.shape == expected_y.shape + + expected = expected_y.copy() + if change_order: + expected = expected[:, x_slice, z_slice] + + assert_allclose(data, expected, 1e-14, 1e-14) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + return data * 1, {} + + @_algorithm_setup._Algorithm2D._handle_io(ensure_dims=False) + def func3(self, data, *args, **kwargs): + """For checking reshaping output baseline for 3D input raveled on last axis.""" + assert data.ndim == 3 + assert data.shape == expected_y.shape + + return 1 * data.reshape(data.shape[0], -1), {} + + @_algorithm_setup._Algorithm2D._handle_io(ensure_dims=False, skip_sorting=True) + def func4(self, data, *args, **kwargs): + """Allows 3D data and skips sorting.""" + assert data.ndim == 3 + assert 
data.shape == expected_y.shape + + expected = expected_y.copy() + if change_order: + expected = expected[:, ::-1, ::-1] + + assert_allclose(data, expected, 1e-14, 1e-14) + assert_allclose(self.x, expected_x, 1e-14, 1e-14) + assert_allclose(self.z, expected_z, 1e-14, 1e-14) + + return data * 1, {} + + x_, z_, y_2d = data_fixture2d + if change_order: + x_ = x_[::-1] + z_ = z_[::-1] + y_2d = y_2d[::-1, ::-1] + x = None + z = None + initial_shape = [None, None] + if input_x: + x = x_ + initial_shape[0] = len(x) + if input_z: + z = z_ + initial_shape[1] = len(z) + initial_shape = tuple(initial_shape) + initial_size = None if None in initial_shape else y_2d.size + + input_y = np.repeat(y_2d[None, :], stacks, axis=0) + assert input_y.shape == (stacks, *y_2d.shape) # sanity check for correct setup + + algorithm = SubClass(x, z) + assert algorithm._shape == initial_shape + assert algorithm._size == initial_size + + with pytest.raises(ValueError, match='input data must be a two dimensional'): + algorithm.func(input_y) + assert algorithm._shape == initial_shape + + # should run without issues and set stored shape correctly + output, _ = algorithm.func2(input_y) + assert algorithm._shape == y_2d.shape + assert algorithm._size == y_2d.size + assert output.shape == input_y.shape + assert_allclose(output, input_y, 1e-14, 1e-14) + + output2, _ = algorithm.func3(input_y) + assert output2.shape == input_y.shape + assert_allclose(output2, input_y, 1e-14, 1e-14) + + output3, _ = algorithm.func4(input_y) + assert output3.shape == input_y.shape + assert_allclose(output3, input_y, 1e-14, 1e-14) + + def test_override_x(algorithm): """Ensures the `override_x` method correctly initializes with the new x values.""" new_len = 20 @@ -1188,3 +1321,172 @@ def test_wrong_banded_solver_fails(algorithm, banded_solver): """Ensures only valid integers between 0 and 4 are allowed as banded_solver inputs.""" with pytest.raises(ValueError): algorithm.banded_solver = banded_solver + + 
+@pytest.mark.parametrize('diff_order', (1, 2, 3, (2, 3))) +@pytest.mark.parametrize('lam', (1, 20, (2, 5))) +def test_setup_pls_whittaker_diff_matrix(data_fixture2d, lam, diff_order): + """Ensures output difference matrix diagonal data is in desired format for _setup_pls.""" + x, z, y = data_fixture2d + + algorithm = _algorithm_setup._Algorithm2D(x, z) + + # intentionally do not input spline_degree here to ensure default behavior is + # spline_degree=None -> Whittaker smoothing + _, _, whittaker_system, result_class = algorithm._setup_pls(y, lam=lam, diff_order=diff_order) + _, _, expected_system = algorithm._setup_whittaker(y, lam=lam, diff_order=diff_order) + + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + D1 = difference_matrix(len(x), diff_order_x) + D2 = difference_matrix(len(z), diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(len(z))) + P2 = lam_z * kron(identity(len(x)), D2.T @ D2) + expected_penalty = P1 + P2 + + assert_allclose( + whittaker_system.penalty.toarray(), + expected_penalty.toarray(), + rtol=1e-12, atol=1e-12 + ) + assert_allclose( + whittaker_system.penalty.toarray(), + expected_system.penalty.toarray(), + rtol=1e-12, atol=1e-12 + ) + assert isinstance(whittaker_system, _whittaker_utils.WhittakerSystem2D) + assert result_class is WhittakerResult2D + + +@pytest.mark.parametrize('spline_degree', (None, 3)) +@pytest.mark.parametrize('num_eigens', (None, 3)) +@pytest.mark.parametrize('weight_enum', (0, 1, 2, 3)) +def test_setup_pls_weights(small_data2d, algorithm, spline_degree, num_eigens, weight_enum): + """Ensures output weight array is correct when using _setup_pls.""" + if weight_enum == 0: + # no weights specified + weights = None + desired_weights = np.ones(small_data2d.size) + elif weight_enum == 1: + # uniform 1 weighting + weights = np.ones_like(small_data2d) + desired_weights = np.ones(small_data2d.size) + elif weight_enum == 2: + # different weights for all points + 
weights = np.arange(small_data2d.size).reshape(small_data2d.shape) + desired_weights = np.arange(small_data2d.size) + elif weight_enum == 3: + # different weights for all points, and weights input as a list + weights = np.arange(small_data2d.size).reshape(small_data2d.shape).tolist() + desired_weights = np.arange(small_data2d.size) + + if spline_degree is None and num_eigens is None: + expected_y = small_data2d.ravel() + else: + desired_weights = desired_weights.reshape(small_data2d.shape) + expected_y = small_data2d + + y, weight_array, penalized_system, result_class = algorithm._setup_pls( + small_data2d, lam=1, diff_order=2, weights=weights, spline_degree=spline_degree, + num_eigens=num_eigens + ) + + assert isinstance(weight_array, np.ndarray) + assert_array_equal(weight_array, desired_weights) + assert weight_array.dtype == float + assert_allclose(y, expected_y, rtol=1e-14, atol=1e-14) + assert isinstance( + penalized_system, + _whittaker_utils.WhittakerSystem2D if spline_degree is None else _spline_utils.PSpline2D + ) + assert result_class is WhittakerResult2D if spline_degree is None else PSplineResult2D + + +@pytest.mark.parametrize('num_knots', (10, 30, (20, 30))) +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4, (2, 3), None)) +def test_setup_pls_spline_basis(data_fixture2d, num_knots, spline_degree): + """Ensures the spline basis function is correctly created through _setup_pls.""" + x, z, y = data_fixture2d + fitter = _algorithm_setup._Algorithm2D(x, z) + assert fitter._spline_basis is None + + fitter._setup_pls( + y, weights=None, spline_degree=spline_degree, num_knots=num_knots + ) + + if spline_degree is None: + assert fitter._spline_basis is None + return + + if isinstance(num_knots, int): + num_knots_r = num_knots + num_knots_c = num_knots + else: + num_knots_r, num_knots_c = num_knots + if isinstance(spline_degree, int): + spline_degree_x = spline_degree + spline_degree_z = spline_degree + else: + spline_degree_x, spline_degree_z = 
spline_degree + + assert_array_equal( + fitter._spline_basis.basis_r.shape, + (len(x), num_knots_r + spline_degree_x - 1) + ) + assert_array_equal( + fitter._spline_basis.basis_c.shape, + (len(z), num_knots_c + spline_degree_z - 1) + ) + + +@pytest.mark.parametrize('lam', (1, 20, (3, 10))) +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('spline_degree', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('num_knots', (20, (21, 30))) +def test_setup_pls_spline_diff_matrix(data_fixture2d, lam, diff_order, spline_degree, num_knots): + """Ensures output difference matrix diagonal data is in desired format for setup_pls.""" + x, z, y = data_fixture2d + + algorithm = _algorithm_setup._Algorithm2D(x, z) + _, _, pspline, result_class = algorithm._setup_pls( + y, weights=None, spline_degree=spline_degree, num_knots=num_knots, + diff_order=diff_order, lam=lam + ) + + ( + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs( + num_knots=num_knots, spline_degree=spline_degree, lam=lam, diff_order=diff_order + ) + + num_bases_x = num_knots_r + spline_degree_x - 1 + num_bases_z = num_knots_c + spline_degree_z - 1 + + D1 = difference_matrix(num_bases_x, diff_order_x) + D2 = difference_matrix(num_bases_z, diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases_z)) + P2 = lam_z * kron(identity(num_bases_x), D2.T @ D2) + expected_penalty = P1 + P2 + + assert_allclose( + pspline.penalty.toarray(), + expected_penalty.toarray(), + rtol=1e-12, atol=1e-12 + ) + assert isinstance(pspline, _spline_utils.PSpline2D) + assert result_class is PSplineResult2D + + _, _, expected_system = algorithm._setup_spline( + y, weights=None, spline_degree=spline_degree, num_knots=num_knots, + diff_order=diff_order, lam=lam + ) + assert_allclose( + pspline.penalty.toarray(), + expected_system.penalty.toarray(), + rtol=1e-12, atol=1e-12 + ) diff --git a/tests/two_d/test_spline.py 
b/tests/two_d/test_spline.py index 9abb5a9..8ecec2d 100644 --- a/tests/two_d/test_spline.py +++ b/tests/two_d/test_spline.py @@ -10,6 +10,7 @@ from numpy.testing import assert_allclose import pytest +from pybaselines.results import PSplineResult2D from pybaselines.two_d import Baseline2D, spline from ..base_tests import BaseTester2D, InputWeightsMixin, RecreationMixin @@ -51,6 +52,29 @@ class SplineTester(BaseTester2D): module = spline + def test_result_obj(self): + """Ensures the `result` item in the output params is a PSplineResult2D.""" + _, params = self.class_func(self.y, **self.kwargs) + # don't use isinstance since don't want to allow subclasses + assert type(params['result']) is PSplineResult2D + + @pytest.mark.parametrize('spline_degree', (None, (None, 1), (3, None), (None, None))) + def test_check_spline_degree(self, spline_degree): + """ + Ensures an exception is raised if the input spline_degree is None. + + For methods that are implemented as ND penalized least square mixins, the same + code path is used for both penalized spline and Whittaker smoothing, whose logic + is controlled by the input `spline_degree`, so need to ensure that the input for + penalized spline methods is not None. + + For methods that are not PLS mixins, a TypeError should still occur during basis + creation. 
+ + """ + with pytest.raises(TypeError): + self.class_func(self.y, spline_degree=spline_degree) + class IterativeSplineTester(SplineTester, InputWeightsMixin, RecreationMixin): """Base testing class for iterative spline functions.""" diff --git a/tests/two_d/test_spline_utils.py b/tests/two_d/test_spline_utils.py index 1217d00..f04956c 100644 --- a/tests/two_d/test_spline_utils.py +++ b/tests/two_d/test_spline_utils.py @@ -21,12 +21,12 @@ @pytest.mark.parametrize('num_knots', (10, (11, 20))) -@pytest.mark.parametrize('spline_degree', (0, 1, 2, 3, 4, 5, (2, 3))) +@pytest.mark.parametrize('spline_degree', (0, 1, 2, 3, 4, (2, 3))) @pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) @pytest.mark.parametrize('lam', (1e-2, (1e1, 1e2))) -def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, lam): +def test_pspline_solve(data_fixture2d, num_knots, spline_degree, diff_order, lam): """ - Tests the accuracy of the penalized spline solvers. + Tests the solve method of PSpline2D. 
Uses the naive way to solve 2D PSplines from Eilers's paper as the expected result, which uses the flattened `y` and weight values, while pybaselines uses the second, more efficient @@ -86,6 +86,112 @@ def test_solve_psplines(data_fixture2d, num_knots, spline_degree, diff_order, la assert_allclose(basis_output, expected_result, rtol=1e-8, atol=1e-8) +@pytest.mark.parametrize('num_knots', (10, (11, 20))) +@pytest.mark.parametrize('spline_degree', (2, 3, (2, 3))) +@pytest.mark.parametrize('diff_order', (1, 2, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, (1e1, 1e2))) +def test_pspline_factorized_solve(data_fixture2d, num_knots, spline_degree, diff_order, lam): + """Tests factorization and factorized_solve methods of PSpline2D.""" + x, z, y = data_fixture2d + ( + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs(num_knots, spline_degree, lam, diff_order) + + knots_r = _spline_utils._spline_knots(x, num_knots_r, spline_degree_x, True) + basis_r = _spline_utils._spline_basis(x, knots_r, spline_degree_x) + + knots_c = _spline_utils._spline_knots(z, num_knots_c, spline_degree_z, True) + basis_c = _spline_utils._spline_basis(z, knots_c, spline_degree_z) + + num_bases = (basis_r.shape[1], basis_c.shape[1]) + weights = np.random.default_rng(0).normal(0.8, 0.05, y.size) + weights = np.clip(weights, 0, 1, dtype=float) + + basis = kron(basis_r, basis_c) + CWT = basis.multiply( + np.repeat(weights.flatten(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) + ).T + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + expected_coeffs = spsolve(CWT @ basis + penalty, CWT @ y.flatten()) + + spline_basis = _spline_utils.SplineBasis2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, check_finite=False
+ ) + pspline = _spline_utils.PSpline2D(spline_basis, lam=lam, diff_order=diff_order) + + lhs = pspline._make_btwb(weights.reshape(y.shape)) + pspline.penalty + factorization = pspline.factorize(lhs) + assert callable(factorization) + + rhs = ( + pspline.basis.basis_r.T @ (weights.reshape(y.shape) * y) @ pspline.basis.basis_c + ).ravel() + output = pspline.factorized_solve(factorization, rhs) + assert_allclose(output, expected_coeffs, rtol=1e-8, atol=1e-8) + + # going through factorized_solve should not set coefficients + assert pspline.coef is None + + +@pytest.mark.parametrize('num_knots', (10, (11, 20))) +@pytest.mark.parametrize('spline_degree', (2, 3, (2, 3))) +@pytest.mark.parametrize('diff_order', (1, 2, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, (1e1, 1e2))) +def test_pspline_direct_solve(data_fixture2d, num_knots, spline_degree, diff_order, lam): + """Tests direct_solve method of PSpline2D.""" + x, z, y = data_fixture2d + ( + num_knots_r, num_knots_c, spline_degree_x, spline_degree_z, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs(num_knots, spline_degree, lam, diff_order) + + knots_r = _spline_utils._spline_knots(x, num_knots_r, spline_degree_x, True) + basis_r = _spline_utils._spline_basis(x, knots_r, spline_degree_x) + + knots_c = _spline_utils._spline_knots(z, num_knots_c, spline_degree_z, True) + basis_c = _spline_utils._spline_basis(z, knots_c, spline_degree_z) + + num_bases = (basis_r.shape[1], basis_c.shape[1]) + weights = np.random.default_rng(0).normal(0.8, 0.05, y.size) + weights = np.clip(weights, 0, 1, dtype=float) + + basis = kron(basis_r, basis_c) + CWT = basis.multiply( + np.repeat(weights.flatten(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) + ).T + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + 
expected_coeffs = spsolve(CWT @ basis + penalty, CWT @ y.flatten()) + + spline_basis = _spline_utils.SplineBasis2D( + x, z, num_knots=num_knots, spline_degree=spline_degree, check_finite=False + ) + pspline = _spline_utils.PSpline2D(spline_basis, lam=lam, diff_order=diff_order) + + lhs = pspline._make_btwb(weights.reshape(y.shape)) + pspline.penalty + + rhs = ( + pspline.basis.basis_r.T @ (weights.reshape(y.shape) * y) @ pspline.basis.basis_c + ).ravel() + output = pspline.direct_solve(lhs, rhs) + assert_allclose(output, expected_coeffs, rtol=1e-8, atol=1e-8) + + # going through direct_solve should not set coefficients + assert pspline.coef is None + + @pytest.mark.parametrize('spline_degree', (1, 2, 3, [2, 3])) @pytest.mark.parametrize('num_knots', (16, [21, 30])) @pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @@ -122,6 +228,8 @@ def test_pspline_setup(data_fixture2d, num_knots, spline_degree, diff_order, lam assert spline_basis.basis_c.shape == (len(z), len(knots_c) - spline_degree_z - 1) assert_array_equal(spline_basis._num_bases, num_bases) assert_array_equal(pspline._num_bases, num_bases) + assert pspline.tot_bases == np.prod(num_bases) + assert pspline.shape == (len(x), len(z)) assert issparse(spline_basis.basis_r) assert issparse(spline_basis.basis_c) diff --git a/tests/two_d/test_whittaker.py b/tests/two_d/test_whittaker.py index ff4b0a0..9ee95a4 100644 --- a/tests/two_d/test_whittaker.py +++ b/tests/two_d/test_whittaker.py @@ -9,6 +9,7 @@ import numpy as np import pytest +from pybaselines.results import WhittakerResult2D from pybaselines.two_d import whittaker from ..base_tests import BaseTester2D, InputWeightsMixin, RecreationMixin @@ -27,6 +28,12 @@ def test_tol_history(self): assert params['tol_history'].size == max_iter + 1 + def test_result_obj(self): + """Ensures the `result` item in the output params is a WhittakerResult2D.""" + _, params = self.class_func(self.y, **self.kwargs) + #
don't use isinstance since don't want to allow subclasses + assert type(params['result']) is WhittakerResult2D + class EigenvalueMixin: """BaseTester2D mixin for testing the Whittaker methods that can use eigendecomposition.""" diff --git a/tests/two_d/test_whittaker_utils.py b/tests/two_d/test_whittaker_utils.py index 47af95c..59f3fae 100644 --- a/tests/two_d/test_whittaker_utils.py +++ b/tests/two_d/test_whittaker_utils.py @@ -9,7 +9,7 @@ import numpy as np from numpy.testing import assert_allclose, assert_array_equal import pytest -from scipy.linalg import eig_banded, solve +from scipy.linalg import cholesky, eig_banded, solve from scipy.sparse import issparse, kron from scipy.sparse.linalg import spsolve @@ -23,14 +23,76 @@ @pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) @pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) -def test_solve_penalized_system(small_data2d, diff_order, lam): - """ - Tests the accuracy of the penalized system solver. +def test_penalized_system_solve(small_data2d, diff_order, lam): + """Tests the solve method of a PenalizedSystem2D object.""" + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) - Not really useful at the moment, but will be more useful if the solver changes - from the current basic sparse solver. 
+ num_bases = small_data2d.shape - """ + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.PenalizedSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order + ) + + weights = np.random.default_rng(0).normal(0.8, 0.05, small_data2d.size) + weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() + + penalty.setdiag(penalty.diagonal() + weights) + + expected_result = spsolve(penalty, weights * small_data2d.ravel()) + output = penalized_system.solve(small_data2d.ravel(), weights) + + assert_allclose(output.ravel(), expected_result, rtol=1e-8, atol=1e-8) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_penalized_system_factorized_solve(small_data2d, diff_order, lam): + """Tests the factorize and factorized_solve methods of a PenalizedSystem2D object.""" + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + num_bases = small_data2d.shape + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.PenalizedSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order + ) + + weights = np.random.default_rng(0).normal(0.8, 0.05, small_data2d.size) + weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() + + penalty.setdiag(penalty.diagonal() + weights) + expected_result = spsolve(penalty, weights * small_data2d.ravel()) + + penalized_system.add_diagonal(weights) + factorization = penalized_system.factorize(penalized_system.penalty) + assert 
callable(factorization) + + output = penalized_system.factorized_solve(factorization, weights * small_data2d.ravel()) + + assert_allclose(output.ravel(), expected_result, rtol=1e-8, atol=1e-8) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_penalized_system_direct_solve(small_data2d, diff_order, lam): + """Tests the direct_solve method of a PenalizedSystem2D object.""" *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( lam=lam, diff_order=diff_order ) @@ -52,11 +114,15 @@ def test_solve_penalized_system(small_data2d, diff_order, lam): weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() penalty.setdiag(penalty.diagonal() + weights) + expected_result = spsolve(penalty, weights * small_data2d.ravel()) - expected_result = spsolve(penalty, weights * small_data2d.flatten()) - output = penalized_system.solve(small_data2d.flatten(), weights) + penalized_system.add_diagonal(weights) + + output = penalized_system.direct_solve( + penalized_system.penalty, weights * small_data2d.ravel() + ) - assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) + assert_allclose(output.ravel(), expected_result, rtol=1e-8, atol=1e-8) @pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @@ -81,6 +147,8 @@ def test_penalized_system_setup(small_data2d, diff_order, lam): ) assert_array_equal(penalized_system._num_bases, num_bases) + assert penalized_system.tot_bases == np.prod(num_bases) + assert_array_equal(penalized_system.shape, num_bases) assert issparse(penalized_system.penalty) assert_allclose( @@ -189,14 +257,44 @@ def test_face_splitting(): @pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) @pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) -def test_solve_whittaker_system_no_eigenvalues(small_data2d, diff_order, lam): - """ - Tests the accuracy of the Whittaker system solver when not using eigendecomposition. 
+def test_whittaker_system_solve_no_eigenvalues(small_data2d, diff_order, lam): + """Tests solve method of WhittakerSystem2D when not using SVD.""" + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) - Not really useful at the moment, but will be more useful if the solver changes - from the current basic sparse solver. + num_bases = small_data2d.shape - """ + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order, num_eigens=None + ) + assert penalized_system.coef is None + + weights = np.random.default_rng(0).normal(0.8, 0.05, small_data2d.size) + weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() + + penalty.setdiag(penalty.diagonal() + weights) + + expected_result = spsolve(penalty, weights * small_data2d.ravel()) + output = penalized_system.solve(small_data2d.ravel(), weights) + + # coef should not be updated since not using eigendecomposition + assert penalized_system.coef is None + + assert_allclose(output.ravel(), expected_result, rtol=1e-8, atol=1e-8) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_whittaker_system_factorized_solve_no_eigenvalues(small_data2d, diff_order, lam): + """Tests factorization and factorized_solve methods of WhittakerSystem2D when not using SVD.""" *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( lam=lam, diff_order=diff_order ) @@ -219,14 +317,50 @@ def test_solve_whittaker_system_no_eigenvalues(small_data2d, diff_order, lam): weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() penalty.setdiag(penalty.diagonal() + weights) + expected_result =
spsolve(penalty, weights * small_data2d.ravel()) - expected_result = spsolve(penalty, weights * small_data2d.flatten()) - output = penalized_system.solve(small_data2d.flatten(), weights) + penalized_system.add_diagonal(weights) + factorization = penalized_system.factorize(penalized_system.penalty) + assert callable(factorization) + output = penalized_system.factorized_solve(factorization, weights * small_data2d.ravel()) - # coef should not be updated since not using eigendecomposition + assert_allclose(output.ravel(), expected_result, rtol=1e-8, atol=1e-8) + + +@pytest.mark.parametrize('diff_order', (1, 2, 3, 4, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_whittaker_system_direct_solve_no_eigenvalues(small_data2d, diff_order, lam): + """Tests direct_solve method of WhittakerSystem2D when not using SVD.""" + *_, lam_x, lam_z, diff_order_x, diff_order_z = get_2dspline_inputs( + lam=lam, diff_order=diff_order + ) + + num_bases = small_data2d.shape + + D1 = difference_matrix(num_bases[0], diff_order_x) + D2 = difference_matrix(num_bases[1], diff_order_z) + + P1 = lam_x * kron(D1.T @ D1, identity(num_bases[1])) + P2 = lam_z * kron(identity(num_bases[0]), D2.T @ D2) + penalty = P1 + P2 + + penalized_system = _whittaker_utils.WhittakerSystem2D( + small_data2d.shape, lam=lam, diff_order=diff_order, num_eigens=None + ) assert penalized_system.coef is None - assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) + weights = np.random.default_rng(0).normal(0.8, 0.05, small_data2d.size) + weights = np.clip(weights, 1e-12, 1).astype(float, copy=False).ravel() + + penalty.setdiag(penalty.diagonal() + weights) + expected_result = spsolve(penalty, weights * small_data2d.ravel()) + + penalized_system.add_diagonal(weights) + output = penalized_system.direct_solve( + penalized_system.penalty, weights * small_data2d.ravel() + ) + + assert_allclose(output.ravel(), expected_result, rtol=1e-8, atol=1e-8) 
@pytest.mark.parametrize('diff_order', (1, 2, 3, [1, 3])) @@ -251,6 +385,8 @@ def test_whittaker_system_setup_no_eigenvalues(small_data2d, diff_order, lam): ) assert_array_equal(penalized_system._num_bases, num_bases) + assert penalized_system.tot_bases == np.prod(num_bases) + assert_array_equal(penalized_system.shape, num_bases) assert issparse(penalized_system.penalty) assert_allclose( @@ -290,7 +426,9 @@ def test_whittaker_system_setup_eigenvalues(data_fixture2d, num_eigens, diff_ord y.shape, lam=lam, diff_order=diff_order, num_eigens=num_eigens ) - assert_array_equal(whittaker_system._num_bases, num_eigens) + assert_array_equal(whittaker_system._num_bases, (num_eigens_r, num_eigens_c)) + assert_array_equal(whittaker_system.tot_bases, np.prod((num_eigens_r, num_eigens_c))) + assert_array_equal(whittaker_system.shape, y.shape) eigenvalues_rows, expected_basis_rows = eig_banded( diff_penalty_diagonals(y.shape[0], diff_order_r, lower_only=True), @@ -447,7 +585,7 @@ def test_whittaker_system_same_basis(): @pytest.mark.parametrize('num_eigens', (5, 8, (5, 8))) @pytest.mark.parametrize('diff_order', (1, 2, 3, (2, 3))) @pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) -def test_solve_whittaker_system_eigenvalues(data_fixture2d, num_eigens, diff_order, lam): +def test_whittaker_system_solve_eigenvalues(data_fixture2d, num_eigens, diff_order, lam): """ Tests the accuracy of the Whittaker system solver when using eigendecomposition. 
@@ -492,11 +630,11 @@ def test_solve_whittaker_system_eigenvalues(data_fixture2d, num_eigens, diff_ord basis = kron(basis_r, basis_c) CWT = basis.multiply( - np.repeat(weights.flatten(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) + np.repeat(weights.ravel(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) ).T expected_coeffs = solve( - (CWT @ basis + penalty).toarray(), CWT @ y.flatten(), + (CWT @ basis + penalty).toarray(), CWT @ y.ravel(), lower=True, overwrite_a=True, overwrite_b=True, check_finite=False, assume_a='pos' ) expected_result = basis @ expected_coeffs @@ -508,7 +646,7 @@ def test_solve_whittaker_system_eigenvalues(data_fixture2d, num_eigens, diff_ord assert whittaker_system.coef is None output = whittaker_system.solve(y, weights=weights.reshape(y.shape)) - assert_allclose(output.flatten(), expected_result, rtol=1e-8, atol=1e-8) + assert_allclose(output.ravel(), expected_result, rtol=1e-8, atol=1e-8) # TODO comparing the non-absolute values of the coefficients fails for diff_order=1 since # that uses eigh_tridiagonal; the only difference is that the eigenvectors are # -1 * eigenvectors of eig_banded for the first `diff_order` eigenvectors; this does @@ -537,6 +675,160 @@ def test_solve_whittaker_system_eigenvalues(data_fixture2d, num_eigens, diff_ord assert_allclose(dof, expected_dof, rtol=1e-8, atol=1e-8) +@pytest.mark.parametrize('num_eigens', (5, 8, (5, 8))) +@pytest.mark.parametrize('diff_order', (1, 2, 3, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_whittaker_system_factorized_solve_eigenvalues(data_fixture2d, num_eigens, diff_order, lam): + """ + Tests the accuracy of the Whittaker system solver when using eigendecomposition. + + Uses the naive way to solve 2D Whittaker system as the expected result, which + uses the flattened `y` and weight values, while pybaselines uses the second, more efficient + method in Eilers's paper which directly uses the 2D `y` and weights.
+ + References + ---------- + Eilers, P., et al. Fast and compact smoothing on large multidimensional grids. Computational + Statistics and Data Analysis, 2006, 50(1), 61-76. + + """ + x, z, y = data_fixture2d + ( + num_eigens_r, num_eigens_c, _, _, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs(num_knots=num_eigens, lam=lam, diff_order=diff_order) + + eigenvalues_rows, basis_r = eig_banded( + diff_penalty_diagonals(y.shape[0], diff_order_x, lower_only=True), + lower=True, overwrite_a_band=True, select='i', select_range=(0, num_eigens_r - 1) + ) + penalty_rows = kron( + lam_x * dia_object((eigenvalues_rows, 0), shape=(num_eigens_r, num_eigens_r)), + identity(num_eigens_c) + ) + + eigenvalues_cols, basis_c = eig_banded( + diff_penalty_diagonals(y.shape[1], diff_order_z, lower_only=True), + lower=True, overwrite_a_band=True, select='i', select_range=(0, num_eigens_c - 1) + ) + penalty_cols = kron( + identity(num_eigens_r), + lam_z * dia_object((eigenvalues_cols, 0), shape=(num_eigens_c, num_eigens_c)) + ) + penalty = penalty_rows + penalty_cols + + num_bases = (basis_r.shape[1], basis_c.shape[1]) + weights = np.random.default_rng(0).normal(0.8, 0.05, y.size) + weights = np.clip(weights, 0, 1, dtype=float) + + basis = kron(basis_r, basis_c) + CWT = basis.multiply( + np.repeat(weights.ravel(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) + ).T + + expected_coeffs = solve( + (CWT @ basis + penalty).toarray(), CWT @ y.ravel(), + lower=True, overwrite_a=True, overwrite_b=True, check_finite=False, assume_a='pos' + ) + + whittaker_system = _whittaker_utils.WhittakerSystem2D( + y.shape, lam=lam, diff_order=diff_order, num_eigens=num_eigens + ) + assert whittaker_system.coef is None + + lhs = whittaker_system._make_btwb(weights.reshape(y.shape)) + np.fill_diagonal(lhs, lhs.diagonal() + whittaker_system.penalty) + + expected_factorization = cholesky(lhs, lower=True) + factorization = whittaker_system.factorize(lhs) + + 
assert_allclose(factorization[0], expected_factorization, rtol=1e-12, atol=1e-12) + assert factorization[1] # should denote lower=True + + rhs = ( + whittaker_system.basis_r.T @ (weights.reshape(y.shape) * y) @ whittaker_system.basis_c + ).ravel() + output = whittaker_system.factorized_solve(factorization, rhs) + # TODO same concern about eigenvector signs as in test_whittaker_system_solve_eigenvalues + if 1 in (diff_order_x, diff_order_z): + assert_allclose( + np.abs(output.ravel()), np.abs(expected_coeffs), rtol=1e-8, atol=1e-8 + ) + else: + assert_allclose(output.ravel(), expected_coeffs, rtol=1e-8, atol=1e-8) + + # going through factorized_solve should not set coefficients + assert whittaker_system.coef is None + + +@pytest.mark.parametrize('num_eigens', (5, 8, (5, 8))) +@pytest.mark.parametrize('diff_order', (1, 2, 3, (2, 3))) +@pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2))) +def test_whittaker_system_direct_solve_eigenvalues(data_fixture2d, num_eigens, diff_order, lam): + """Tests direct_solve method of WhittakerSystem2D when using eigendecomposition.""" + x, z, y = data_fixture2d + ( + num_eigens_r, num_eigens_c, _, _, + lam_x, lam_z, diff_order_x, diff_order_z + ) = get_2dspline_inputs(num_knots=num_eigens, lam=lam, diff_order=diff_order) + + eigenvalues_rows, basis_r = eig_banded( + diff_penalty_diagonals(y.shape[0], diff_order_x, lower_only=True), + lower=True, overwrite_a_band=True, select='i', select_range=(0, num_eigens_r - 1) + ) + penalty_rows = kron( + lam_x * dia_object((eigenvalues_rows, 0), shape=(num_eigens_r, num_eigens_r)), + identity(num_eigens_c) + ) + + eigenvalues_cols, basis_c = eig_banded( + diff_penalty_diagonals(y.shape[1], diff_order_z, lower_only=True), + lower=True, overwrite_a_band=True, select='i', select_range=(0, num_eigens_c - 1) + ) + penalty_cols = kron( + identity(num_eigens_r), + lam_z * dia_object((eigenvalues_cols, 0), shape=(num_eigens_c, num_eigens_c)) + ) + penalty = penalty_rows + penalty_cols + + num_bases = 
(basis_r.shape[1], basis_c.shape[1]) + weights = np.random.default_rng(0).normal(0.8, 0.05, y.size) + weights = np.clip(weights, 0, 1, dtype=float) + + basis = kron(basis_r, basis_c) + CWT = basis.multiply( + np.repeat(weights.ravel(), num_bases[0] * num_bases[1]).reshape(len(x) * len(z), -1) + ).T + + expected_coeffs = solve( + (CWT @ basis + penalty).toarray(), CWT @ y.ravel(), + lower=True, overwrite_a=True, overwrite_b=True, check_finite=False, assume_a='pos' + ) + + whittaker_system = _whittaker_utils.WhittakerSystem2D( + y.shape, lam=lam, diff_order=diff_order, num_eigens=num_eigens + ) + assert whittaker_system.coef is None + + lhs = whittaker_system._make_btwb(weights.reshape(y.shape)) + np.fill_diagonal(lhs, lhs.diagonal() + whittaker_system.penalty) + + rhs = ( + whittaker_system.basis_r.T @ (weights.reshape(y.shape) * y) @ whittaker_system.basis_c + ).ravel() + output = whittaker_system.direct_solve(lhs, rhs) + # TODO same concern about eigenvector signs as in test_whittaker_system_solve_eigenvalues + if 1 in (diff_order_x, diff_order_z): + assert_allclose( + np.abs(output.ravel()), np.abs(expected_coeffs), rtol=1e-8, atol=1e-8 + ) + else: + assert_allclose(output.ravel(), expected_coeffs, rtol=1e-8, atol=1e-8) + + # going through direct_solve should not set coefficients + assert whittaker_system.coef is None + + @pytest.mark.parametrize('num_eigens', (5, 8, (5, 8))) @pytest.mark.parametrize('diff_order', (1, 2, 3, (2, 3))) @pytest.mark.parametrize('lam', (1e-2, 1e2, (1e1, 1e2)))