diff --git a/.github/workflows/python-test-latest.yml b/.github/workflows/python-test-latest.yml index c42791e1..81f79f1e 100644 --- a/.github/workflows/python-test-latest.yml +++ b/.github/workflows/python-test-latest.yml @@ -40,6 +40,8 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index f7c53771..fada8bd5 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -40,12 +40,8 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - # fetch-depth: '0' is needed to fetch tags for hatch-vcs (and thus setuptools-scm) - # to generate the version correctly; only really needed for jobs that would build and - # upload to pypi, so ignore it since fetching the entire git history can take much longer - # than the default of fetching just the last commit - #with: - #fetch-depth: '0' + with: + persist-credentials: false - name: Set up Python uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 @@ -81,6 +77,8 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 @@ -113,6 +111,8 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 @@ -124,3 +124,32 @@ jobs: - name: Lint run: ruff check . 
+ + doctest: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + # only test earliest and latest supported Python versions since the examples should + # be fairly simple + python-version: ['3.9', '3.13'] + + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + # TODO will need to think of how to handle doctests when they rely on optional dependencies + run: | + python -m pip install --upgrade pip + python -m pip install ".[test, doctest]" + + - name: Run doctests + run: pytest pybaselines --doctest-modules diff --git a/docs/algorithms/classification.rst b/docs/algorithms/classification.rst index e54ba443..fb687afe 100644 --- a/docs/algorithms/classification.rst +++ b/docs/algorithms/classification.rst @@ -17,6 +17,8 @@ method. The plot below shows such an example. .. plot:: :align: center + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -75,6 +77,8 @@ points, and then iteratively fitting a polynomial to the interpolated baseline. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -206,6 +210,8 @@ the noise's standard deviation as belonging to the baseline. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -234,6 +240,8 @@ of the median of the noise's standard deviation distribution. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -263,6 +271,8 @@ threshold, and peak regions are iteratively interpolated until the baseline is b .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -291,6 +301,8 @@ other points have a weight of 0. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True scales = np.arange(2, 40) # to see contents of create_data function, look at the top-most algorithm's code @@ -327,6 +339,8 @@ a weight of 0. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -369,6 +383,8 @@ peak positions. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -397,6 +413,8 @@ slightly reduced, as seen below. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/misc.rst b/docs/algorithms/misc.rst index 169ea7a2..08b8f3f3 100644 --- a/docs/algorithms/misc.rst +++ b/docs/algorithms/misc.rst @@ -25,6 +25,8 @@ with user interfaces and is not encouraged otherwise. .. 
plot:: :align: center + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -81,6 +83,8 @@ as sparse. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -205,6 +209,8 @@ of the beads function. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the second-to-top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/morphological.rst b/docs/algorithms/morphological.rst index 2cfe47ca..6c8e0aaa 100644 --- a/docs/algorithms/morphological.rst +++ b/docs/algorithms/morphological.rst @@ -13,6 +13,8 @@ below. The algorithms in this section use these operations to estimate the basel .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -80,6 +82,8 @@ method. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -212,6 +216,8 @@ erosion and dilation of the opening to create the baseline. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -235,6 +241,8 @@ create the baseline. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -252,6 +260,8 @@ kernel, to produce a smooth baseline. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -276,6 +286,8 @@ and opening of either the data (first iteration) or previous iteration's baselin .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -294,6 +306,8 @@ resembles rolling a ball across the data. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -315,6 +329,8 @@ then smooths the result with a moving average. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -340,6 +356,8 @@ tophat (Top-hat Transformation) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -367,6 +385,8 @@ Finally, a penalized spline is fit to the smoothed data with the assigned weight .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -419,11 +439,11 @@ linear equations: .. math:: - (I + 2 \gamma D_d^{\top} D_d) s^n = y - v^{n-1} + (I + 2 \gamma D_d^{\mathsf{T}} D_d) s^n = y - v^{n-1} .. 
math:: - (I + 2 \alpha I + 2 \beta D_d^{\top} D_d) v^n = y - s^n + 2 \alpha Op + (I + 2 \alpha I + 2 \beta D_d^{\mathsf{T}} D_d) v^n = y - s^n + 2 \alpha Op where :math:`I` is the identity matrix and :math:`D_d` is the matrix version of :math:`\Delta^d`, which is also the d-th derivative of the identity matrix. @@ -433,6 +453,8 @@ multipliers. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -452,6 +474,8 @@ of the jbcd function. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the second-to-top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/optimizers.rst b/docs/algorithms/optimizers.rst index 011caf55..b6ac5194 100644 --- a/docs/algorithms/optimizers.rst +++ b/docs/algorithms/optimizers.rst @@ -26,6 +26,8 @@ An example of data with added baseline and Gaussian peaks is shown below. .. plot:: :align: center + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -88,6 +90,8 @@ added linear regions is selected as the optimal parameter. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -225,6 +229,8 @@ algorithm versus the individual baselines from the mpls method. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True x = np.linspace(1, 1000, 500) signal = ( @@ -283,6 +289,8 @@ the name). .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/polynomial.rst b/docs/algorithms/polynomial.rst index 105c6b17..a8e43351 100644 --- a/docs/algorithms/polynomial.rst +++ b/docs/algorithms/polynomial.rst @@ -25,7 +25,7 @@ However, since only the baseline of the data is desired, the least-squares approach must be modified. For polynomial-based algorithms, this is done by 1) only fitting the data in regions where there is only baseline (termed selective masking), 2) modifying the y-values being fit each iteration, termed -thresholding, or 3) penalyzing outliers. +thresholding, or 3) penalizing outliers. .. _selective-masking-explanation: @@ -142,14 +142,15 @@ Thresholding ~~~~~~~~~~~~ Thresholding is an iterative method that first fits the data using -traditional least-squares, and then sets the next iteration's fit data +traditional least-squares and then sets the next iteration's data-to-fit as the element-wise minimum between the current data and the current fit. -The figure below illustrates the iterative thresholding. +The figure below illustrates this iterative thresholding. .. plot:: :align: center :context: close-figs :include-source: False + :show-source-link: True fig, axes = plt.subplots( 2, 2, gridspec_kw={'hspace': 0, 'wspace': 0}, @@ -176,14 +177,15 @@ The figure below illustrates the iterative thresholding. The algorithms in pybaselines that use thresholding are :meth:`~.Baseline.modpoly`, :meth:`~.Baseline.imodpoly`, and :meth:`~.Baseline.loess` (if ``use_threshold`` is True). 
-Penalyzing Outliers +Penalizing Outliers ~~~~~~~~~~~~~~~~~~~ -The algorithms in pybaselines that penalyze outliers are -:meth:`~.Baseline.penalized_poly`, which incorporate the penalty directly into the -minimized cost function, and :meth:`~.Baseline.loess` (if ``use_threshold`` is False), -which incorporates penalties by applying lower weights to outliers. Refer -to the particular algorithms below for more details. +The polynomial algorithms in pybaselines that penalize outliers include +:meth:`~.Baseline.penalized_poly`, which incorporates the penalty directly into the +minimized cost function, and :meth:`~.Baseline.loess` (if ``use_threshold`` is False) +and :meth:`~.Baseline.quant_reg`, which use +:ref:`iterative reweighting ` to apply lower weights +to outliers. Refer to the particular algorithms below for more details. Algorithms @@ -201,6 +203,8 @@ of the data since masking is time-consuming. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -325,6 +329,8 @@ baseline to data. `modpoly` is also sometimes called "ModPolyFit" in literature, .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -362,6 +368,8 @@ and both `modpoly` and `imodpoly` are sometimes referred to as "IPF" or "Iterati .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -393,6 +401,8 @@ The plots below show the symmetric and asymmetric forms of the cost functions. .. 
plot:: :align: center + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -474,6 +484,8 @@ The plots below show the symmetric and asymmetric forms of the cost functions. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -507,6 +519,8 @@ is reduced by iterative reweighting. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -541,6 +555,8 @@ quant_reg (Quantile Regression) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -567,6 +583,8 @@ based on the input `peak_ratio` value. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True peak_ratios = [0.2, 0.6, 0.2, 0.2, 0.3] # to see contents of create_data function, look at the top-most algorithm's code diff --git a/docs/algorithms/smooth.rst b/docs/algorithms/smooth.rst index 38f8ab47..965e6eb0 100644 --- a/docs/algorithms/smooth.rst +++ b/docs/algorithms/smooth.rst @@ -27,6 +27,8 @@ kernel. Note that this method does not perform well for tightly-grouped peaks. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -157,6 +159,8 @@ index-based width of the largest peak or feature. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -177,6 +181,8 @@ data. The baselines when using decreasing window size and smoothing is shown bel .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -202,6 +208,8 @@ incrementally increased to smooth peaks until convergence is reached. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -219,11 +227,13 @@ ipsa (Iterative Polynomial Smoothing Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :meth:`~.Baseline.ipsa` iteratively smooths the input data using a second-order -Savitzky–Golay filter until the exit criteria is reached. +Savitzky-Golay filter until the exit criteria is reached. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -243,13 +253,15 @@ ria (Range Independent Algorithm) right edges of the data and adds Gaussian peaks to these baselines, similar to the :ref:`optimize_extended_range ` function, and records their initial areas. The data is then iteratively smoothed using a -zero-order Savitzky–Golay filter (moving average) until the area of the extended +zero-order Savitzky-Golay filter (moving average) until the area of the extended regions after subtracting the smoothed data from the initial data is close to their starting areas. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -278,6 +290,8 @@ is then interpolated back into the original data size. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/spline.rst b/docs/algorithms/spline.rst index 714dbc69..e6f88831 100644 --- a/docs/algorithms/spline.rst +++ b/docs/algorithms/spline.rst @@ -11,19 +11,19 @@ predominantly used in pybaselines. B-splines can be expressed as: .. math:: - v(x) = \sum\limits_{i}^N \sum\limits_{j}^M {B_j(x_i) c_j} + v(x) = \sum\limits_{i}^N \sum\limits_{j}^M c_j {B_j(x_i)} where :math:`N` is the number of points in :math:`x`, :math:`M` is the number of spline basis functions, :math:`B_j(x_i)` is the j-th basis function evaluated at :math:`x_i`, -and :math:`c_j` is the coefficient for the j-th basis (which is analogous to +and :math:`c_j` is the coefficient vector for the j-th basis (which is analogous to the height of the j-th basis). In pybaselines, the number of spline basis functions, -:math:`M`, is calculated as the number of knots, `num_knots`, plus the spline degree +:math:`M`, is calculated as the number of knots, ``num_knots``, plus the spline degree minus 1. For regular B-spline fitting, the spline coefficients that best fit the data are gotten from minimizing the least-squares: -.. math:: \sum\limits_{i}^N w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j})^2 +.. math:: \sum\limits_{i}^N w_i (y_i - v(x_i))^2 where :math:`y_i` and :math:`x_i` are the measured data, and :math:`w_i` is the weighting. In order to control the smoothness of the fitting spline, a penalty @@ -34,7 +34,7 @@ The minimized function for P-splines is thus: .. 
math:: - \sum\limits_{i}^N w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j})^2 + \sum\limits_{i}^N w_i (y_i - v(x_i))^2 + \lambda \sum\limits_{i}^{M - d} (\Delta^d c_i)^2 where :math:`\lambda` is the penalty scale factor, and @@ -45,14 +45,21 @@ The resulting linear equation for solving the above minimization is: .. math:: - (B^{\top} W B + \lambda D_d^{\top} D_d) c = B^{\top} W y + (B^{\mathsf{T}} W B + \lambda D_d^{\mathsf{T}} D_d) c = B^{\mathsf{T}} W y + +and the baseline is given by: + +.. math:: + + v = B c where :math:`W` is the diagaonal matrix of the weights, :math:`B` is the matrix containing all of the spline basis functions, and :math:`D_d` is the matrix version of :math:`\Delta^d` (same as :ref:`explained ` for Whittaker-smoothing-based algorithms). P-splines have similarities with Whittaker -smoothing; in fact, if the number of basis functions, :math:`M`, is set up to be equal -to the number of data points, :math:`N`, and the spline degree is set to 0, then +smoothing, including the use of :ref:`iterative reweighting ` +to calculate the baseline; in fact, if the number of basis functions, :math:`M`, is set up to +be equal to the number of data points, :math:`N`, and the spline degree is set to 0, then :math:`B` becomes the identity matrix and the above equation becomes identical to the equation used for Whittaker smoothing. @@ -75,6 +82,8 @@ residual belonging to the noise's normal distribution. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -206,6 +215,8 @@ to perform quantile regression on the data. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True quantiles = {0: 0.3, 1: 0.1, 2: 0.2, 3: 0.25, 4: 0.5} # to see contents of create_data function, look at the top-most algorithm's code @@ -232,6 +243,8 @@ between all but the first and last non-corner points. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -256,14 +269,14 @@ Minimized function: .. math:: - \sum\limits_{i}^N w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j})^2 + \sum\limits_{i}^N w_i (y_i - v(x_i))^2 + \lambda \sum\limits_{i}^{M - d} (\Delta^d c_i)^2 Linear system: .. math:: - (B^{\top} W B + \lambda D_d^{\top} D_d) c = B^{\top} W y + (B^{\mathsf{T}} W B + \lambda D_d^{\mathsf{T}} D_d) c = B^{\mathsf{T}} W y Weighting: @@ -277,6 +290,8 @@ Weighting: .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -304,16 +319,16 @@ Minimized function: .. math:: - \sum\limits_{i}^N (w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j}))^2 + \sum\limits_{i}^N (w_i (y_i - v(x_i)))^2 + \lambda \sum\limits_{i}^{M - d} (\Delta^d c_i)^2 - + \lambda_1 \sum\limits_{i}^{N - 1} (\Delta^1 (y_i - \sum\limits_{j}^M {B_j(x_i) c_j}))^2 + + \lambda_1 \sum\limits_{i}^{N - 1} (\Delta^1 (y_i - v(x_i)))^2 Linear system: .. math:: - (B^{\top} W^{\top} W B + \lambda_1 B^{\top} D_1^{\top} D_1 B + \lambda D_d^{\top} D_d) c - = (B^{\top} W^{\top} W B + \lambda_1 B^{\top} D_1^{\top} D_1) y + (B^{\mathsf{T}} W^{\mathsf{T}} W B + \lambda_1 B^{\mathsf{T}} D_1^{\mathsf{T}} D_1 B + \lambda D_d^{\mathsf{T}} D_d) c + = (B^{\mathsf{T}} W^{\mathsf{T}} W B + \lambda_1 B^{\mathsf{T}} D_1^{\mathsf{T}} D_1) y Weighting: @@ -328,6 +343,8 @@ Weighting: .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -357,14 +374,14 @@ Minimized function: .. 
math:: - \sum\limits_{i}^N w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j})^2 + \sum\limits_{i}^N w_i (y_i - v(x_i))^2 + \lambda \sum\limits_{i}^{M - d} (\Delta^d c_i)^2 Linear system: .. math:: - (B^{\top} W B + \lambda D_d^{\top} D_d) c = B^{\top} W y + (B^{\mathsf{T}} W B + \lambda D_d^{\mathsf{T}} D_d) c = B^{\mathsf{T}} W y Weighting: @@ -383,6 +400,8 @@ publication, as `specified by the author `_ (note that Whittaker -smoothing is often also referred to as Whittaker-Henderson smoothing and as Hodrick-Prescott +smoothing is often also referred to as Whittaker-Henderson smoothing or as Hodrick-Prescott filtering in the case of a second order difference). The general idea behind Whittaker smoothing algorithms is to make the baseline match the measured data as well as it can while also penalizing the roughness of the baseline. The @@ -31,7 +31,7 @@ The resulting linear equation for solving the above minimization is: .. math:: - (W + \lambda D_d^{\top} D_d) v = W y + (W + \lambda D_d^{\mathsf{T}} D_d) v = W y .. _difference-matrix-explanation: @@ -61,11 +61,111 @@ and :math:`D_2` (second order difference matrix) is: Most Whittaker-smoothing-based techniques recommend using the second order difference matrix, although some techniques use both the first and second order difference matrices. -The baseline is iteratively calculated using the linear system above by solving for -the baseline, :math:`v`, updating the weights, solving for the baseline using the new -weights, and repeating until some exit criteria. -The difference between Whittaker-smoothing-based algorithms is the selection of weights -and/or the function that is minimized. +.. 
_iterative-reweighting-explanation: + +Baseline algorithms based on Whittaker smoothing use +`iterative reweighting `_, +in which the baseline, :math:`v`, is calculated by solving the linear equation above, updating the +weights based on that baseline, solving for a new baseline using the new weights, and repeating +until some exit criteria. The major difference between Whittaker-smoothing-based algorithms is the +selection of weights and/or the function that is minimized. An example of this process is shown +below using the :meth:`~.Baseline.arpls` method. + +.. plot:: + :align: center + :context: reset + :include-source: False + :show-source-link: True + :nofigs: + + from pathlib import Path + + import matplotlib.animation as animation + import matplotlib.pyplot as plt + import numpy as np + + from pybaselines import Baseline, utils + + x = np.linspace(1, 1000, 1000) + signal = ( + utils.gaussian(x, 4, 120, 5) + + utils.gaussian(x, 5, 220, 12) + + utils.gaussian(x, 5, 350, 10) + + utils.gaussian(x, 7, 400, 8) + + utils.gaussian(x, 4, 550, 6) + + utils.gaussian(x, 5, 660, 14) + + utils.gaussian(x, 4, 750, 12) + + utils.gaussian(x, 5, 880, 8) + ) + true_baseline = 2 + 10 * np.exp(-x / 400) + noise = np.random.default_rng(1).normal(0, 0.2, x.size) + + y = signal + true_baseline + noise + + baseline_fitter = Baseline(x_data=x) + tol = 1e-3 + lam = 1e5 + + # do a complete fit once to figure out how many iterations (ie. 
frames) are required + _, params_complete = baseline_fitter.arpls(y, lam=lam, tol=tol) + max_frames = len(params_complete['tol_history']) + + # do an initial fit to get axes set up + fit, params = baseline_fitter.arpls(y, lam=lam, max_iter=0, tol=tol) + fig, (ax, ax2, ax3) = plt.subplots(nrows=3, tight_layout=True) + + ax.plot(x, y, label='data') + baseline_plot = ax.plot(x, fit, label='baseline')[0] + weight_plot = ax2.plot(x, np.ones_like(y), 'r.')[0] + tol_plot = ax3.plot(1, params['tol_history'], 'o-')[0] + ax3.axhline(tol, linestyle='--', color='k', label='Exit tolerance') + + ax.set_xlabel('x') + ax.legend() + ax2.set(ylim=[-0.1, 1.1], ylabel='Weights', xlabel='x') + ax3.set( + xlim=[0, max_frames + 1], ylim=[tol / 10, params_complete['tol_history'].max() * 10], + ylabel='Tolerance', xlabel='Iteration' + ) + ax3.semilogy() + ax3.legend() + + def update_lines(frame): + # frame == max number of iterations + fit, params = baseline_fitter.arpls(y, lam=lam, max_iter=frame, tol=tol) + baseline_plot.set_ydata(fit) + # until convergence, output weights actually correspond to iteration `frame + 1`, + # so do a second fit using `frame - 1` to get the correct weights for this frame + if frame == 0: + weight_plot.set_ydata(np.ones_like(y)) + else: + _, params_weights = baseline_fitter.arpls(y, lam=lam, max_iter=frame - 1, tol=tol) + weight_plot.set_ydata(params_weights['weights']) + changed_lines = [baseline_plot, weight_plot] + if len(params['tol_history']) == frame + 1: # in case tolerance was reached early + tol_plot.set_xdata(np.arange(frame + 1) + 1) + tol_plot.set_ydata(params['tol_history']) + changed_lines.append(tol_plot) + + return changed_lines + + # matplotlib's `plot` directive does not display gifs, so save the generated + # gif instead and then embed it below + ani = animation.FuncAnimation( + fig=fig, func=update_lines, frames=max_frames, blit=True, interval=500 + ) + if Path.cwd().stem == 'algorithms': # within documentation, so just save the gif + 
output_path = Path('../generated/images') + output_path.mkdir(exist_ok=True, parents=True) + ani.save(output_path.joinpath('iterative_reweighting.gif')) + plt.close(fig) + else: # downloaded, so show the plot rather than saving + plt.show() + +.. image:: ../generated/images/iterative_reweighting.gif + :alt: Iterative Reweighting + :align: center + .. note:: The :math:`\lambda` (``lam``) value required to fit a particular baseline for all @@ -94,7 +194,7 @@ Linear system: .. math:: - (W + \lambda D_d^{\top} D_d) v = W y + (W + \lambda D_d^{\mathsf{T}} D_d) v = W y Weighting: @@ -109,6 +209,8 @@ Weighting: .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -247,8 +349,8 @@ Linear system: .. math:: - (W^{\top} W + \lambda_1 D_1^{\top} D_1 + \lambda D_d^{\top} D_d) v - = (W^{\top} W + \lambda_1 D_1^{\top} D_1) y + (W^{\mathsf{T}} W + \lambda_1 D_1^{\mathsf{T}} D_1 + \lambda D_d^{\mathsf{T}} D_d) v + = (W^{\mathsf{T}} W + \lambda_1 D_1^{\mathsf{T}} D_1) y Weighting: @@ -263,6 +365,8 @@ Weighting: .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -279,7 +383,7 @@ Weighting: else: lam = 1e3 p = 0.01 - baseline, params = baseline_fitter.iasls(y, lam=lam, lam_1=1e-4, p=p) + baseline, params = baseline_fitter.iasls(y, lam=lam, p=p) ax.plot(baseline, 'g--') @@ -299,7 +403,7 @@ Linear system: .. math:: - (W + \lambda D_d^{\top} D_d) v = W y + (W + \lambda D_d^{\mathsf{T}} D_d) v = W y Weighting: @@ -310,7 +414,7 @@ Weighting: \exp{\left(\frac{\text{abs}(y_i - v_i) t}{|\mathbf{r}^-|}\right)} & y_i < v_i \end{array}\right. 
-where :math:`t` is the iteration number and :math:`|\mathbf{r}^-|` is the l1-norm of the negative +where :math:`t` is the iteration number and :math:`|\mathbf{r}^-|` is the L1-norm of the negative values in the residual vector :math:`\mathbf r`, ie. :math:`\sum\limits_{y_i - v_i < 0} |y_i - v_i|`. Note that the absolute value within the weighting was mistakenly omitted in the original publication, as `specified by the author `_. @@ -318,6 +422,8 @@ publication, as `specified by the author `_ -of a matrix with itself such that :math:`F(B_r) = (B_r \otimes 1_{g}^{\top}) \odot (1_{g}^{\top} \otimes B_r)` -and :math:`F(B_c) = (B_c \otimes 1_{h}^{\top}) \odot (1_{h}^{\top} \otimes B_c)`, where +of a matrix with itself such that :math:`F(B_r) = (B_r \otimes 1_{g}^{\mathsf{T}}) \odot (1_{g}^{\mathsf{T}} \otimes B_r)` +and :math:`F(B_c) = (B_c \otimes 1_{h}^{\mathsf{T}}) \odot (1_{h}^{\mathsf{T}} \otimes B_c)`, where :math:`1_g` and :math:`1_h` are vectors of ones of length :math:`g` and :math:`h`, respecitvely, and :math:`\odot` signifies elementwise multiplication. Then the linear equation can be rewritten as: .. math:: - (F(B_r)^{\top} W F(B_c) + \lambda_r I_h \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_g) \alpha = B_{r}^{\top} (W \odot Y) B_c + (F(B_r)^{\mathsf{T}} W F(B_c) + \lambda_r I_h \otimes D_{d_r}^{\mathsf{T}} D_{d_r} + \lambda_c D_{d_c}^{\mathsf{T}} D_{d_c} \otimes I_g) \alpha = B_{r}^{\mathsf{T}} (W \odot Y) B_c and the baseline is: .. math:: - V = B_r \alpha B_{c}^{\top} + V = B_r \alpha B_{c}^{\mathsf{T}} Algorithms @@ -96,6 +96,8 @@ mixture_model (Mixture Model) .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -157,6 +159,8 @@ irsqr (Iterative Reweighted Spline Quantile Regression) .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.irsqr(y, lam=(1e3, 1e2), quantile=0.3) @@ -172,6 +176,8 @@ pspline_asls (Penalized Spline Version of asls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_asls(y, lam=(1e3, 1e0), p=0.005) @@ -187,6 +193,8 @@ pspline_iasls (Penalized Spline Version of iasls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_iasls(y, lam=(1e2, 1e-2)) @@ -202,6 +210,8 @@ pspline_airpls (Penalized Spline Version of airpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_airpls(y, lam=(1e3, 1e-1)) @@ -217,6 +227,8 @@ pspline_arpls (Penalized Spline Version of arpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_arpls(y, lam=(1e3, 5e0)) @@ -232,6 +244,8 @@ pspline_iarpls (Penalized Spline Version of iarpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_iarpls(y, lam=(1e2, 1e0)) @@ -247,6 +261,8 @@ pspline_psalsa (Penalized Spline Version of psalsa) .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_psalsa(y, lam=(1e3, 5e0), k=0.5) @@ -262,6 +278,8 @@ pspline_brpls (Penalized Spline Version of brpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_brpls(y, lam=(1e3, 5e0)) @@ -277,6 +295,8 @@ pspline_lsrpls (Penalized Spline Version of lsrpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_lsrpls(y, lam=(1e3, 5e0)) diff --git a/docs/algorithms_2d/whittaker_2d.rst b/docs/algorithms_2d/whittaker_2d.rst index 0a546ff4..ba4a6354 100644 --- a/docs/algorithms_2d/whittaker_2d.rst +++ b/docs/algorithms_2d/whittaker_2d.rst @@ -32,7 +32,7 @@ The resulting linear equation for solving the above minimization is: .. math:: - (W_{diag} + \lambda_r I_M \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_M) v = w y + (W_{diag} + \lambda_r I_M \otimes D_{d_r}^{\mathsf{T}} D_{d_r} + \lambda_c D_{d_c}^{\mathsf{T}} D_{d_c} \otimes I_M) v = w y where :math:`W_{diag}` is the diagaonal matrix of the flattened weights, and :math:`D_d` is the matrix @@ -54,7 +54,7 @@ Eigendecomposition By following the excellent insights laid out by G. Biessy in `[2] `_, the dimensionality of the system can be reduced by using eigendecomposition on each of the two -penalty matrices, :math:`D_{d_r}^{\top} D_{d_r}` and :math:`D_{d_c}^{\top} D_{d_c}`. (Note that speeding up +penalty matrices, :math:`D_{d_r}^{\mathsf{T}} D_{d_r}` and :math:`D_{d_c}^{\mathsf{T}} D_{d_c}`. 
(Note that speeding up Whittaker smoothing using `factorization in 1D `_ and using the `analytical eigenvalues in nD (great paper) `_ are established methods, although they require using a fixed difference order, and, in the second case, of using @@ -63,7 +63,7 @@ The general eigendecomposition of the penalty matrix gives .. math:: - D_{d}^{\top} D_{d} = U \Sigma U^{\top} + D_{d}^{\mathsf{T}} D_{d} = U \Sigma U^{\mathsf{T}} where :math:`U` is the matrix of eigenvectors and :math:`\Sigma` is a diagonal matrix with the eigenvalues along the diagonal. Letting :math:`B = U_c \otimes U_r` denote the Kronecker @@ -73,7 +73,7 @@ can be rewritten as: .. math:: - (B^{\top} W_{diag} B + \lambda_r I_h \otimes \Sigma_r + \lambda_c \Sigma_c \otimes I_g) \alpha = B^{\top} W_{diag} y + (B^{\mathsf{T}} W_{diag} B + \lambda_r I_h \otimes \Sigma_r + \lambda_c \Sigma_c \otimes I_g) \alpha = B^{\mathsf{T}} W_{diag} y and the baseline is then: @@ -117,6 +117,8 @@ asls (Asymmetric Least Squares) .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -179,6 +181,8 @@ Eigendecomposition is not allowed for this method. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code baseline, params = baseline_fitter.iasls(y, lam=(1e3, 1e0)) @@ -194,6 +198,8 @@ airpls (Adaptive Iteratively Reweighted Penalized Least Squares) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.airpls(y, lam=(1e3, 1e1)) @@ -209,6 +215,8 @@ arpls (Asymmetrically Reweighted Penalized Least Squares) .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.arpls(y, lam=(1e4, 1e2)) @@ -225,6 +233,8 @@ Eigendecomposition is not allowed for this method. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.drpls(y, lam=(1e3, 1e2)) @@ -240,6 +250,8 @@ iarpls (Improved Asymmetrically Reweighted Penalized Least Squares) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.iarpls(y, lam=(1e3, 1e2)) @@ -256,6 +268,8 @@ Eigendecomposition is not allowed for this method. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.aspls(y, lam=(1e3, 1e2)) @@ -271,6 +285,8 @@ psalsa (Peaked Signal's Asymmetric Least Squares Algorithm) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.psalsa(y, lam=(1e3, 1e2), k=0.5) @@ -286,6 +302,8 @@ brpls (Bayesian Reweighted Penalized Least Squares) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.brpls(y, lam=(1e4, 1e2)) @@ -301,6 +319,8 @@ lsrpls (Locally Symmetric Reweighted Penalized Least Squares) .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.lsrpls(y, lam=(1e4, 1e2)) diff --git a/docs/citing.rst b/docs/citing.rst index 2e11f376..b22ccc99 100644 --- a/docs/citing.rst +++ b/docs/citing.rst @@ -8,7 +8,7 @@ research: .. code-block:: text - @software{pybaselines, + @software{erb2021pybaselines, author = {Erb, Donald}, doi = {10.5281/zenodo.5608581}, title = {{pybaselines}: A {Python} library of algorithms for the baseline correction of experimental data}, diff --git a/docs/conf.py b/docs/conf.py index 0cda6fdf..15cb6cc4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -156,9 +156,9 @@ # -- Settings for matplotlib plot_directive extension ---------------------------- -plot_include_source = False +plot_include_source = True # set default to True so that docstring examples don't show source plot_html_show_formats = False -plot_html_show_source_link = True +plot_html_show_source_link = False # set default to False for docstring examples plot_formats = ['png'] @@ -227,16 +227,23 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # +html_logo = 'images/logo.png' # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the # documentation. 
try: - import sphinx_rtd_theme - html_theme = 'sphinx_rtd_theme' + import pydata_sphinx_theme + html_theme = 'pydata_sphinx_theme' html_theme_options = { - 'navigation_depth': 5, - 'prev_next_buttons_location': 'both', - } + 'icon_links': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/derb12/pybaselines', + 'icon': 'fa-brands fa-github', + 'type': 'fontawesome', + }, + ], + } except ImportError: html_theme = 'nature' html_theme_options = {} diff --git a/docs/contributing.rst b/docs/contributing.rst index 656fbb4a..cd58e852 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -49,7 +49,7 @@ ensure `git `_ is installed and then run: git clone https://github.com/derb12/pybaselines.git cd pybaselines - pip install .[dev] + pip install -e .[dev] All sections below assume the above commands were ran. @@ -136,6 +136,19 @@ please ensure the documentation builds locally by running the following command and ensure that no warnings or errors are raised during building. The built documentation can then be viewed in the ``pybaselines/docs/_build/html`` folder. +Running Doctests +^^^^^^^^^^^^^^^^ + +If adding new code, it is often good to show an example usage of the class/method/function +within its docstring using +`doctest format `_. pybaselines +uses `scipy-doctest `_ to ensure that these examples are +correctly formatted and will run without errors. To perform these doctests, run the following +command in the terminal while in the pybaselines directory: + +.. code-block:: console + + pytest pybaselines --doctest-modules Adding New Algorithms ^^^^^^^^^^^^^^^^^^^^^ @@ -146,6 +159,9 @@ If adding a new baseline algorithm to pybaselines: ``pybaselines/tests/base_tests.py`` file that should be subclassed to ensure all basic requirements for a new algorithm are met. Additional tests should also be added as needed. See existing tests for examples. 
+* Try to add example usages of the algorithm within its docstring, showing basic usage and + any noteworthy fine-tuning. Ensure these examples run correctly by performing doctests as + outlined above. * Add a short summary of the algorithm to the appropriate place in the `algorithms section `_, and, if possible, add a plot showing how the algorithm fits different baselines using diff --git a/docs/examples/README.rst b/docs/examples/README.rst index 24f99432..90dba3c5 100644 --- a/docs/examples/README.rst +++ b/docs/examples/README.rst @@ -4,7 +4,7 @@ Examples This section shows various uses of pybaselines. -To run the examples, `Matplotlib `_ must be installed, +To run the examples locally, `Matplotlib `_ must be installed, which can be done by running: .. code-block:: console diff --git a/docs/examples/general/plot_masked_data.py b/docs/examples/general/plot_masked_data.py index 412d9fb8..264d357e 100644 --- a/docs/examples/general/plot_masked_data.py +++ b/docs/examples/general/plot_masked_data.py @@ -5,20 +5,21 @@ There are cases where data needs to be filtered/masked before processing, for example a faulty detector can result in problematic regions in measurements. In these cases, baseline correction on -the raw data could lead to severely incorrect fits. One such use of masking in literature is presented by -`Temmink, et al. `_ for removing downward spikes in -mid-infrared data collected from the James Webb Space Telescope before performing baseline correction. -This example will detail how to handle working with masked data for the various types of baseline -correction algorithms in pybaselines. +the raw data could lead to severely incorrect fits. One such use of masking in literature is +presented by `Temmink, et al. `_ for removing +downward spikes in mid-infrared data collected from the James Webb Space Telescope before +performing baseline correction. 
This example will detail how to handle working with masked data +for the various types of baseline correction algorithms in pybaselines. As a preface for this guide, the author of this library acknowledges that masking could potentially be handled internally within each individual algorithm to make them "mask-aware". However, it would -be a large change to the codebase, and there are many edge cases that would make it a non-trivial endeavor. -Support for masking following the guidelines presented in this example could alternatively be added -as an :doc:`optimizer-type algorithm <../../../algorithms/optimizers>`, but, in the author's opinion, -making one single function that is expected to cover 60+ different algorithms would be fiddly at best -and very prone to bugs. Therefore, this guide is presented as a starting point for users to adapt for -targeting a spectific baseline correction algorithm or group of algorithms. +be a large change to the codebase, and there are many edge cases that would make it a non-trivial +endeavor. Support for masking following the guidelines presented in this example could +alternatively be added as an :doc:`optimizer-type algorithm <../../../algorithms/optimizers>`, +but, in the author's opinion, making one single function that is expected to cover 60+ different +algorithms would be fiddly at best and very prone to bugs. Therefore, this guide is presented as a +starting point for users to adapt for targeting a specific baseline correction algorithm or group +of algorithms. The various algorithms in pybaselines can be broadly grouped into three different categories for how they handle masked data: @@ -47,8 +48,6 @@ from pybaselines import Baseline from pybaselines.utils import gaussian, relative_difference -from pybaselines._banded_utils import PenalizedSystem -from pybaselines._weighting import _arpls x = np.linspace(500, 4000, 1000) @@ -83,8 +82,9 @@ # To start, the mask for fitting the data has to be made. 
This can be done by eye if fitting # a few datasets, or can be automated using some metric. Many # :doc:`classification methods <../../../algorithms/classification>` use different methods for -# excluding positive peaks; for excluding negative peaks, see -# `Temmink, et al. `_ for an example. This example +# excluding positive peaks; for an example of excluding negative peaks, see +# `Temmink, et al. `_, which uses +# Savitzky-Golay filtering combined with iterative thresholding. This example # will simply define the mask region by hand. fit_mask = (x < 1900) | (x > 2550) # 1 in regions to fit, 0 in masked region @@ -160,6 +160,8 @@ # "Mask-aware" versions of these algorithms could be implemented, as shown below for # the arPLS algorithm. However, the "mask-aware" behavior can be closely approximated # simply by using weighted interpolation, to be shown below. +from pybaselines._banded_utils import PenalizedSystem # noqa: E402 +from pybaselines._weighting import _arpls # noqa: E402 def masked_arpls(y, mask=None, lam=1e5, diff_order=2, tol=1e-3, max_iter=50, weights=None): diff --git a/docs/examples/general/plot_reuse_Baseline.py b/docs/examples/general/plot_reuse_Baseline.py index 17f22bb0..c308bf9a 100644 --- a/docs/examples/general/plot_reuse_Baseline.py +++ b/docs/examples/general/plot_reuse_Baseline.py @@ -10,7 +10,7 @@ matrix, and potentially its pseudoinverse, once. Likewise, :doc:`spline methods <../../../algorithms/spline>` will only have to compute the spline basis matrix once. Note that this only applies if the same non-data parameters -(eg. ``poly_order``, ``lam``, etc.) are used for each fit. +(eg. ``poly_order``, ``num_knots``, etc.) are used for each fit. This example will explore the efficiency of reusing the same ``Baseline`` object when fitting multiple datasets for different types of algorithms. 
diff --git a/docs/examples/spline/plot_lam_vs_num_knots.py b/docs/examples/spline/plot_lam_vs_num_knots.py index db1bc9a4..a5ddff14 100644 --- a/docs/examples/spline/plot_lam_vs_num_knots.py +++ b/docs/examples/spline/plot_lam_vs_num_knots.py @@ -72,7 +72,7 @@ def optimize_lam(data, known_baseline, func, previous_min=None, **kwargs): # Other baseline types could be examined, similar to the # :ref:`Whittaker lam vs data size example `, # which should give similar results. -plt.plot(utils._make_data(1000, bkg_type='exponential')[1]) +plt.plot(utils.make_data(1000, bkg_type='exponential', signal_type=2)[1]) # %% # The number of knots will vary from 20 to 1000 on a logarithmic scale. For each number @@ -84,10 +84,12 @@ def optimize_lam(data, known_baseline, func, previous_min=None, **kwargs): symbols = cycle(['o', 's', 'd', 'h', '^', 'x']) best_lams = np.empty((len(num_knots), len(num_points))) for i, num_knot in enumerate(num_knots): - min_lam = 0 + min_lam = None for j, num_x in enumerate(num_points): func = partial(Baseline().mixture_model, num_knots=num_knot, diff_order=2) - x, y, baseline = utils._make_data(num_x, bkg_type='exponential') + x, y, baseline = utils.make_data( + num_x, bkg_type='exponential', signal_type=2, noise_std=0.1, return_baseline=True + ) # use a slightly lower tolerance to speed up the calculation min_lam = optimize_lam(y, baseline, func, min_lam, tol=1e-2, max_iter=50) best_lams[i, j] = min_lam diff --git a/docs/examples/spline/plot_pspline_whittaker.py b/docs/examples/spline/plot_pspline_whittaker.py index 79d8eefe..29ab1262 100644 --- a/docs/examples/spline/plot_pspline_whittaker.py +++ b/docs/examples/spline/plot_pspline_whittaker.py @@ -75,7 +75,7 @@ def optimize_lam(data, known_baseline, func, previous_min=None, **kwargs): # Other baseline types could be examined, similar to the # :ref:`Whittaker lam vs data size example `, # which should give similar results. 
-plt.plot(utils._make_data(1000, bkg_type='exponential')[1]) +plt.plot(utils.make_data(1000, bkg_type='exponential', signal_type=2)[1]) # %% # For each function, the optimal `lam` value will be calculated for data sizes @@ -97,7 +97,9 @@ def optimize_lam(data, known_baseline, func, previous_min=None, **kwargs): min_lam = None for j, num_x in enumerate(num_points): func = getattr(Baseline(), func_name) - x, y, baseline = utils._make_data(num_x, bkg_type='exponential') + x, y, baseline = utils.make_data( + num_x, bkg_type='exponential', signal_type=2, return_baseline=True + ) # use a slightly lower tolerance to speed up the calculation min_lam = optimize_lam(y, baseline, func, min_lam, tol=1e-2, max_iter=50) best_lams[j] = min_lam diff --git a/docs/examples/whittaker/plot_lam_vs_data_size.py b/docs/examples/whittaker/plot_lam_vs_data_size.py index ac936a3a..6387bed0 100644 --- a/docs/examples/whittaker/plot_lam_vs_data_size.py +++ b/docs/examples/whittaker/plot_lam_vs_data_size.py @@ -88,7 +88,7 @@ def iasls(*args, lam=1, p=0.05, **kwargs): bkg_types = ('exponential', 'gaussian', 'sine') for bkg_type in bkg_types: bkg_dict[bkg_type] = {} - plt.plot(utils._make_data(1000, bkg_type)[1], label=bkg_type) + plt.plot(utils.make_data(1000, bkg_type, signal_type=2, noise_std=0.1)[1], label=bkg_type) plt.legend() # %% @@ -115,7 +115,9 @@ def iasls(*args, lam=1, p=0.05, **kwargs): func = iasls else: func = getattr(Baseline(), func_name) - x, y, baseline = utils._make_data(num_x, bkg_type) + x, y, baseline = utils.make_data( + num_x, bkg_type, signal_type=2, noise_std=0.1, return_baseline=True + ) # use a slightly lower tolerance to speed up the calculation min_lam = optimize_lam(y, baseline, func, min_lam, tol=1e-2, max_iter=50) best_lams[j] = min_lam diff --git a/docs/examples/whittaker/plot_whittaker_solvers.py b/docs/examples/whittaker/plot_whittaker_solvers.py index c32d5d55..46ae43da 100644 --- a/docs/examples/whittaker/plot_whittaker_solvers.py +++ 
b/docs/examples/whittaker/plot_whittaker_solvers.py @@ -103,7 +103,7 @@ def sparse_asls(data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weig for func_name in functions: timings = [] for num_x in data_sizes: - y = utils._make_data(num_x)[1] + y = utils.make_data(num_x, bkg_type='exponential', noise_std=0.1, signal_type=2)[1] lam = lam_equation(num_x) if func_name == 'sparse': func = sparse_asls diff --git a/docs/images/logo.png b/docs/images/logo.png index 99989a25..7814a29f 100644 Binary files a/docs/images/logo.png and b/docs/images/logo.png differ diff --git a/docs/index.rst b/docs/index.rst index a111a75e..8852e4a7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,6 +4,7 @@ pybaselines Documentation .. image:: images/logo.png :alt: Logo :align: center + :class: dark-light pybaselines is a library of algorithms for the baseline correction of experimental data. @@ -22,9 +23,9 @@ pybaselines is a library of algorithms for the baseline correction of experiment installation quickstart parameter_selection - performance algorithms/index algorithms_2d/index + performance generated/examples/index api/index contributing diff --git a/docs/installation.rst b/docs/installation.rst index 7a749037..e0db316c 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -25,23 +25,23 @@ pybaselines has the following optional dependencies: * `Numba `_ (>= 0.53): speeds up calculations used by the following functions: - * :meth:`~.Baseline.loess` - * :meth:`~.Baseline.dietrich` - * :meth:`~.Baseline.golotvin` - * :meth:`~.Baseline.std_distribution` - * :meth:`~.Baseline.fastchrom` - * :meth:`~.Baseline.beads` - * :meth:`~.Baseline.mpspline` - * all :ref:`spline ` methods + * :meth:`~.Baseline.loess` + * :meth:`~.Baseline.dietrich` + * :meth:`~.Baseline.golotvin` + * :meth:`~.Baseline.std_distribution` + * :meth:`~.Baseline.fastchrom` + * :meth:`~.Baseline.beads` + * :meth:`~.Baseline.mpspline` + * all :ref:`spline ` methods * `pentapy `_ (>= 1.1): provides a 
faster solver for banded pentadiagonal linear systems, which are used by the following functions (when ``diff_order=2``): - * all :ref:`Whittaker smoothing ` methods - * :meth:`~.Baseline.mpls` - * :meth:`~.Baseline.jbcd` - * :meth:`~.Baseline.fabc` + * all :ref:`Whittaker smoothing ` methods + * :meth:`~.Baseline.mpls` + * :meth:`~.Baseline.jbcd` + * :meth:`~.Baseline.fabc` Stable Release diff --git a/docs/parameter_selection.rst b/docs/parameter_selection.rst index ad4cdb35..944d459f 100644 --- a/docs/parameter_selection.rst +++ b/docs/parameter_selection.rst @@ -12,42 +12,42 @@ adjusting for each family of algorithms within pybaselines: * Polynomial methods - * ``poly_order`` controls the curvature of the baseline. + * ``poly_order`` controls the curvature of the baseline. * Whittaker-smoothing-based methods - * ``lam`` controls the curvature of the baseline. See - :ref:`this example ` - to get an idea of how ``lam`` effects the baseline. The optimal ``lam`` - value for each algorithm is not typically the same. + * ``lam`` controls the curvature of the baseline. See + :ref:`this example ` + to get an idea of how ``lam`` effects the baseline. The optimal ``lam`` + value for each algorithm is not typically the same. * Morphological methods - * ``half_window`` controls the general fit of the baseline. See - :ref:`this example ` - to get an idea of how ``half_window`` effects the baseline. The optimal - ``half_window`` value for each algorithm is not typically the same. + * ``half_window`` controls the general fit of the baseline. See + :ref:`this example ` + to get an idea of how ``half_window`` effects the baseline. The optimal + ``half_window`` value for each algorithm is not typically the same. * Spline methods - * ``lam`` controls the curvature of the baseline. The - :ref:`Whittaker example ` - also generally applies to spline methods. + * ``lam`` controls the curvature of the baseline. The + :ref:`Whittaker example ` + also generally applies to spline methods. 
* Smoothing-based methods - * ``half_window`` controls the general fit of the baseline. The - :ref:`Morphological example ` - also generally applies to smoothing methods. + * ``half_window`` controls the general fit of the baseline. The + :ref:`Morphological example ` + also generally applies to smoothing methods. * Baseline/Peak Classification methods - * Algorithm dependent + * Algorithm dependent * Optimizers - * Algorithm dependent + * Algorithm dependent * Miscellaneous methods - * Algorithm dependent + * Algorithm dependent diff --git a/pybaselines/classification.py b/pybaselines/classification.py index 2d520bd7..5eb72ed5 100644 --- a/pybaselines/classification.py +++ b/pybaselines/classification.py @@ -987,6 +987,8 @@ def _refine_mask(mask, min_length=2): Examples -------- + >>> import numpy as np + >>> from pybaselines.classification import _refine_mask >>> mask = np.array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]) >>> _refine_mask(mask, 3).astype(int) array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) diff --git a/pybaselines/polynomial.py b/pybaselines/polynomial.py index 01196004..46c0203d 100644 --- a/pybaselines/polynomial.py +++ b/pybaselines/polynomial.py @@ -378,16 +378,16 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non size equal to N and all values set to 1. cost_function : str, optional The non-quadratic cost function to minimize. Must indicate symmetry of the - method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or + method by prepending 'a' or 'asymmetric' for asymmetric loss, and 's' or 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. 
Available methods, and their associated reference, are: - * 'asymmetric_truncated_quadratic'[1]_ - * 'symmetric_truncated_quadratic'[1]_ - * 'asymmetric_huber'[1]_ - * 'symmetric_huber'[1]_ - * 'asymmetric_indec'[2]_ - * 'symmetric_indec'[2]_ + * 'asymmetric_truncated_quadratic'[1]_ + * 'symmetric_truncated_quadratic'[1]_ + * 'asymmetric_huber'[1]_ + * 'symmetric_huber'[1]_ + * 'asymmetric_indec'[2]_ + * 'symmetric_indec'[2]_ threshold : float, optional The threshold value for the loss method, where the function goes from @@ -761,7 +761,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, Notes ----- - Application of quantile regression for baseline fitting ss described in [1]_. + Application of quantile regression for baseline fitting as described in [1]_. Performs quantile regression using iteratively reweighted least squares (IRLS) as described in [2]_. @@ -833,9 +833,9 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, ('a' or 'asymmetric') is optional (eg. 'indec' and 'a_indec' are the same). Default is 'asymmetric_indec'. 
Available methods, and their associated reference, are: - * 'asymmetric_indec'[1]_ - * 'asymmetric_truncated_quadratic'[2]_ - * 'asymmetric_huber'[2]_ + * 'asymmetric_indec'[1]_ + * 'asymmetric_truncated_quadratic'[2]_ + * 'asymmetric_huber'[2]_ peak_ratio : float, optional A value between 0 and 1 that designates how many points in the data belong @@ -1383,7 +1383,7 @@ def _identify_loss_method(loss_method): """ prefix, *split_method = loss_method.lower().split('_') if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: - raise ValueError('must specify loss function symmetry by appending "a_" or "s_"') + raise ValueError('must specify loss function symmetry by prepending "a_" or "s_"') if prefix in ('a', 'asymmetric'): symmetric = False else: @@ -1421,16 +1421,16 @@ def penalized_poly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, size equal to N and all values set to 1. cost_function : str, optional The non-quadratic cost function to minimize. Must indicate symmetry of the - method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or + method by prepending 'a' or 'asymmetric' for asymmetric loss, and 's' or 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. Available methods, and their associated reference, are: - * 'asymmetric_truncated_quadratic'[7]_ - * 'symmetric_truncated_quadratic'[7]_ - * 'asymmetric_huber'[7]_ - * 'symmetric_huber'[7]_ - * 'asymmetric_indec'[8]_ - * 'symmetric_indec'[8]_ + * 'asymmetric_truncated_quadratic'[7]_ + * 'symmetric_truncated_quadratic'[7]_ + * 'asymmetric_huber'[7]_ + * 'symmetric_huber'[7]_ + * 'asymmetric_indec'[8]_ + * 'symmetric_indec'[8]_ threshold : float, optional The threshold value for the loss method, where the function goes from @@ -2189,9 +2189,9 @@ def goldindec(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=N ('a' or 'asymmetric') is optional (eg. 'indec' and 'a_indec' are the same). Default is 'asymmetric_indec'. 
Available methods, and their associated reference, are: - * 'asymmetric_indec'[18]_ - * 'asymmetric_truncated_quadratic'[19]_ - * 'asymmetric_huber'[19]_ + * 'asymmetric_indec'[18]_ + * 'asymmetric_truncated_quadratic'[19]_ + * 'asymmetric_huber'[19]_ peak_ratio : float, optional A value between 0 and 1 that designates how many points in the data belong diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index 8faf6edb..b92eb9b9 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -372,16 +372,16 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non shape equal to (M, N) and all values set to 1. cost_function : str, optional The non-quadratic cost function to minimize. Must indicate symmetry of the - method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or + method by prepending 'a' or 'asymmetric' for asymmetric loss, and 's' or 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. Available methods, and their associated reference, are: - * 'asymmetric_truncated_quadratic'[1]_ - * 'symmetric_truncated_quadratic'[1]_ - * 'asymmetric_huber'[1]_ - * 'symmetric_huber'[1]_ - * 'asymmetric_indec'[2]_ - * 'symmetric_indec'[2]_ + * 'asymmetric_truncated_quadratic'[1]_ + * 'symmetric_truncated_quadratic'[1]_ + * 'asymmetric_huber'[1]_ + * 'symmetric_huber'[1]_ + * 'asymmetric_indec'[2]_ + * 'symmetric_indec'[2]_ threshold : float, optional The threshold value for the loss method, where the function goes from @@ -547,7 +547,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, Notes ----- - Application of quantile regression for baseline fitting ss described in [1]_. + Application of quantile regression for baseline fitting as described in [1]_. Performs quantile regression using iteratively reweighted least squares (IRLS) as described in [2]_. 
@@ -790,7 +790,7 @@ def _identify_loss_method(loss_method): """ prefix, *split_method = loss_method.lower().split('_') if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: - raise ValueError('must specify loss function symmetry by appending "a_" or "s_"') + raise ValueError('must specify loss function symmetry by prepending "a_" or "s_"') if prefix in ('a', 'asymmetric'): symmetric = False else: diff --git a/pybaselines/utils.py b/pybaselines/utils.py index f08556ee..0b8a8f75 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -161,9 +161,9 @@ def gaussian_kernel(window_size, sigma=1.0): Notes ----- - Return gaus/sum(gaus) rather than creating a unit-area gaussian + Return ``gaus/sum(gaus)`` rather than creating a unit-area gaussian since the unit-area gaussian would have an area smaller than 1 - for window_size < ~ 6 * sigma. + for ``window_size <~ 6 * sigma``. """ # centers distribution from -half_window to half_window @@ -1084,16 +1084,24 @@ def pspline_smooth(data, x_data=None, lam=1e1, num_knots=100, spline_degree=3, d return y_smooth, pspline.tck -def _make_data(data_size=1000, bkg_type='exponential'): +def make_data(data_size=1000, bkg_type='gaussian_small', signal_type=1, noise_std=0.05, + return_baseline=False): """ - Creates simple datasets for use in examples. + Creates a simple dataset for use in examples and docstrings. Parameters ---------- data_size : int, optional The number of data points. Default is 1000. - bkg_type : {'exponential', 'gaussian', 'linear', 'sine'}, optional - A string designating the type of baseline to add to the data. Default is 'exponential'. + bkg_type : {'gaussian_small', 'exponential', 'linear', 'sine', 'gaussian'}, optional + A string designating the type of baseline to add to the data. Default is 'gaussian_small'. + signal_type : {1, 2}, optional + The type of signal to produce. In general, examples within the documentation use 2 + while docstring examples use 1. Default is 1. 
+ noise_std : float, optional + The standard deviation of the noise. Default is 0.05. + return_baseline : bool, optional + Whether to include the computed baseline in the output. Default is False. Returns ------- @@ -1102,12 +1110,12 @@ def _make_data(data_size=1000, bkg_type='exponential'): y : numpy.ndarray, shape (`data_size`,) The y-values for the example data. baseline : numpy.ndarray, shape (`data_size`,) - The true baseline for the example data. + Only returned if ``return_baseline`` is True. The true baseline for the example data. Raises ------ ValueError - Raised if `bkg_type` is not an allowed value. + Raised if ``bkg_type`` or ``signal_type`` is not an allowed value. Notes ----- @@ -1117,19 +1125,35 @@ def _make_data(data_size=1000, bkg_type='exponential'): """ x = np.linspace(0, 1000, data_size) - signal = ( - gaussian(x, 9, 100, 12) - + gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 11) - + gaussian(x, 15, 400, 18) - + gaussian(x, 6, 550, 6) - + gaussian(x, 13, 700, 8) - + gaussian(x, 9, 800, 9) - + gaussian(x, 9, 880, 7) - ) + + if signal_type == 1: + signal = ( + + gaussian(x, 4, 180, 12) + + gaussian(x, 15, 400, 10) + + gaussian(x, 6, 540, 6) + + gaussian(x, 13, 700, 8) + + gaussian(x, 9, 880, 7) + ) + elif signal_type == 2: + signal = ( + gaussian(x, 9, 100, 12) + + gaussian(x, 6, 180, 5) + + gaussian(x, 8, 350, 11) + + gaussian(x, 15, 400, 18) + + gaussian(x, 6, 550, 6) + + gaussian(x, 13, 700, 8) + + gaussian(x, 9, 800, 9) + + gaussian(x, 9, 880, 7) + ) + else: + raise ValueError(f'unknown signal_type {signal_type}') + + bkg_type = bkg_type.lower() if bkg_type == 'exponential': baseline = 5 + 15 * np.exp(-x / 200) - elif bkg_type == 'gaussian': + elif bkg_type == 'gaussian_small': + baseline = 15 + gaussian(x, 4, 650, 400) + elif bkg_type == 'gaussian': # a more pronounced gaussian than 'gaussian_small' baseline = 30 + gaussian(x, 20, 500, 150) elif bkg_type == 'linear': baseline = 1 + x * 0.005 @@ -1137,8 +1161,10 @@ def 
_make_data(data_size=1000, bkg_type='exponential'): baseline = 70 + 5 * np.sin(x / 50) else: raise ValueError(f'unknown bkg_type {bkg_type}') - - noise = np.random.default_rng(0).normal(0, 0.1, data_size) + noise = np.random.default_rng(0).normal(0, noise_std, data_size) y = signal + baseline + noise - return x, y, baseline + if return_baseline: + return x, y, baseline + + return x, y diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index 04dd3b6c..bfff13d2 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -20,21 +20,35 @@ class _Whittaker(_Algorithm): @_Algorithm._register(sort_keys=('weights',)) def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None): - """ - Fits the baseline using asymmetric least squares (AsLS) fitting. + r""" + Fits the baseline using the asymmetric least squares (AsLS) algorithm. + + Iteratively calculates the baseline, v, by solving the linear equation + :math:`(W + \lambda D_d^{\mathsf{T}} D_d) v = W y`, where y is the input data, + :math:`D_d` is the finite difference matrix of order d, W is the diagonal matrix + of the weights, and :math:`\lambda` (``lam``) is the regularization parameter. + + Each iteration, the weights are updated following: + + .. math:: + + w_i = \left\{\begin{array}{cr} + p & y_i > v_i \\ + 1 - p & y_i \le v_i + \end{array}\right. Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e6. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater - than the baseline will be given `p` weight, and values less than the baseline - will be given `1 - p` weight. Default is 1e-2. 
+ than the baseline will be given ``p`` weight, and values less than the baseline + will be given ``1 - p`` weight. Can be considered as analogous to the quantile of + the data to fit. Default is 1e-2. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. @@ -59,12 +73,17 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh An array containing the calculated tolerance values for each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input - `tol` value, then the function did not converge. + ``tol`` value, then the function did not converge. Raises ------ ValueError - Raised if `p` is not between 0 and 1. + Raised if ``p`` is not between 0 and 1. + + Notes + ----- + In literature, this algorithm is referred to as either "ALS" or "AsLS", + both standing for asymmetric least squares. References ---------- @@ -75,6 +94,47 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh Eilers, P. Parametric Time Warping. Analytical Chemistry, 2004, 76(2), 404-411. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from pybaselines import Baseline, utils + >>> x, y = utils.make_data(noise_std=0.2) + >>> baseline_fitter = Baseline(x) + + The two main parameters to vary are ``lam``, which controls the smoothness of the + baseline, and ``p``, which controls the weighting. In general, a higher ``p`` value + is needed for noisy data and correspondingly requires a slightly higher ``lam`` value.
+ + >>> fit_1, params_1 = baseline_fitter.asls(y, lam=1e6, p=0.001) + >>> fit_2, params_2 = baseline_fitter.asls(y, lam=1e7, p=0.02) + >>> plt.plot(x, y) + >>> plt.plot(x, fit_1, label='lam=1e6, p=0.001') + >>> plt.plot(x, fit_2, '--', label='lam=1e7, p=0.02') + >>> plt.legend() + >>> plt.show() + + The parameter ``p`` should typically be chosen such that the residuals, ``y - baseline``, + are centered around 0. In this example, the use of ``p=0.02`` fits the noise better. + + >>> plt.hist( + ... [y - fit_1, y - fit_2], bins=100, density=True, label=['p=0.001', 'p=0.02'], + ... histtype='step' + ... ) + >>> plt.xlabel('Residuals, y - baseline') + >>> plt.legend() + >>> plt.show() + + An alternative to adjusting ``p`` for noisy data is to simply smooth the data before + calling ``asls``, which allows for ignoring ``p`` (ie. selecting a ``p`` value near 0). + + >>> y_smooth = utils.whittaker_smooth(y, lam=1e1) + >>> fit_3, params_3 = baseline_fitter.asls(y_smooth, lam=1e6, p=0.001) + >>> plt.plot(x, y) + >>> plt.plot(x, y_smooth, '--', label='smoothed data') + >>> plt.plot(x, fit_3, label='smoothed fit') + >>> plt.legend() + >>> plt.show() + """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') @@ -99,16 +159,36 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh @_Algorithm._register(sort_keys=('weights',)) def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, weights=None, diff_order=2): - """ + r""" Fits the baseline using the improved asymmetric least squares (IAsLS) algorithm. - The algorithm consideres both the first and second derivatives of the residual. + The algorithm penalizes both the smoothness of the baseline as well as the first + derivative of the signal, ``y - baseline``. The baseline, v, is iteratively calculated + by solving the linear equation + + .. 
math:: + + (W^{\mathsf{T}} W + \lambda_1 D_1^{\mathsf{T}} D_1 + \lambda D_d^{\mathsf{T}} D_d) v + = (W^{\mathsf{T}} W + \lambda_1 D_1^{\mathsf{T}} D_1) y + + where y is the input data, :math:`D_d` is the finite difference matrix of order d, + :math:`D_1` is the first-order finite difference matrix, W is the diagonal matrix + of the weights, and :math:`\lambda` (``lam``) and :math:`\lambda_1` (``lam_1``) are + regularization parameters. + + Each iteration, the weights are updated following: + + .. math:: + + w_i = \left\{\begin{array}{cr} + p & y_i > v_i \\ + 1 - p & y_i \le v_i + \end{array}\right. Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with `N` data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e6. @@ -149,11 +229,79 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, ValueError Raised if `p` is not between 0 and 1 or if `diff_order` is less than 2. + Notes + ----- + Although both ``iasls`` and :meth:`~.Baseline.asls` use ``p`` for defining the weights, + the appropriate ``p`` value for ``iasls`` will be approximately equal to the square root + of the value used for ``asls`` since ``iasls`` squares the weights within its linear + equation. + + Omits the outer loop described by the reference implementation of the IAsLs algorithm, + in which the baseline fitting is repeated after subtracting the baseline from the data. + The outer loop makes the algorithm's performance difficult to predict and often leads + to either peak clipping or no change from the first fit baseline. If this outer loop is + desired, this method simply needs to be called repeatedly, as demonstrated in the + Examples section below. + References ---------- He, S., et al. 
Baseline correction for raman spectra using an improved asymmetric least squares method, Analytical Methods, 2014, 6(12), 4402-4407. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> import numpy as np + >>> from pybaselines import Baseline, utils + >>> x, y = utils.make_data(noise_std=0.2) + >>> baseline_fitter = Baseline(x) + + The two main parameters to vary are ``lam``, which controls the smoothness of the + baseline, and ``p``, which controls the weighting. In general, a higher ``p`` value + is needed for noisy data and correspondingly requires a slightly higher ``lam`` value. + + >>> fit_1, params_1 = baseline_fitter.iasls(y, lam=1e6, p=0.05) + >>> fit_2, params_2 = baseline_fitter.iasls(y, lam=1e7, p=0.15) + >>> plt.plot(x, y) + >>> plt.plot(x, fit_1, label='lam=1e6, p=0.05') + >>> plt.plot(x, fit_2, '--', label='lam=1e7, p=0.15') + >>> plt.legend() + >>> plt.show() + + The parameter ``p`` should typically be chosen such that the residuals, ``y - baseline``, + are centered around 0. In this example, the use of ``p=0.15`` fits the noise better. + + >>> plt.hist( + ... [y - fit_1, y - fit_2], bins=100, density=True, label=['p=0.05', 'p=0.15'], + ... histtype='step' + ... ) + >>> plt.xlabel('Residuals, y - baseline') + >>> plt.legend() + >>> plt.show() + + As stated in the Notes section above, the original IAsLS algorithm used an outer loop + which subtracted the fit baseline from the data and then used the result to fit a new + baseline. This behavior can be emulated by calling ``iasls`` within a loop as shown below. + + >>> baseline = np.zeros_like(y) + >>> data = y + >>> tol = 5e-3 + >>> max_iter = 50 + >>> for i in range(max_iter): + ... fit, params = baseline_fitter.iasls(data, lam=1e7, p=0.15) + ... residual = data - fit + ... if utils.relative_difference(data, residual) < tol: + ... break + ... data = residual + ...
baseline += fit + >>> print(f'Performed {i + 1} iterations') + Performed 3 iterations + >>> plt.plot(x, y) + >>> plt.plot(x, fit_2, label='single iteration') + >>> plt.plot(x, baseline, '--', label='multiple iterations') + >>> plt.legend() + >>> plt.show() + """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') @@ -201,14 +349,31 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, @_Algorithm._register(sort_keys=('weights',)) def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None, normalize_weights=False): - """ + r""" Adaptive iteratively reweighted penalized least squares (airPLS) baseline. + Iteratively calculates the baseline, v, by solving the linear equation + :math:`(W + \lambda D_d^{\mathsf{T}} D_d) v = W y`, where y is the input data, + :math:`D_d` is the finite difference matrix of order d, W is the diagonal matrix + of the weights, and :math:`\lambda` (``lam``) is the regularization parameter. + + Each iteration, the weights are updated following: + + .. math:: + + w_i = \left\{\begin{array}{cr} + 0 & y_i \ge v_i \\ + \exp{\left(\frac{\text{abs}(y_i - v_i) t}{|\mathbf{r}^-|}\right)} & y_i < v_i + \end{array}\right. + + where t is the iteration number and :math:`|\mathbf{r}^-|` is the L1-norm of the + negative values in the residual vector :math:`\mathbf r`, ie. + :math:`\sum\limits_{y_i - v_i < 0} |y_i - v_i|`. + Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e6. @@ -242,11 +407,49 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. 
+ Notes + ----- + The original publication for the airPLS algorithm contained a typo in its + weighting scheme which omitted the use of absolute values within the exponential, + as indicated by the author: https://github.com/zmzhang/airPLS/issues/8. + References ---------- Zhang, Z.M., et al. Baseline correction using adaptive iteratively reweighted penalized least squares. Analyst, 2010, 135(5), 1138-1146. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from pybaselines import Baseline, utils + >>> x, y = utils.make_data() + >>> baseline_fitter = Baseline(x) + + The main parameter to vary is ``lam``, which controls the smoothness of the + baseline, with larger values giving smoother baselines. + + >>> fit_1, params_1 = baseline_fitter.airpls(y, lam=1e4) + >>> fit_2, params_2 = baseline_fitter.airpls(y, lam=1e6) + >>> plt.plot(x, y) + >>> plt.plot(x, fit_1, label='lam=1e4') + >>> plt.plot(x, fit_2, '--', label='lam=1e6') + >>> plt.legend() + >>> plt.show() + + For noisy data, ``airpls`` underestimates the baseline, which can be mitigated by + smoothing the input data before calling the method. 
+ + >>> _, y_noisy = utils.make_data(noise_std=0.3) + >>> y_smooth = utils.whittaker_smooth(y_noisy, lam=1e2) + >>> fit_3, params_3 = baseline_fitter.airpls(y_noisy, lam=1e6) + >>> fit_4, params_4 = baseline_fitter.airpls(y_smooth, lam=1e6) + >>> plt.plot(x, y_noisy) + >>> plt.plot(x, y_smooth, '--', label='smoothed data') + >>> plt.plot(x, fit_3, ':', label='noisy fit') + >>> plt.plot(x, fit_4, label='smoothed fit') + >>> plt.legend() + >>> plt.show() + """ y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) y_l1_norm = np.abs(y).sum() @@ -274,14 +477,33 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non @_Algorithm._register(sort_keys=('weights',)) def arpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None): - """ + r""" Asymmetrically reweighted penalized least squares smoothing (arPLS). + Iteratively calculates the baseline, v, by solving the linear equation + :math:`(W + \lambda D_d^{\mathsf{T}} D_d) v = W y`, where y is the input data, + :math:`D_d` is the finite difference matrix of order d, W is the diagonal matrix + of the weights, and :math:`\lambda` (``lam``) is the regularization parameter. + + Each iteration, the weights are updated following: + + .. math:: + + w_i = \frac + {1} + {1 + \exp{\left(\frac + {2(r_i - (-\mu^- + 2 \sigma^-))} + {\sigma^-} + \right)}} + + where :math:`r_i = y_i - v_i` and :math:`\mu^-` and :math:`\sigma^-` are the mean and + standard deviation, respectively, of the negative values in the residual vector, + ``y - v``. + Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5.
@@ -316,6 +538,24 @@ def arpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None Baek, S.J., et al. Baseline correction using asymmetrically reweighted penalized least squares smoothing. Analyst, 2015, 140, 250-257. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from pybaselines import Baseline, utils + >>> x, y = utils.make_data(noise_std=0.2) + >>> baseline_fitter = Baseline(x) + + The main parameter to vary is ``lam``, which controls the smoothness of the + baseline, with larger values giving smoother baselines. + + >>> fit_1, params_1 = baseline_fitter.arpls(y, lam=1e4) + >>> fit_2, params_2 = baseline_fitter.arpls(y, lam=1e6) + >>> plt.plot(x, y) + >>> plt.plot(x, fit_1, label='lam=1e4') + >>> plt.plot(x, fit_2, '--', label='lam=1e6') + >>> plt.legend() + >>> plt.show() + """ y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) tol_history = np.empty(max_iter + 1) @@ -346,8 +586,7 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. @@ -442,8 +681,7 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. 
@@ -510,8 +748,7 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. @@ -623,8 +860,7 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. @@ -718,8 +954,7 @@ def derpsalsa(self, data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, to Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e6. @@ -842,8 +1077,7 @@ def brpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. @@ -946,8 +1180,7 @@ def lsrpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. 
Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. diff --git a/pyproject.toml b/pyproject.toml index f7de4786..002a43d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,11 +83,15 @@ test = [ "pytest>=6.0", # first version to work with pyproject.toml "ruff", ] +doctest = [ + "scipy-doctest", + "matplotlib", +] docs = [ "sphinx", "sphinx-copybutton", "sphinx-gallery>=0.16", # first version allowing strings to specify gallery sorting - "sphinx-rtd-theme", + "pydata-sphinx-theme", "matplotlib", "numpydoc", ] @@ -96,7 +100,7 @@ release = [ "bump-my-version", "twine", ] -dev = ["pybaselines[full, docs, test, release]"] +dev = ["pybaselines[full, docs, test, release, doctest]"] [tool.pytest.ini_options] diff --git a/requirements/requirements-documentation.txt b/requirements/requirements-documentation.txt index 18c77d8e..c574675e 100644 --- a/requirements/requirements-documentation.txt +++ b/requirements/requirements-documentation.txt @@ -7,14 +7,20 @@ pentapy==1.3.0 sphinx==8.2.3 sphinx-copybutton==0.5.2 sphinx-gallery==0.19.0 -sphinx-rtd-theme==3.0.2 +pydata-sphinx-theme==0.16.1 # Additional transitive dependencies, generated by pip-tools +accessible-pygments==0.0.5 + # via pydata-sphinx-theme alabaster==1.0.0 # via sphinx babel==2.17.0 - # via sphinx + # via + # pydata-sphinx-theme + # sphinx +beautifulsoup4==4.13.5 + # via pydata-sphinx-theme certifi==2025.1.31 # via requests charset-normalizer==3.4.1 @@ -27,8 +33,8 @@ cycler==0.12.1 # via matplotlib docutils==0.21.2 # via + # pydata-sphinx-theme # sphinx - # sphinx-rtd-theme fonttools==4.56.0 # via matplotlib idna==3.10 @@ -52,7 +58,9 @@ pillow==11.1.0 # matplotlib # sphinx-gallery pygments==2.19.1 - # via sphinx + # accessible-pygments + # pydata-sphinx-theme + # sphinx pyparsing==3.2.1 # via matplotlib 
python-dateutil==2.9.0.post0 @@ -65,14 +73,14 @@ six==1.17.0 # via python-dateutil snowballstemmer==2.2.0 # via sphinx +soupsieve==2.8 + # via beautifulsoup4 sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-devhelp==2.0.0 # via sphinx sphinxcontrib-htmlhelp==2.1.0 # via sphinx -sphinxcontrib-jquery==4.1 - # via sphinx-rtd-theme sphinxcontrib-jsmath==1.0.1 # via sphinx sphinxcontrib-qthelp==2.0.0 @@ -81,5 +89,9 @@ sphinxcontrib-serializinghtml==2.0.0 # via sphinx tabulate==0.9.0 # via numpydoc +typing-extensions==4.15.0 + # via + # beautifulsoup4 + # pydata-sphinx-theme urllib3==2.3.0 # via requests diff --git a/tests/conftest.py b/tests/conftest.py index 65ab1267..bef4bda5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,6 +14,17 @@ from .base_tests import get_data, get_data2d +pytest_plugins = [] +try: + # ignore if scipy-doctest is unavailable since it's not required for running non-doctest + # tests; if running doctests without scipy-doctest, the doctests will fail due to expecting no + # output from matplotlib's plotting routines + import scipy_doctest # noqa: F401 + pytest_plugins.append('scipy_doctest') +except ImportError: + pass + + def pytest_addoption(parser): """Adds additional pytest command line options.""" if hasattr(sys, '_is_gil_enabled'): # sys._is_gil_enabled added in Python 3.13 diff --git a/tests/test_utils.py b/tests/test_utils.py index 26b9893c..f1a1f6cb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -831,23 +831,45 @@ def test_optimize_window(small_data2d, two_d): @pytest.mark.parametrize('data_size', (500, 1000, 1001)) -@pytest.mark.parametrize('baseline_type', ('exponential', 'gaussian', 'linear', 'sine')) -def test_make_data(data_size, baseline_type): - """Ensures basic functionality of the _make_data function.""" - x, y, baseline = utils._make_data(data_size, baseline_type) +@pytest.mark.parametrize( + 'baseline_type', ('exponential', 'gaussian', 'linear', 'sine', 'gaussian_small') +) 
+@pytest.mark.parametrize('signal_type', (1, 2)) +@pytest.mark.parametrize('return_baseline', (True, False)) +def test_make_data(data_size, baseline_type, signal_type, return_baseline): + """Ensures basic functionality of the make_data function.""" + output = utils.make_data( + data_size, bkg_type=baseline_type, signal_type=signal_type, return_baseline=return_baseline, + noise_std=0.2 + ) + if return_baseline: + assert len(output) == 3 + x, y, baseline = output + else: + assert len(output) == 2 + x, y = output + baseline = None + assert isinstance(x, np.ndarray) assert isinstance(y, np.ndarray) - assert isinstance(baseline, np.ndarray) - assert x.shape == (data_size,) assert y.shape == (data_size,) - assert baseline.shape == (data_size,) + + if baseline is not None: + assert isinstance(baseline, np.ndarray) + assert baseline.shape == (data_size,) def test_make_data_invalid_baseline(): """Ensures an error is raised when an invalid baseline type is specified.""" with pytest.raises(ValueError): - utils._make_data(100, 'aaaaa') + utils.make_data(100, bkg_type='aaaaa') + + +def test_make_data_invalid_signal(): + """Ensures an error is raised when an invalid signal type is specified.""" + with pytest.raises(ValueError): + utils.make_data(100, signal_type=3) @pytest.mark.parametrize('half_window', (1, 2, 22)) diff --git a/tools/logo_plot.py b/tools/logo_plot.py index 672ca0eb..9fb6d23a 100644 --- a/tools/logo_plot.py +++ b/tools/logo_plot.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -"""Creates the logo plot. - -Creates the plot used for the logo; further edits are done in Inkscape to finalize the logo. +"""Creates the logo for pybaselines. 
Created on November 15, 2021 @@ -16,6 +14,7 @@ try: import matplotlib.pyplot as plt + from matplotlib import patheffects except ImportError: print('This file requires matplotlib to run') raise @@ -25,11 +24,15 @@ # assumes file is in pybaselines/tools image_directory = Path(__file__).parent - with plt.rc_context( - {'interactive': False, 'lines.linewidth': 2.5, - 'figure.dpi': 300} - ): - fig, ax = plt.subplots(tight_layout={'pad': 0.1}, frameon=False) + figure_dpi = 300 + with plt.rc_context({ + 'font.family': 'sans-serif', + 'font.sans-serif': 'arial', + }): + fig, ax = plt.subplots( + tight_layout={'pad': 0.1}, frameon=False, figsize=(1710 / figure_dpi, 473 / figure_dpi), + dpi=figure_dpi + ) x = np.linspace(1, 1000, 1000) signal = ( @@ -42,15 +45,31 @@ + utils.gaussian(x, 5, 880, 8) ) true_baseline = 2 + 1e-3 * x + utils.gaussian(x, 1, 600, 300) - noise = np.random.default_rng(1).normal(0, 0.05, x.size) + noise = np.random.default_rng(1).normal(0, 0.01, x.size) y = signal + true_baseline + noise baseline = Baseline().arpls(y, lam=1e7)[0] - blue = '#0952ff' + # for reference, see the matplotlib examples + # https://matplotlib.org/stable/gallery/text_labels_and_annotations/rainbow_text.html + # and https://matplotlib.org/stable/gallery/misc/patheffect_demo.html for how to make + # multicolored aligned text with borders + blue = '#137bff' pink = '#ff5255' - ax.plot(x, y, lw=1.5, color=blue) - ax.plot(x, baseline, lw=4, color=pink) + text_size = 52 + text_border = [patheffects.withStroke(linewidth=1.5, foreground='black')] + + ax.plot(x, y, lw=2.5, color=blue) + ax.plot(x, baseline, lw=3, color=pink) + + x_lims = ax.get_xlim() + ax.set_xlim(x_lims[0] - 200, x_lims[1] + 200) + ax.set_ylim(ax.get_ylim()[0] - 5) + text = ax.text(1, -1.8, 'py', color=blue, size=text_size, path_effects=text_border) + text = ax.annotate( + 'baselines', xycoords=text, xy=(1, 0), verticalalignment='bottom', size=text_size, + color=pink, path_effects=text_border + ) ax.set_yticks([]) 
ax.set_xticks([]) @@ -59,7 +78,10 @@ ax.spines['left'].set_visible(False) ax.spines['right'].set_visible(False) - # save as an svg so that it can be edited/scaled in inkskape without - # losing image quality - fig.savefig(image_directory.joinpath('logo_new.svg'), transparent=True) - plt.close(fig) + # No idea what's happening here, but for the first save, the scaling of the plot gets + # shrunk lengthwise such that it does not look like the displayed plot, but after saving + # again it looks correct... just overwrite the first and ignore whatever is causing this + fig.savefig(image_directory.joinpath('logo_new.png'), transparent=True) + fig.savefig(image_directory.joinpath('logo_new.png'), transparent=True) + + plt.show()