diff --git a/.github/workflows/python-test-latest.yml b/.github/workflows/python-test-latest.yml index c42791e1..81f79f1e 100644 --- a/.github/workflows/python-test-latest.yml +++ b/.github/workflows/python-test-latest.yml @@ -40,6 +40,8 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index f7c53771..fada8bd5 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -40,12 +40,8 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - # fetch-depth: '0' is needed to fetch tags for hatch-vcs (and thus setuptools-scm) - # to generate the version correctly; only really needed for jobs that would build and - # upload to pypi, so ignore it since fetching the entire git history can take much longer - # than the default of fetching just the last commit - #with: - #fetch-depth: '0' + with: + persist-credentials: false - name: Set up Python uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 @@ -81,6 +77,8 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 @@ -113,6 +111,8 @@ jobs: steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 @@ -124,3 +124,32 @@ jobs: - name: Lint run: ruff check . 
+ + doctest: + + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + # only test earliest and latest supported Python versions since the examples should + # be fairly simple + python-version: ['3.9', '3.13'] + + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + persist-credentials: false + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + # TODO will need to think of how to handle doctests when they rely on optional dependencies + run: | + python -m pip install --upgrade pip + python -m pip install ".[test, doctest]" + + - name: Run doctests + run: pytest pybaselines --doctest-modules diff --git a/docs/algorithms/classification.rst b/docs/algorithms/classification.rst index e54ba443..fb687afe 100644 --- a/docs/algorithms/classification.rst +++ b/docs/algorithms/classification.rst @@ -17,6 +17,8 @@ method. The plot below shows such an example. .. plot:: :align: center + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -75,6 +77,8 @@ points, and then iteratively fitting a polynomial to the interpolated baseline. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -206,6 +210,8 @@ the noise's standard deviation as belonging to the baseline. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -234,6 +240,8 @@ of the median of the noise's standard deviation distribution. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -263,6 +271,8 @@ threshold, and peak regions are iteratively interpolated until the baseline is b .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -291,6 +301,8 @@ other points have a weight of 0. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True scales = np.arange(2, 40) # to see contents of create_data function, look at the top-most algorithm's code @@ -327,6 +339,8 @@ a weight of 0. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -369,6 +383,8 @@ peak positions. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -397,6 +413,8 @@ slightly reduced, as seen below. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/misc.rst b/docs/algorithms/misc.rst index 169ea7a2..08b8f3f3 100644 --- a/docs/algorithms/misc.rst +++ b/docs/algorithms/misc.rst @@ -25,6 +25,8 @@ with user interfaces and is not encouraged otherwise. .. 
plot:: :align: center + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -81,6 +83,8 @@ as sparse. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -205,6 +209,8 @@ of the beads function. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the second-to-top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/morphological.rst b/docs/algorithms/morphological.rst index 2cfe47ca..6c8e0aaa 100644 --- a/docs/algorithms/morphological.rst +++ b/docs/algorithms/morphological.rst @@ -13,6 +13,8 @@ below. The algorithms in this section use these operations to estimate the basel .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -80,6 +82,8 @@ method. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -212,6 +216,8 @@ erosion and dilation of the opening to create the baseline. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -235,6 +241,8 @@ create the baseline. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -252,6 +260,8 @@ kernel, to produce a smooth baseline. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -276,6 +286,8 @@ and opening of either the data (first iteration) or previous iteration's baselin .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -294,6 +306,8 @@ resembles rolling a ball across the data. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -315,6 +329,8 @@ then smooths the result with a moving average. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -340,6 +356,8 @@ tophat (Top-hat Transformation) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -367,6 +385,8 @@ Finally, a penalized spline is fit to the smoothed data with the assigned weight .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -419,11 +439,11 @@ linear equations: .. math:: - (I + 2 \gamma D_d^{\top} D_d) s^n = y - v^{n-1} + (I + 2 \gamma D_d^{\mathsf{T}} D_d) s^n = y - v^{n-1} .. 
math:: - (I + 2 \alpha I + 2 \beta D_d^{\top} D_d) v^n = y - s^n + 2 \alpha Op + (I + 2 \alpha I + 2 \beta D_d^{\mathsf{T}} D_d) v^n = y - s^n + 2 \alpha Op where :math:`I` is the identity matrix and :math:`D_d` is the matrix version of :math:`\Delta^d`, which is also the d-th derivative of the identity matrix. @@ -433,6 +453,8 @@ multipliers. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -452,6 +474,8 @@ of the jbcd function. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the second-to-top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/optimizers.rst b/docs/algorithms/optimizers.rst index 011caf55..b6ac5194 100644 --- a/docs/algorithms/optimizers.rst +++ b/docs/algorithms/optimizers.rst @@ -26,6 +26,8 @@ An example of data with added baseline and Gaussian peaks is shown below. .. plot:: :align: center + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -88,6 +90,8 @@ added linear regions is selected as the optimal parameter. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -225,6 +229,8 @@ algorithm versus the individual baselines from the mpls method. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True x = np.linspace(1, 1000, 500) signal = ( @@ -283,6 +289,8 @@ the name). .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/polynomial.rst b/docs/algorithms/polynomial.rst index 105c6b17..a8e43351 100644 --- a/docs/algorithms/polynomial.rst +++ b/docs/algorithms/polynomial.rst @@ -25,7 +25,7 @@ However, since only the baseline of the data is desired, the least-squares approach must be modified. For polynomial-based algorithms, this is done by 1) only fitting the data in regions where there is only baseline (termed selective masking), 2) modifying the y-values being fit each iteration, termed -thresholding, or 3) penalyzing outliers. +thresholding, or 3) penalizing outliers. .. _selective-masking-explanation: @@ -142,14 +142,15 @@ Thresholding ~~~~~~~~~~~~ Thresholding is an iterative method that first fits the data using -traditional least-squares, and then sets the next iteration's fit data +traditional least-squares and then sets the next iteration's data-to-fit as the element-wise minimum between the current data and the current fit. -The figure below illustrates the iterative thresholding. +The figure below illustrates this iterative thresholding. .. plot:: :align: center :context: close-figs :include-source: False + :show-source-link: True fig, axes = plt.subplots( 2, 2, gridspec_kw={'hspace': 0, 'wspace': 0}, @@ -176,14 +177,15 @@ The figure below illustrates the iterative thresholding. The algorithms in pybaselines that use thresholding are :meth:`~.Baseline.modpoly`, :meth:`~.Baseline.imodpoly`, and :meth:`~.Baseline.loess` (if ``use_threshold`` is True). 
-Penalyzing Outliers +Penalizing Outliers ~~~~~~~~~~~~~~~~~~~ -The algorithms in pybaselines that penalyze outliers are -:meth:`~.Baseline.penalized_poly`, which incorporate the penalty directly into the -minimized cost function, and :meth:`~.Baseline.loess` (if ``use_threshold`` is False), -which incorporates penalties by applying lower weights to outliers. Refer -to the particular algorithms below for more details. +The polynomial algorithms in pybaselines that penalize outliers include +:meth:`~.Baseline.penalized_poly`, which incorporates the penalty directly into the +minimized cost function, and :meth:`~.Baseline.loess` (if ``use_threshold`` is False) +and :meth:`~.Baseline.quant_reg`, which use +:ref:`iterative reweighting ` to apply lower weights +to outliers. Refer to the particular algorithms below for more details. Algorithms @@ -201,6 +203,8 @@ of the data since masking is time-consuming. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -325,6 +329,8 @@ baseline to data. `modpoly` is also sometimes called "ModPolyFit" in literature, .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -362,6 +368,8 @@ and both `modpoly` and `imodpoly` are sometimes referred to as "IPF" or "Iterati .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -393,6 +401,8 @@ The plots below show the symmetric and asymmetric forms of the cost functions. .. 
plot:: :align: center + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -474,6 +484,8 @@ The plots below show the symmetric and asymmetric forms of the cost functions. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -507,6 +519,8 @@ is reduced by iterative reweighting. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -541,6 +555,8 @@ quant_reg (Quantile Regression) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -567,6 +583,8 @@ based on the input `peak_ratio` value. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True peak_ratios = [0.2, 0.6, 0.2, 0.2, 0.3] # to see contents of create_data function, look at the top-most algorithm's code diff --git a/docs/algorithms/smooth.rst b/docs/algorithms/smooth.rst index 38f8ab47..965e6eb0 100644 --- a/docs/algorithms/smooth.rst +++ b/docs/algorithms/smooth.rst @@ -27,6 +27,8 @@ kernel. Note that this method does not perform well for tightly-grouped peaks. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -157,6 +159,8 @@ index-based width of the largest peak or feature. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -177,6 +181,8 @@ data. The baselines when using decreasing window size and smoothing is shown bel .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -202,6 +208,8 @@ incrementally increased to smooth peaks until convergence is reached. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -219,11 +227,13 @@ ipsa (Iterative Polynomial Smoothing Algorithm) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ :meth:`~.Baseline.ipsa` iteratively smooths the input data using a second-order -Savitzky–Golay filter until the exit criteria is reached. +Savitzky-Golay filter until the exit criteria is reached. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -243,13 +253,15 @@ ria (Range Independent Algorithm) right edges of the data and adds Gaussian peaks to these baselines, similar to the :ref:`optimize_extended_range ` function, and records their initial areas. The data is then iteratively smoothed using a -zero-order Savitzky–Golay filter (moving average) until the area of the extended +zero-order Savitzky-Golay filter (moving average) until the area of the extended regions after subtracting the smoothed data from the initial data is close to their starting areas. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -278,6 +290,8 @@ is then interpolated back into the original data size. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) diff --git a/docs/algorithms/spline.rst b/docs/algorithms/spline.rst index 714dbc69..e6f88831 100644 --- a/docs/algorithms/spline.rst +++ b/docs/algorithms/spline.rst @@ -11,19 +11,19 @@ predominantly used in pybaselines. B-splines can be expressed as: .. math:: - v(x) = \sum\limits_{i}^N \sum\limits_{j}^M {B_j(x_i) c_j} + v(x) = \sum\limits_{i}^N \sum\limits_{j}^M c_j {B_j(x_i)} where :math:`N` is the number of points in :math:`x`, :math:`M` is the number of spline basis functions, :math:`B_j(x_i)` is the j-th basis function evaluated at :math:`x_i`, -and :math:`c_j` is the coefficient for the j-th basis (which is analogous to +and :math:`c_j` is the coefficient vector for the j-th basis (which is analogous to the height of the j-th basis). In pybaselines, the number of spline basis functions, -:math:`M`, is calculated as the number of knots, `num_knots`, plus the spline degree +:math:`M`, is calculated as the number of knots, ``num_knots``, plus the spline degree minus 1. For regular B-spline fitting, the spline coefficients that best fit the data are gotten from minimizing the least-squares: -.. math:: \sum\limits_{i}^N w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j})^2 +.. math:: \sum\limits_{i}^N w_i (y_i - v(x_i))^2 where :math:`y_i` and :math:`x_i` are the measured data, and :math:`w_i` is the weighting. In order to control the smoothness of the fitting spline, a penalty @@ -34,7 +34,7 @@ The minimized function for P-splines is thus: .. 
math:: - \sum\limits_{i}^N w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j})^2 + \sum\limits_{i}^N w_i (y_i - v(x_i))^2 + \lambda \sum\limits_{i}^{M - d} (\Delta^d c_i)^2 where :math:`\lambda` is the penalty scale factor, and @@ -45,14 +45,21 @@ The resulting linear equation for solving the above minimization is: .. math:: - (B^{\top} W B + \lambda D_d^{\top} D_d) c = B^{\top} W y + (B^{\mathsf{T}} W B + \lambda D_d^{\mathsf{T}} D_d) c = B^{\mathsf{T}} W y + +and the baseline is given by: + +.. math:: + + v = B c where :math:`W` is the diagaonal matrix of the weights, :math:`B` is the matrix containing all of the spline basis functions, and :math:`D_d` is the matrix version of :math:`\Delta^d` (same as :ref:`explained ` for Whittaker-smoothing-based algorithms). P-splines have similarities with Whittaker -smoothing; in fact, if the number of basis functions, :math:`M`, is set up to be equal -to the number of data points, :math:`N`, and the spline degree is set to 0, then +smoothing, including the use of :ref:`iterative reweighting ` +to calculate the baseline; in fact, if the number of basis functions, :math:`M`, is set up to +be equal to the number of data points, :math:`N`, and the spline degree is set to 0, then :math:`B` becomes the identity matrix and the above equation becomes identical to the equation used for Whittaker smoothing. @@ -75,6 +82,8 @@ residual belonging to the noise's normal distribution. .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -206,6 +215,8 @@ to perform quantile regression on the data. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True quantiles = {0: 0.3, 1: 0.1, 2: 0.2, 3: 0.25, 4: 0.5} # to see contents of create_data function, look at the top-most algorithm's code @@ -232,6 +243,8 @@ between all but the first and last non-corner points. .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -256,14 +269,14 @@ Minimized function: .. math:: - \sum\limits_{i}^N w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j})^2 + \sum\limits_{i}^N w_i (y_i - v(x_i))^2 + \lambda \sum\limits_{i}^{M - d} (\Delta^d c_i)^2 Linear system: .. math:: - (B^{\top} W B + \lambda D_d^{\top} D_d) c = B^{\top} W y + (B^{\mathsf{T}} W B + \lambda D_d^{\mathsf{T}} D_d) c = B^{\mathsf{T}} W y Weighting: @@ -277,6 +290,8 @@ Weighting: .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -304,16 +319,16 @@ Minimized function: .. math:: - \sum\limits_{i}^N (w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j}))^2 + \sum\limits_{i}^N (w_i (y_i - v(x_i)))^2 + \lambda \sum\limits_{i}^{M - d} (\Delta^d c_i)^2 - + \lambda_1 \sum\limits_{i}^{N - 1} (\Delta^1 (y_i - \sum\limits_{j}^M {B_j(x_i) c_j}))^2 + + \lambda_1 \sum\limits_{i}^{N - 1} (\Delta^1 (y_i - v(x_i)))^2 Linear system: .. math:: - (B^{\top} W^{\top} W B + \lambda_1 B^{\top} D_1^{\top} D_1 B + \lambda D_d^{\top} D_d) c - = (B^{\top} W^{\top} W B + \lambda_1 B^{\top} D_1^{\top} D_1) y + (B^{\mathsf{T}} W^{\mathsf{T}} W B + \lambda_1 B^{\mathsf{T}} D_1^{\mathsf{T}} D_1 B + \lambda D_d^{\mathsf{T}} D_d) c + = (B^{\mathsf{T}} W^{\mathsf{T}} W B + \lambda_1 B^{\mathsf{T}} D_1^{\mathsf{T}} D_1) y Weighting: @@ -328,6 +343,8 @@ Weighting: .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -357,14 +374,14 @@ Minimized function: .. 
math:: - \sum\limits_{i}^N w_i (y_i - \sum\limits_{j}^M {B_j(x_i) c_j})^2 + \sum\limits_{i}^N w_i (y_i - v(x_i))^2 + \lambda \sum\limits_{i}^{M - d} (\Delta^d c_i)^2 Linear system: .. math:: - (B^{\top} W B + \lambda D_d^{\top} D_d) c = B^{\top} W y + (B^{\mathsf{T}} W B + \lambda D_d^{\mathsf{T}} D_d) c = B^{\mathsf{T}} W y Weighting: @@ -383,6 +400,8 @@ publication, as `specified by the author `_ (note that Whittaker -smoothing is often also referred to as Whittaker-Henderson smoothing and as Hodrick-Prescott +smoothing is often also referred to as Whittaker-Henderson smoothing or as Hodrick-Prescott filtering in the case of a second order difference). The general idea behind Whittaker smoothing algorithms is to make the baseline match the measured data as well as it can while also penalizing the roughness of the baseline. The @@ -31,7 +31,7 @@ The resulting linear equation for solving the above minimization is: .. math:: - (W + \lambda D_d^{\top} D_d) v = W y + (W + \lambda D_d^{\mathsf{T}} D_d) v = W y .. _difference-matrix-explanation: @@ -61,11 +61,111 @@ and :math:`D_2` (second order difference matrix) is: Most Whittaker-smoothing-based techniques recommend using the second order difference matrix, although some techniques use both the first and second order difference matrices. -The baseline is iteratively calculated using the linear system above by solving for -the baseline, :math:`v`, updating the weights, solving for the baseline using the new -weights, and repeating until some exit criteria. -The difference between Whittaker-smoothing-based algorithms is the selection of weights -and/or the function that is minimized. +.. 
_iterative-reweighting-explanation: + +Baseline algorithms based on Whittaker smoothing use +`iterative reweighting `_, +in which the baseline, :math:`v`, is calculated by solving the linear equation above, updating the +weights based on that baseline, solving for a new baseline using the new weights, and repeating +until some exit criteria. The major difference between Whittaker-smoothing-based algorithms is the +selection of weights and/or the function that is minimized. An example of this process is shown +below using the :meth:`~.Baseline.arpls` method. + +.. plot:: + :align: center + :context: reset + :include-source: False + :show-source-link: True + :nofigs: + + from pathlib import Path + + import matplotlib.animation as animation + import matplotlib.pyplot as plt + import numpy as np + + from pybaselines import Baseline, utils + + x = np.linspace(1, 1000, 1000) + signal = ( + utils.gaussian(x, 4, 120, 5) + + utils.gaussian(x, 5, 220, 12) + + utils.gaussian(x, 5, 350, 10) + + utils.gaussian(x, 7, 400, 8) + + utils.gaussian(x, 4, 550, 6) + + utils.gaussian(x, 5, 660, 14) + + utils.gaussian(x, 4, 750, 12) + + utils.gaussian(x, 5, 880, 8) + ) + true_baseline = 2 + 10 * np.exp(-x / 400) + noise = np.random.default_rng(1).normal(0, 0.2, x.size) + + y = signal + true_baseline + noise + + baseline_fitter = Baseline(x_data=x) + tol = 1e-3 + lam = 1e5 + + # do a complete fit once to figure out how many iterations (ie. 
frames) are required + _, params_complete = baseline_fitter.arpls(y, lam=lam, tol=tol) + max_frames = len(params_complete['tol_history']) + + # do an initial fit to get axes set up + fit, params = baseline_fitter.arpls(y, lam=lam, max_iter=0, tol=tol) + fig, (ax, ax2, ax3) = plt.subplots(nrows=3, tight_layout=True) + + ax.plot(x, y, label='data') + baseline_plot = ax.plot(x, fit, label='baseline')[0] + weight_plot = ax2.plot(x, np.ones_like(y), 'r.')[0] + tol_plot = ax3.plot(1, params['tol_history'], 'o-')[0] + ax3.axhline(tol, linestyle='--', color='k', label='Exit tolerance') + + ax.set_xlabel('x') + ax.legend() + ax2.set(ylim=[-0.1, 1.1], ylabel='Weights', xlabel='x') + ax3.set( + xlim=[0, max_frames + 1], ylim=[tol / 10, params_complete['tol_history'].max() * 10], + ylabel='Tolerance', xlabel='Iteration' + ) + ax3.semilogy() + ax3.legend() + + def update_lines(frame): + # frame == max number of iterations + fit, params = baseline_fitter.arpls(y, lam=lam, max_iter=frame, tol=tol) + baseline_plot.set_ydata(fit) + # until convergence, output weights actually correspond to iteration `frame + 1`, + # so do a second fit using `frame - 1` to get the correct weights for this frame + if frame == 0: + weight_plot.set_ydata(np.ones_like(y)) + else: + _, params_weights = baseline_fitter.arpls(y, lam=lam, max_iter=frame - 1, tol=tol) + weight_plot.set_ydata(params_weights['weights']) + changed_lines = [baseline_plot, weight_plot] + if len(params['tol_history']) == frame + 1: # in case tolerance was reached early + tol_plot.set_xdata(np.arange(frame + 1) + 1) + tol_plot.set_ydata(params['tol_history']) + changed_lines.append(tol_plot) + + return changed_lines + + # matplotlib's `plot` directive does not display gifs, so save the generated + # gif instead and then embed it below + ani = animation.FuncAnimation( + fig=fig, func=update_lines, frames=max_frames, blit=True, interval=500 + ) + if Path.cwd().stem == 'algorithms': # within documentation, so just save the gif + 
output_path = Path('../generated/images') + output_path.mkdir(exist_ok=True, parents=True) + ani.save(output_path.joinpath('iterative_reweighting.gif')) + plt.close(fig) + else: # downloaded, so show the plot rather than saving + plt.show() + +.. image:: ../generated/images/iterative_reweighting.gif + :alt: Iterative Reweighting + :align: center + .. note:: The :math:`\lambda` (``lam``) value required to fit a particular baseline for all @@ -94,7 +194,7 @@ Linear system: .. math:: - (W + \lambda D_d^{\top} D_d) v = W y + (W + \lambda D_d^{\mathsf{T}} D_d) v = W y Weighting: @@ -109,6 +209,8 @@ Weighting: .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -247,8 +349,8 @@ Linear system: .. math:: - (W^{\top} W + \lambda_1 D_1^{\top} D_1 + \lambda D_d^{\top} D_d) v - = (W^{\top} W + \lambda_1 D_1^{\top} D_1) y + (W^{\mathsf{T}} W + \lambda_1 D_1^{\mathsf{T}} D_1 + \lambda D_d^{\mathsf{T}} D_d) v + = (W^{\mathsf{T}} W + \lambda_1 D_1^{\mathsf{T}} D_1) y Weighting: @@ -263,6 +365,8 @@ Weighting: .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code figure, axes, handles = create_plots(data, baselines) @@ -279,7 +383,7 @@ Weighting: else: lam = 1e3 p = 0.01 - baseline, params = baseline_fitter.iasls(y, lam=lam, lam_1=1e-4, p=p) + baseline, params = baseline_fitter.iasls(y, lam=lam, p=p) ax.plot(baseline, 'g--') @@ -299,7 +403,7 @@ Linear system: .. math:: - (W + \lambda D_d^{\top} D_d) v = W y + (W + \lambda D_d^{\mathsf{T}} D_d) v = W y Weighting: @@ -310,7 +414,7 @@ Weighting: \exp{\left(\frac{\text{abs}(y_i - v_i) t}{|\mathbf{r}^-|}\right)} & y_i < v_i \end{array}\right. 
-where :math:`t` is the iteration number and :math:`|\mathbf{r}^-|` is the l1-norm of the negative +where :math:`t` is the iteration number and :math:`|\mathbf{r}^-|` is the L1-norm of the negative values in the residual vector :math:`\mathbf r`, ie. :math:`\sum\limits_{y_i - v_i < 0} |y_i - v_i|`. Note that the absolute value within the weighting was mistakenly omitted in the original publication, as `specified by the author `_. @@ -318,6 +422,8 @@ publication, as `specified by the author `_ -of a matrix with itself such that :math:`F(B_r) = (B_r \otimes 1_{g}^{\top}) \odot (1_{g}^{\top} \otimes B_r)` -and :math:`F(B_c) = (B_c \otimes 1_{h}^{\top}) \odot (1_{h}^{\top} \otimes B_c)`, where +of a matrix with itself such that :math:`F(B_r) = (B_r \otimes 1_{g}^{\mathsf{T}}) \odot (1_{g}^{\mathsf{T}} \otimes B_r)` +and :math:`F(B_c) = (B_c \otimes 1_{h}^{\mathsf{T}}) \odot (1_{h}^{\mathsf{T}} \otimes B_c)`, where :math:`1_g` and :math:`1_h` are vectors of ones of length :math:`g` and :math:`h`, respecitvely, and :math:`\odot` signifies elementwise multiplication. Then the linear equation can be rewritten as: .. math:: - (F(B_r)^{\top} W F(B_c) + \lambda_r I_h \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_g) \alpha = B_{r}^{\top} (W \odot Y) B_c + (F(B_r)^{\mathsf{T}} W F(B_c) + \lambda_r I_h \otimes D_{d_r}^{\mathsf{T}} D_{d_r} + \lambda_c D_{d_c}^{\mathsf{T}} D_{d_c} \otimes I_g) \alpha = B_{r}^{\mathsf{T}} (W \odot Y) B_c and the baseline is: .. math:: - V = B_r \alpha B_{c}^{\top} + V = B_r \alpha B_{c}^{\mathsf{T}} Algorithms @@ -96,6 +96,8 @@ mixture_model (Mixture Model) .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -157,6 +159,8 @@ irsqr (Iterative Reweighted Spline Quantile Regression) .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.irsqr(y, lam=(1e3, 1e2), quantile=0.3) @@ -172,6 +176,8 @@ pspline_asls (Penalized Spline Version of asls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_asls(y, lam=(1e3, 1e0), p=0.005) @@ -187,6 +193,8 @@ pspline_iasls (Penalized Spline Version of iasls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_iasls(y, lam=(1e2, 1e-2)) @@ -202,6 +210,8 @@ pspline_airpls (Penalized Spline Version of airpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_airpls(y, lam=(1e3, 1e-1)) @@ -217,6 +227,8 @@ pspline_arpls (Penalized Spline Version of arpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_arpls(y, lam=(1e3, 5e0)) @@ -232,6 +244,8 @@ pspline_iarpls (Penalized Spline Version of iarpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_iarpls(y, lam=(1e2, 1e0)) @@ -247,6 +261,8 @@ pspline_psalsa (Penalized Spline Version of psalsa) .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_psalsa(y, lam=(1e3, 5e0), k=0.5) @@ -262,6 +278,8 @@ pspline_brpls (Penalized Spline Version of brpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_brpls(y, lam=(1e3, 5e0)) @@ -277,6 +295,8 @@ pspline_lsrpls (Penalized Spline Version of lsrpls) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.pspline_lsrpls(y, lam=(1e3, 5e0)) diff --git a/docs/algorithms_2d/whittaker_2d.rst b/docs/algorithms_2d/whittaker_2d.rst index 0a546ff4..ba4a6354 100644 --- a/docs/algorithms_2d/whittaker_2d.rst +++ b/docs/algorithms_2d/whittaker_2d.rst @@ -32,7 +32,7 @@ The resulting linear equation for solving the above minimization is: .. math:: - (W_{diag} + \lambda_r I_M \otimes D_{d_r}^{\top} D_{d_r} + \lambda_c D_{d_c}^{\top} D_{d_c} \otimes I_M) v = w y + (W_{diag} + \lambda_r I_M \otimes D_{d_r}^{\mathsf{T}} D_{d_r} + \lambda_c D_{d_c}^{\mathsf{T}} D_{d_c} \otimes I_M) v = w y where :math:`W_{diag}` is the diagaonal matrix of the flattened weights, and :math:`D_d` is the matrix @@ -54,7 +54,7 @@ Eigendecomposition By following the excellent insights laid out by G. Biessy in `[2] `_, the dimensionality of the system can be reduced by using eigendecomposition on each of the two -penalty matrices, :math:`D_{d_r}^{\top} D_{d_r}` and :math:`D_{d_c}^{\top} D_{d_c}`. (Note that speeding up +penalty matrices, :math:`D_{d_r}^{\mathsf{T}} D_{d_r}` and :math:`D_{d_c}^{\mathsf{T}} D_{d_c}`. 
(Note that speeding up Whittaker smoothing using `factorization in 1D `_ and using the `analytical eigenvalues in nD (great paper) `_ are established methods, although they require using a fixed difference order, and, in the second case, of using @@ -63,7 +63,7 @@ The general eigendecomposition of the penalty matrix gives .. math:: - D_{d}^{\top} D_{d} = U \Sigma U^{\top} + D_{d}^{\mathsf{T}} D_{d} = U \Sigma U^{\mathsf{T}} where :math:`U` is the matrix of eigenvectors and :math:`\Sigma` is a diagonal matrix with the eigenvalues along the diagonal. Letting :math:`B = U_c \otimes U_r` denote the Kronecker @@ -73,7 +73,7 @@ can be rewritten as: .. math:: - (B^{\top} W_{diag} B + \lambda_r I_h \otimes \Sigma_r + \lambda_c \Sigma_c \otimes I_g) \alpha = B^{\top} W_{diag} y + (B^{\mathsf{T}} W_{diag} B + \lambda_r I_h \otimes \Sigma_r + \lambda_c \Sigma_c \otimes I_g) \alpha = B^{\mathsf{T}} W_{diag} y and the baseline is then: @@ -117,6 +117,8 @@ asls (Asymmetric Least Squares) .. plot:: :align: center :context: reset + :include-source: False + :show-source-link: True import numpy as np import matplotlib.pyplot as plt @@ -179,6 +181,8 @@ Eigendecomposition is not allowed for this method. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_data function, look at the top-most algorithm's code baseline, params = baseline_fitter.iasls(y, lam=(1e3, 1e0)) @@ -194,6 +198,8 @@ airpls (Adaptive Iteratively Reweighted Penalized Least Squares) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.airpls(y, lam=(1e3, 1e1)) @@ -209,6 +215,8 @@ arpls (Asymmetrically Reweighted Penalized Least Squares) .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.arpls(y, lam=(1e4, 1e2)) @@ -225,6 +233,8 @@ Eigendecomposition is not allowed for this method. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.drpls(y, lam=(1e3, 1e2)) @@ -240,6 +250,8 @@ iarpls (Improved Asymmetrically Reweighted Penalized Least Squares) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.iarpls(y, lam=(1e3, 1e2)) @@ -256,6 +268,8 @@ Eigendecomposition is not allowed for this method. .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.aspls(y, lam=(1e3, 1e2)) @@ -271,6 +285,8 @@ psalsa (Peaked Signal's Asymmetric Least Squares Algorithm) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.psalsa(y, lam=(1e3, 1e2), k=0.5) @@ -286,6 +302,8 @@ brpls (Bayesian Reweighted Penalized Least Squares) .. plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.brpls(y, lam=(1e4, 1e2)) @@ -301,6 +319,8 @@ lsrpls (Locally Symmetric Reweighted Penalized Least Squares) .. 
plot:: :align: center :context: close-figs + :include-source: False + :show-source-link: True # to see contents of create_plots function, look at the top-most algorithm's code baseline, params = baseline_fitter.lsrpls(y, lam=(1e4, 1e2)) diff --git a/docs/citing.rst b/docs/citing.rst index 2e11f376..b22ccc99 100644 --- a/docs/citing.rst +++ b/docs/citing.rst @@ -8,7 +8,7 @@ research: .. code-block:: text - @software{pybaselines, + @software{erb2021pybaselines, author = {Erb, Donald}, doi = {10.5281/zenodo.5608581}, title = {{pybaselines}: A {Python} library of algorithms for the baseline correction of experimental data}, diff --git a/docs/conf.py b/docs/conf.py index 0cda6fdf..15cb6cc4 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -156,9 +156,9 @@ # -- Settings for matplotlib plot_directive extension ---------------------------- -plot_include_source = False +plot_include_source = True # set default to True so that docstring examples don't show source plot_html_show_formats = False -plot_html_show_source_link = True +plot_html_show_source_link = False # set default to False for docstring examples plot_formats = ['png'] @@ -227,16 +227,23 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # +html_logo = 'images/logo.png' # Theme options are theme-specific and customize the look and feel of a # theme further. For a list of options available for each theme, see the # documentation. 
try: - import sphinx_rtd_theme - html_theme = 'sphinx_rtd_theme' + import pydata_sphinx_theme + html_theme = 'pydata_sphinx_theme' html_theme_options = { - 'navigation_depth': 5, - 'prev_next_buttons_location': 'both', - } + 'icon_links': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/derb12/pybaselines', + 'icon': 'fa-brands fa-github', + 'type': 'fontawesome', + }, + ], + } except ImportError: html_theme = 'nature' html_theme_options = {} diff --git a/docs/contributing.rst b/docs/contributing.rst index 656fbb4a..cd58e852 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -49,7 +49,7 @@ ensure `git `_ is installed and then run: git clone https://github.com/derb12/pybaselines.git cd pybaselines - pip install .[dev] + pip install -e .[dev] All sections below assume the above commands were ran. @@ -136,6 +136,19 @@ please ensure the documentation builds locally by running the following command and ensure that no warnings or errors are raised during building. The built documentation can then be viewed in the ``pybaselines/docs/_build/html`` folder. +Running Doctests +^^^^^^^^^^^^^^^^ + +If adding new code, it is often good to show an example usage of the class/method/function +within its docstring using +`doctest format `_. pybaselines +uses `scipy-doctest `_ to ensure that these examples are +correctly formatted and will run without errors. To perform these doctests, run the following +command in the terminal while in the pybaselines directory: + +.. code-block:: console + + pytest pybaselines --doctest-modules Adding New Algorithms ^^^^^^^^^^^^^^^^^^^^^ @@ -146,6 +159,9 @@ If adding a new baseline algorithm to pybaselines: ``pybaselines/tests/base_tests.py`` file that should be subclassed to ensure all basic requirements for a new algorithm are met. Additional tests should also be added as needed. See existing tests for examples. 
+* Try to add example usages of the algorithm within its docstring, showing basic usage and + any noteworthy fine-tuning. Ensure these examples run correctly by performing doctests as + outlined above. * Add a short summary of the algorithm to the appropriate place in the `algorithms section `_, and, if possible, add a plot showing how the algorithm fits different baselines using diff --git a/docs/examples/README.rst b/docs/examples/README.rst index 24f99432..90dba3c5 100644 --- a/docs/examples/README.rst +++ b/docs/examples/README.rst @@ -4,7 +4,7 @@ Examples This section shows various uses of pybaselines. -To run the examples, `Matplotlib `_ must be installed, +To run the examples locally, `Matplotlib `_ must be installed, which can be done by running: .. code-block:: console diff --git a/docs/examples/general/plot_masked_data.py b/docs/examples/general/plot_masked_data.py index 412d9fb8..264d357e 100644 --- a/docs/examples/general/plot_masked_data.py +++ b/docs/examples/general/plot_masked_data.py @@ -5,20 +5,21 @@ There are cases where data needs to be filtered/masked before processing, for example a faulty detector can result in problematic regions in measurements. In these cases, baseline correction on -the raw data could lead to severely incorrect fits. One such use of masking in literature is presented by -`Temmink, et al. `_ for removing downward spikes in -mid-infrared data collected from the James Webb Space Telescope before performing baseline correction. -This example will detail how to handle working with masked data for the various types of baseline -correction algorithms in pybaselines. +the raw data could lead to severely incorrect fits. One such use of masking in literature is +presented by `Temmink, et al. `_ for removing +downward spikes in mid-infrared data collected from the James Webb Space Telescope before +performing baseline correction. 
This example will detail how to handle working with masked data +for the various types of baseline correction algorithms in pybaselines. As a preface for this guide, the author of this library acknowledges that masking could potentially be handled internally within each individual algorithm to make them "mask-aware". However, it would -be a large change to the codebase, and there are many edge cases that would make it a non-trivial endeavor. -Support for masking following the guidelines presented in this example could alternatively be added -as an :doc:`optimizer-type algorithm <../../../algorithms/optimizers>`, but, in the author's opinion, -making one single function that is expected to cover 60+ different algorithms would be fiddly at best -and very prone to bugs. Therefore, this guide is presented as a starting point for users to adapt for -targeting a spectific baseline correction algorithm or group of algorithms. +be a large change to the codebase, and there are many edge cases that would make it a non-trivial +endeavor. Support for masking following the guidelines presented in this example could +alternatively be added as an :doc:`optimizer-type algorithm <../../../algorithms/optimizers>`, +but, in the author's opinion, making one single function that is expected to cover 60+ different +algorithms would be fiddly at best and very prone to bugs. Therefore, this guide is presented as a +starting point for users to adapt for targeting a specific baseline correction algorithm or group +of algorithms. The various algorithms in pybaselines can be broadly grouped into three different categories for how they handle masked data: @@ -47,8 +48,6 @@ from pybaselines import Baseline from pybaselines.utils import gaussian, relative_difference -from pybaselines._banded_utils import PenalizedSystem -from pybaselines._weighting import _arpls x = np.linspace(500, 4000, 1000) @@ -83,8 +82,9 @@ # To start, the mask for fitting the data has to be made. 
This can be done by eye if fitting # a few datasets, or can be automated using some metric. Many # :doc:`classification methods <../../../algorithms/classification>` use different methods for -# excluding positive peaks; for excluding negative peaks, see -# `Temmink, et al. `_ for an example. This example +# excluding positive peaks; for an example of excluding negative peaks, see +# `Temmink, et al. `_, which uses +# Savitzky-Golay filtering combined with iterative thresholding. This example # will simply define the mask region by hand. fit_mask = (x < 1900) | (x > 2550) # 1 in regions to fit, 0 in masked region @@ -160,6 +160,8 @@ # "Mask-aware" versions of these algorithms could be implemented, as shown below for # the arPLS algorithm. However, the "mask-aware" behavior can be closely approximated # simply by using weighted interpolation, to be shown below. +from pybaselines._banded_utils import PenalizedSystem # noqa: E402 +from pybaselines._weighting import _arpls # noqa: E402 def masked_arpls(y, mask=None, lam=1e5, diff_order=2, tol=1e-3, max_iter=50, weights=None): diff --git a/docs/examples/general/plot_reuse_Baseline.py b/docs/examples/general/plot_reuse_Baseline.py index 17f22bb0..c308bf9a 100644 --- a/docs/examples/general/plot_reuse_Baseline.py +++ b/docs/examples/general/plot_reuse_Baseline.py @@ -10,7 +10,7 @@ matrix, and potentially its pseudoinverse, once. Likewise, :doc:`spline methods <../../../algorithms/spline>` will only have to compute the spline basis matrix once. Note that this only applies if the same non-data parameters -(eg. ``poly_order``, ``lam``, etc.) are used for each fit. +(eg. ``poly_order``, ``num_knots``, etc.) are used for each fit. This example will explore the efficiency of reusing the same ``Baseline`` object when fitting multiple datasets for different types of algorithms. 
diff --git a/docs/examples/spline/plot_lam_vs_num_knots.py b/docs/examples/spline/plot_lam_vs_num_knots.py index db1bc9a4..a5ddff14 100644 --- a/docs/examples/spline/plot_lam_vs_num_knots.py +++ b/docs/examples/spline/plot_lam_vs_num_knots.py @@ -72,7 +72,7 @@ def optimize_lam(data, known_baseline, func, previous_min=None, **kwargs): # Other baseline types could be examined, similar to the # :ref:`Whittaker lam vs data size example `, # which should give similar results. -plt.plot(utils._make_data(1000, bkg_type='exponential')[1]) +plt.plot(utils.make_data(1000, bkg_type='exponential', signal_type=2)[1]) # %% # The number of knots will vary from 20 to 1000 on a logarithmic scale. For each number @@ -84,10 +84,12 @@ def optimize_lam(data, known_baseline, func, previous_min=None, **kwargs): symbols = cycle(['o', 's', 'd', 'h', '^', 'x']) best_lams = np.empty((len(num_knots), len(num_points))) for i, num_knot in enumerate(num_knots): - min_lam = 0 + min_lam = None for j, num_x in enumerate(num_points): func = partial(Baseline().mixture_model, num_knots=num_knot, diff_order=2) - x, y, baseline = utils._make_data(num_x, bkg_type='exponential') + x, y, baseline = utils.make_data( + num_x, bkg_type='exponential', signal_type=2, noise_std=0.1, return_baseline=True + ) # use a slightly lower tolerance to speed up the calculation min_lam = optimize_lam(y, baseline, func, min_lam, tol=1e-2, max_iter=50) best_lams[i, j] = min_lam diff --git a/docs/examples/spline/plot_pspline_whittaker.py b/docs/examples/spline/plot_pspline_whittaker.py index 79d8eefe..29ab1262 100644 --- a/docs/examples/spline/plot_pspline_whittaker.py +++ b/docs/examples/spline/plot_pspline_whittaker.py @@ -75,7 +75,7 @@ def optimize_lam(data, known_baseline, func, previous_min=None, **kwargs): # Other baseline types could be examined, similar to the # :ref:`Whittaker lam vs data size example `, # which should give similar results. 
-plt.plot(utils._make_data(1000, bkg_type='exponential')[1]) +plt.plot(utils.make_data(1000, bkg_type='exponential', signal_type=2)[1]) # %% # For each function, the optimal `lam` value will be calculated for data sizes @@ -97,7 +97,9 @@ def optimize_lam(data, known_baseline, func, previous_min=None, **kwargs): min_lam = None for j, num_x in enumerate(num_points): func = getattr(Baseline(), func_name) - x, y, baseline = utils._make_data(num_x, bkg_type='exponential') + x, y, baseline = utils.make_data( + num_x, bkg_type='exponential', signal_type=2, return_baseline=True + ) # use a slightly lower tolerance to speed up the calculation min_lam = optimize_lam(y, baseline, func, min_lam, tol=1e-2, max_iter=50) best_lams[j] = min_lam diff --git a/docs/examples/whittaker/plot_lam_vs_data_size.py b/docs/examples/whittaker/plot_lam_vs_data_size.py index ac936a3a..6387bed0 100644 --- a/docs/examples/whittaker/plot_lam_vs_data_size.py +++ b/docs/examples/whittaker/plot_lam_vs_data_size.py @@ -88,7 +88,7 @@ def iasls(*args, lam=1, p=0.05, **kwargs): bkg_types = ('exponential', 'gaussian', 'sine') for bkg_type in bkg_types: bkg_dict[bkg_type] = {} - plt.plot(utils._make_data(1000, bkg_type)[1], label=bkg_type) + plt.plot(utils.make_data(1000, bkg_type, signal_type=2, noise_std=0.1)[1], label=bkg_type) plt.legend() # %% @@ -115,7 +115,9 @@ def iasls(*args, lam=1, p=0.05, **kwargs): func = iasls else: func = getattr(Baseline(), func_name) - x, y, baseline = utils._make_data(num_x, bkg_type) + x, y, baseline = utils.make_data( + num_x, bkg_type, signal_type=2, noise_std=0.1, return_baseline=True + ) # use a slightly lower tolerance to speed up the calculation min_lam = optimize_lam(y, baseline, func, min_lam, tol=1e-2, max_iter=50) best_lams[j] = min_lam diff --git a/docs/examples/whittaker/plot_whittaker_solvers.py b/docs/examples/whittaker/plot_whittaker_solvers.py index c32d5d55..46ae43da 100644 --- a/docs/examples/whittaker/plot_whittaker_solvers.py +++ 
b/docs/examples/whittaker/plot_whittaker_solvers.py @@ -103,7 +103,7 @@ def sparse_asls(data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weig for func_name in functions: timings = [] for num_x in data_sizes: - y = utils._make_data(num_x)[1] + y = utils.make_data(num_x, bkg_type='exponential', noise_std=0.1, signal_type=2)[1] lam = lam_equation(num_x) if func_name == 'sparse': func = sparse_asls diff --git a/docs/images/logo.png b/docs/images/logo.png index 99989a25..7814a29f 100644 Binary files a/docs/images/logo.png and b/docs/images/logo.png differ diff --git a/docs/index.rst b/docs/index.rst index a111a75e..8852e4a7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -4,6 +4,7 @@ pybaselines Documentation .. image:: images/logo.png :alt: Logo :align: center + :class: dark-light pybaselines is a library of algorithms for the baseline correction of experimental data. @@ -22,9 +23,9 @@ pybaselines is a library of algorithms for the baseline correction of experiment installation quickstart parameter_selection - performance algorithms/index algorithms_2d/index + performance generated/examples/index api/index contributing diff --git a/docs/installation.rst b/docs/installation.rst index 7a749037..e0db316c 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -25,23 +25,23 @@ pybaselines has the following optional dependencies: * `Numba `_ (>= 0.53): speeds up calculations used by the following functions: - * :meth:`~.Baseline.loess` - * :meth:`~.Baseline.dietrich` - * :meth:`~.Baseline.golotvin` - * :meth:`~.Baseline.std_distribution` - * :meth:`~.Baseline.fastchrom` - * :meth:`~.Baseline.beads` - * :meth:`~.Baseline.mpspline` - * all :ref:`spline ` methods + * :meth:`~.Baseline.loess` + * :meth:`~.Baseline.dietrich` + * :meth:`~.Baseline.golotvin` + * :meth:`~.Baseline.std_distribution` + * :meth:`~.Baseline.fastchrom` + * :meth:`~.Baseline.beads` + * :meth:`~.Baseline.mpspline` + * all :ref:`spline ` methods * `pentapy `_ (>= 1.1): provides a 
faster solver for banded pentadiagonal linear systems, which are used by the following functions (when ``diff_order=2``): - * all :ref:`Whittaker smoothing ` methods - * :meth:`~.Baseline.mpls` - * :meth:`~.Baseline.jbcd` - * :meth:`~.Baseline.fabc` + * all :ref:`Whittaker smoothing ` methods + * :meth:`~.Baseline.mpls` + * :meth:`~.Baseline.jbcd` + * :meth:`~.Baseline.fabc` Stable Release diff --git a/docs/parameter_selection.rst b/docs/parameter_selection.rst index ad4cdb35..944d459f 100644 --- a/docs/parameter_selection.rst +++ b/docs/parameter_selection.rst @@ -12,42 +12,42 @@ adjusting for each family of algorithms within pybaselines: * Polynomial methods - * ``poly_order`` controls the curvature of the baseline. + * ``poly_order`` controls the curvature of the baseline. * Whittaker-smoothing-based methods - * ``lam`` controls the curvature of the baseline. See - :ref:`this example ` - to get an idea of how ``lam`` effects the baseline. The optimal ``lam`` - value for each algorithm is not typically the same. + * ``lam`` controls the curvature of the baseline. See + :ref:`this example ` + to get an idea of how ``lam`` effects the baseline. The optimal ``lam`` + value for each algorithm is not typically the same. * Morphological methods - * ``half_window`` controls the general fit of the baseline. See - :ref:`this example ` - to get an idea of how ``half_window`` effects the baseline. The optimal - ``half_window`` value for each algorithm is not typically the same. + * ``half_window`` controls the general fit of the baseline. See + :ref:`this example ` + to get an idea of how ``half_window`` effects the baseline. The optimal + ``half_window`` value for each algorithm is not typically the same. * Spline methods - * ``lam`` controls the curvature of the baseline. The - :ref:`Whittaker example ` - also generally applies to spline methods. + * ``lam`` controls the curvature of the baseline. The + :ref:`Whittaker example ` + also generally applies to spline methods. 
* Smoothing-based methods - * ``half_window`` controls the general fit of the baseline. The - :ref:`Morphological example ` - also generally applies to smoothing methods. + * ``half_window`` controls the general fit of the baseline. The + :ref:`Morphological example ` + also generally applies to smoothing methods. * Baseline/Peak Classification methods - * Algorithm dependent + * Algorithm dependent * Optimizers - * Algorithm dependent + * Algorithm dependent * Miscellaneous methods - * Algorithm dependent + * Algorithm dependent diff --git a/pybaselines/classification.py b/pybaselines/classification.py index 2d520bd7..5eb72ed5 100644 --- a/pybaselines/classification.py +++ b/pybaselines/classification.py @@ -987,6 +987,8 @@ def _refine_mask(mask, min_length=2): Examples -------- + >>> import numpy as np + >>> from pybaselines.classification import _refine_mask >>> mask = np.array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1]) >>> _refine_mask(mask, 3).astype(int) array([1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]) diff --git a/pybaselines/polynomial.py b/pybaselines/polynomial.py index 01196004..46c0203d 100644 --- a/pybaselines/polynomial.py +++ b/pybaselines/polynomial.py @@ -378,16 +378,16 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non size equal to N and all values set to 1. cost_function : str, optional The non-quadratic cost function to minimize. Must indicate symmetry of the - method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or + method by prepending 'a' or 'asymmetric' for asymmetric loss, and 's' or 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. 
Available methods, and their associated reference, are: - * 'asymmetric_truncated_quadratic'[1]_ - * 'symmetric_truncated_quadratic'[1]_ - * 'asymmetric_huber'[1]_ - * 'symmetric_huber'[1]_ - * 'asymmetric_indec'[2]_ - * 'symmetric_indec'[2]_ + * 'asymmetric_truncated_quadratic'[1]_ + * 'symmetric_truncated_quadratic'[1]_ + * 'asymmetric_huber'[1]_ + * 'symmetric_huber'[1]_ + * 'asymmetric_indec'[2]_ + * 'symmetric_indec'[2]_ threshold : float, optional The threshold value for the loss method, where the function goes from @@ -761,7 +761,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, Notes ----- - Application of quantile regression for baseline fitting ss described in [1]_. + Application of quantile regression for baseline fitting as described in [1]_. Performs quantile regression using iteratively reweighted least squares (IRLS) as described in [2]_. @@ -833,9 +833,9 @@ def goldindec(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=None, ('a' or 'asymmetric') is optional (eg. 'indec' and 'a_indec' are the same). Default is 'asymmetric_indec'. 
Available methods, and their associated reference, are: - * 'asymmetric_indec'[1]_ - * 'asymmetric_truncated_quadratic'[2]_ - * 'asymmetric_huber'[2]_ + * 'asymmetric_indec'[1]_ + * 'asymmetric_truncated_quadratic'[2]_ + * 'asymmetric_huber'[2]_ peak_ratio : float, optional A value between 0 and 1 that designates how many points in the data belong @@ -1383,7 +1383,7 @@ def _identify_loss_method(loss_method): """ prefix, *split_method = loss_method.lower().split('_') if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: - raise ValueError('must specify loss function symmetry by appending "a_" or "s_"') + raise ValueError('must specify loss function symmetry by prepending "a_" or "s_"') if prefix in ('a', 'asymmetric'): symmetric = False else: @@ -1421,16 +1421,16 @@ def penalized_poly(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, size equal to N and all values set to 1. cost_function : str, optional The non-quadratic cost function to minimize. Must indicate symmetry of the - method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or + method by prepending 'a' or 'asymmetric' for asymmetric loss, and 's' or 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. Available methods, and their associated reference, are: - * 'asymmetric_truncated_quadratic'[7]_ - * 'symmetric_truncated_quadratic'[7]_ - * 'asymmetric_huber'[7]_ - * 'symmetric_huber'[7]_ - * 'asymmetric_indec'[8]_ - * 'symmetric_indec'[8]_ + * 'asymmetric_truncated_quadratic'[7]_ + * 'symmetric_truncated_quadratic'[7]_ + * 'asymmetric_huber'[7]_ + * 'symmetric_huber'[7]_ + * 'asymmetric_indec'[8]_ + * 'symmetric_indec'[8]_ threshold : float, optional The threshold value for the loss method, where the function goes from @@ -2189,9 +2189,9 @@ def goldindec(data, x_data=None, poly_order=2, tol=1e-3, max_iter=250, weights=N ('a' or 'asymmetric') is optional (eg. 'indec' and 'a_indec' are the same). Default is 'asymmetric_indec'. 
Available methods, and their associated reference, are: - * 'asymmetric_indec'[18]_ - * 'asymmetric_truncated_quadratic'[19]_ - * 'asymmetric_huber'[19]_ + * 'asymmetric_indec'[18]_ + * 'asymmetric_truncated_quadratic'[19]_ + * 'asymmetric_huber'[19]_ peak_ratio : float, optional A value between 0 and 1 that designates how many points in the data belong diff --git a/pybaselines/two_d/polynomial.py b/pybaselines/two_d/polynomial.py index 8faf6edb..b92eb9b9 100644 --- a/pybaselines/two_d/polynomial.py +++ b/pybaselines/two_d/polynomial.py @@ -372,16 +372,16 @@ def penalized_poly(self, data, poly_order=2, tol=1e-3, max_iter=250, weights=Non shape equal to (M, N) and all values set to 1. cost_function : str, optional The non-quadratic cost function to minimize. Must indicate symmetry of the - method by appending 'a' or 'asymmetric' for asymmetric loss, and 's' or + method by prepending 'a' or 'asymmetric' for asymmetric loss, and 's' or 'symmetric' for symmetric loss. Default is 'asymmetric_truncated_quadratic'. Available methods, and their associated reference, are: - * 'asymmetric_truncated_quadratic'[1]_ - * 'symmetric_truncated_quadratic'[1]_ - * 'asymmetric_huber'[1]_ - * 'symmetric_huber'[1]_ - * 'asymmetric_indec'[2]_ - * 'symmetric_indec'[2]_ + * 'asymmetric_truncated_quadratic'[1]_ + * 'symmetric_truncated_quadratic'[1]_ + * 'asymmetric_huber'[1]_ + * 'symmetric_huber'[1]_ + * 'asymmetric_indec'[2]_ + * 'symmetric_indec'[2]_ threshold : float, optional The threshold value for the loss method, where the function goes from @@ -547,7 +547,7 @@ def quant_reg(self, data, poly_order=2, quantile=0.05, tol=1e-6, max_iter=250, Notes ----- - Application of quantile regression for baseline fitting ss described in [1]_. + Application of quantile regression for baseline fitting as described in [1]_. Performs quantile regression using iteratively reweighted least squares (IRLS) as described in [2]_. 
@@ -790,7 +790,7 @@ def _identify_loss_method(loss_method): """ prefix, *split_method = loss_method.lower().split('_') if prefix not in ('a', 's', 'asymmetric', 'symmetric') or not split_method: - raise ValueError('must specify loss function symmetry by appending "a_" or "s_"') + raise ValueError('must specify loss function symmetry by prepending "a_" or "s_"') if prefix in ('a', 'asymmetric'): symmetric = False else: diff --git a/pybaselines/utils.py b/pybaselines/utils.py index f08556ee..0b8a8f75 100644 --- a/pybaselines/utils.py +++ b/pybaselines/utils.py @@ -161,9 +161,9 @@ def gaussian_kernel(window_size, sigma=1.0): Notes ----- - Return gaus/sum(gaus) rather than creating a unit-area gaussian + Return ``gaus/sum(gaus)`` rather than creating a unit-area gaussian since the unit-area gaussian would have an area smaller than 1 - for window_size < ~ 6 * sigma. + for ``window_size <~ 6 * sigma``. """ # centers distribution from -half_window to half_window @@ -1084,16 +1084,24 @@ def pspline_smooth(data, x_data=None, lam=1e1, num_knots=100, spline_degree=3, d return y_smooth, pspline.tck -def _make_data(data_size=1000, bkg_type='exponential'): +def make_data(data_size=1000, bkg_type='gaussian_small', signal_type=1, noise_std=0.05, + return_baseline=False): """ - Creates simple datasets for use in examples. + Creates a simple dataset for use in examples and docstrings. Parameters ---------- data_size : int, optional The number of data points. Default is 1000. - bkg_type : {'exponential', 'gaussian', 'linear', 'sine'}, optional - A string designating the type of baseline to add to the data. Default is 'exponential'. + bkg_type : {'gaussian_small', 'exponential', 'linear', 'sine', 'gaussian'}, optional + A string designating the type of baseline to add to the data. Default is 'gaussian_small'. + signal_type : {1, 2}, optional + The type of signal to produce. In general, examples within the documentation use 2 + while docstring examples use 1. Default is 1. 
+ noise_std : float, optional + The standard deviation of the noise. Default is 0.05. + return_baseline : bool, optional + Whether to include the computed baseline in the output. Default is False. Returns ------- @@ -1102,12 +1110,12 @@ def _make_data(data_size=1000, bkg_type='exponential'): y : numpy.ndarray, shape (`data_size`,) The y-values for the example data. baseline : numpy.ndarray, shape (`data_size`,) - The true baseline for the example data. + Only returned if ``return_baseline`` is True. The true baseline for the example data. Raises ------ ValueError - Raised if `bkg_type` is not an allowed value. + Raised if ``bkg_type`` or ``signal_type`` is not an allowed value. Notes ----- @@ -1117,19 +1125,35 @@ def _make_data(data_size=1000, bkg_type='exponential'): """ x = np.linspace(0, 1000, data_size) - signal = ( - gaussian(x, 9, 100, 12) - + gaussian(x, 6, 180, 5) - + gaussian(x, 8, 350, 11) - + gaussian(x, 15, 400, 18) - + gaussian(x, 6, 550, 6) - + gaussian(x, 13, 700, 8) - + gaussian(x, 9, 800, 9) - + gaussian(x, 9, 880, 7) - ) + + if signal_type == 1: + signal = ( + + gaussian(x, 4, 180, 12) + + gaussian(x, 15, 400, 10) + + gaussian(x, 6, 540, 6) + + gaussian(x, 13, 700, 8) + + gaussian(x, 9, 880, 7) + ) + elif signal_type == 2: + signal = ( + gaussian(x, 9, 100, 12) + + gaussian(x, 6, 180, 5) + + gaussian(x, 8, 350, 11) + + gaussian(x, 15, 400, 18) + + gaussian(x, 6, 550, 6) + + gaussian(x, 13, 700, 8) + + gaussian(x, 9, 800, 9) + + gaussian(x, 9, 880, 7) + ) + else: + raise ValueError(f'unknown signal_type {signal_type}') + + bkg_type = bkg_type.lower() if bkg_type == 'exponential': baseline = 5 + 15 * np.exp(-x / 200) - elif bkg_type == 'gaussian': + elif bkg_type == 'gaussian_small': + baseline = 15 + gaussian(x, 4, 650, 400) + elif bkg_type == 'gaussian': # a more pronounced gaussian than 'gaussian_small' baseline = 30 + gaussian(x, 20, 500, 150) elif bkg_type == 'linear': baseline = 1 + x * 0.005 @@ -1137,8 +1161,10 @@ def 
_make_data(data_size=1000, bkg_type='exponential'): baseline = 70 + 5 * np.sin(x / 50) else: raise ValueError(f'unknown bkg_type {bkg_type}') - - noise = np.random.default_rng(0).normal(0, 0.1, data_size) + noise = np.random.default_rng(0).normal(0, noise_std, data_size) y = signal + baseline + noise - return x, y, baseline + if return_baseline: + return x, y, baseline + + return x, y diff --git a/pybaselines/whittaker.py b/pybaselines/whittaker.py index 04dd3b6c..bfff13d2 100644 --- a/pybaselines/whittaker.py +++ b/pybaselines/whittaker.py @@ -20,21 +20,35 @@ class _Whittaker(_Algorithm): @_Algorithm._register(sort_keys=('weights',)) def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weights=None): - """ - Fits the baseline using asymmetric least squares (AsLS) fitting. + r""" + Fits the baseline using the asymmetric least squares (AsLS) algorithm. + + Iteratively calculates the baseline, v, by solving the linear equation + :math:`(W + \lambda D_d^{\mathsf{T}} D_d) v = W y`, where y is the input data, + :math:`D_d` is the finite difference matrix of order d, W is the diagonal matrix + of the weights, and :math:`\lambda` (``lam``) is the regularization parameter. + + Each iteration, the weights are updated following: + + .. math:: + + w_i = \left\{\begin{array}{cr} + p & y_i > v_i \\ + 1 - p & y_i \le v_i + \end{array}\right. Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e6. p : float, optional The penalizing weighting factor. Must be between 0 and 1. Values greater - than the baseline will be given `p` weight, and values less than the baseline - will be given `1 - p` weight. Default is 1e-2. 
+ than the baseline will be given ``p`` weight, and values less than the baseline + will be given ``1 - p`` weight. Can be considered as analogous to the quantile of + the data to fit. Default is 1e-2. diff_order : int, optional The order of the differential matrix. Must be greater than 0. Default is 2 (second order differential matrix). Typical values are 2 or 1. @@ -59,12 +73,17 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh An array containing the calculated tolerance values for each iteration. The length of the array is the number of iterations completed. If the last value in the array is greater than the input - `tol` value, then the function did not converge. + ``tol`` value, then the function did not converge. Raises ------ ValueError - Raised if `p` is not between 0 and 1. + Raised if ``p`` is not between 0 and 1. + + Notes + ----- + In literature, this algorithm is referred to as either "ALS" or "AsLS", + both standing for asymmetric least squares. References ---------- @@ -75,6 +94,47 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh Eilers, P. Parametric Time Warping. Analytical Chemistry, 2004, 76(2), 404-411. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from pybaselines import Baseline, utils + >>> x, y = utils.make_data(noise_std=0.2) + >>> baseline_fitter = Baseline(x) + + The two main parameters to vary are ``lam``, which controls the smoothness of the + baseline, and ``p``, which controls the weighting. In general, a higher ``p`` value + is needed for noisy data and correspondingly requires a slightly higher ``lam`` value.
+ + >>> fit_1, params_1 = baseline_fitter.asls(y, lam=1e6, p=0.001) + >>> fit_2, params_2 = baseline_fitter.asls(y, lam=1e7, p=0.02) + >>> plt.plot(x, y) + >>> plt.plot(x, fit_1, label='lam=1e6, p=0.001') + >>> plt.plot(x, fit_2, '--', label='lam=1e7, p=0.02') + >>> plt.legend() + >>> plt.show() + + The parameter ``p`` should typically be chosen such that the residuals, ``y - baseline``, + are centered around 0. In this example, the use of ``p=0.02`` fits the noise better. + + >>> plt.hist( + ... [y - fit_1, y - fit_2], bins=100, density=True, label=['p=0.001', 'p=0.02'], + ... histtype='step' + ... ) + >>> plt.xlabel('Residuals, y - baseline') + >>> plt.legend() + >>> plt.show() + + An alternative to adjusting ``p`` for noisy data is to simply smooth the data before + calling ``asls``, which allows for ignoring ``p`` (ie. selecting a ``p`` value near 0). + + >>> y_smooth = utils.whittaker_smooth(y, lam=1e1) + >>> fit_3, params_3 = baseline_fitter.asls(y_smooth, lam=1e6, p=0.001) + >>> plt.plot(x, y) + >>> plt.plot(x, y_smooth, '--', label='smoothed data') + >>> plt.plot(x, fit_3, label='smoothed fit') + >>> plt.legend() + >>> plt.show() + """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') @@ -99,16 +159,36 @@ def asls(self, data, lam=1e6, p=1e-2, diff_order=2, max_iter=50, tol=1e-3, weigh @_Algorithm._register(sort_keys=('weights',)) def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, weights=None, diff_order=2): - """ + r""" Fits the baseline using the improved asymmetric least squares (IAsLS) algorithm. - The algorithm consideres both the first and second derivatives of the residual. + The algorithm penalizes both the smoothness of the baseline as well as the first + derivative of the signal, ``y - baseline``. The baseline, v, is iteratively calculated + by solving the linear equation + + .. 
math:: + + (W^{\mathsf{T}} W + \lambda_1 D_1^{\mathsf{T}} D_1 + \lambda D_d^{\mathsf{T}} D_d) v + = (W^{\mathsf{T}} W + \lambda_1 D_1^{\mathsf{T}} D_1) y + + where y is the input data, :math:`D_d` is the finite difference matrix of order d, + :math:`D_1` is the first-order finite difference matrix, W is the diagonal matrix + of the weights, and :math:`\lambda` (``lam``) and :math:`\lambda_1` (``lam_1``) are + regularization parameters. + + Each iteration, the weights are updated following: + + .. math:: + + w_i = \left\{\begin{array}{cr} + p & y_i > v_i \\ + 1 - p & y_i \le v_i + \end{array}\right. Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with `N` data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e6. @@ -149,11 +229,79 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, ValueError Raised if `p` is not between 0 and 1 or if `diff_order` is less than 2. + Notes + ----- + Although both ``iasls`` and :meth:`~.Baseline.asls` use ``p`` for defining the weights, + the appropriate ``p`` value for ``iasls`` will be approximately equal to the square root + of the value used for ``asls`` since ``iasls`` squares the weights within its linear + equation. + + Omits the outer loop described by the reference implementation of the IAsLs algorithm, + in which the baseline fitting is repeated after subtracting the baseline from the data. + The outer loop makes the algorithm's performance difficult to predict and often leads + to either peak clipping or no change from the first fit baseline. If this outer loop is + desired, this method simply needs to be called repeatedly, as demonstrated in the + Examples section below. + References ---------- He, S., et al. 
Baseline correction for raman spectra using an improved asymmetric least squares method, Analytical Methods, 2014, 6(12), 4402-4407. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> import numpy as np + >>> from pybaselines import Baseline, utils + >>> x, y = utils.make_data(noise_std=0.2) + >>> baseline_fitter = Baseline(x) + + The two main parameters to vary are ``lam``, which controls the smoothness of the + baseline, and ``p``, which controls the weighting. In general, a higher ``p`` value + is needed for noisy data and correspondingly requires a slightly higher ``lam`` value. + + >>> fit_1, params_1 = baseline_fitter.iasls(y, lam=1e6, p=0.05) + >>> fit_2, params_2 = baseline_fitter.iasls(y, lam=1e7, p=0.15) + >>> plt.plot(x, y) + >>> plt.plot(x, fit_1, label='lam=1e6, p=0.05') + >>> plt.plot(x, fit_2, '--', label='lam=1e7, p=0.15') + >>> plt.legend() + >>> plt.show() + + The parameter ``p`` should typically be chosen such that the residuals, ``y - baseline``, + are centered around 0. In this example, the use of ``p=0.15`` fits the noise better. + + >>> plt.hist( + ... [y - fit_1, y - fit_2], bins=100, density=True, label=['p=0.05', 'p=0.15'], + ... histtype='step' + ... ) + >>> plt.xlabel('Residuals, y - baseline') + >>> plt.legend() + >>> plt.show() + + As stated in the Notes section above, the original IAsLS algorithm used an outer loop + which subtracted the fit baseline from the data and then used the result to fit a new + baseline. This behavior can be emulated by calling ``iasls`` within a loop as shown below. + + >>> baseline = np.zeros_like(y) + >>> data = y + >>> tol = 5e-3 + >>> max_iter = 50 + >>> for i in range(max_iter): + ... fit, params = baseline_fitter.iasls(data, lam=1e7, p=0.15) + ... residual = data - fit + ... if utils.relative_difference(data, residual) < tol: + ... break + ... data = residual + ...
baseline += fit + >>> print(f'Performed {i + 1} iterations') + Performed 3 iterations + >>> plt.plot(x, y) + >>> plt.plot(x, fit_2, label='single iteration') + >>> plt.plot(x, baseline, '--', label='multiple iterations') + >>> plt.legend() + >>> plt.show() + """ if not 0 < p < 1: raise ValueError('p must be between 0 and 1') @@ -201,14 +349,31 @@ def iasls(self, data, lam=1e6, p=1e-2, lam_1=1e-4, max_iter=50, tol=1e-3, @_Algorithm._register(sort_keys=('weights',)) def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=None, normalize_weights=False): - """ + r""" Adaptive iteratively reweighted penalized least squares (airPLS) baseline. + Iteratively calculates the baseline, v, by solving the linear equation + :math:`(W + \lambda D_d^{\mathsf{T}} D_d) v = W y`, where y is the input data, + :math:`D_d` is the finite difference matrix of order d, W is the diagonal matrix + of the weights, and :math:`\lambda` (``lam``) is the regularization parameter. + + Each iteration, the weights are updated following: + + .. math:: + + w_i = \left\{\begin{array}{cr} + 0 & y_i \ge v_i \\ + \exp{\left(\frac{\text{abs}(y_i - v_i) t}{|\mathbf{r}^-|}\right)} & y_i < v_i + \end{array}\right. + + where t is the iteration number and :math:`|\mathbf{r}^-|` is the L1-norm of the + negative values in the residual vector :math:`\mathbf r`, ie. + :math:`\sum\limits_{y_i - v_i < 0} |y_i - v_i|`. + Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e6. @@ -242,11 +407,49 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non completed. If the last value in the array is greater than the input `tol` value, then the function did not converge. 
+ Notes + ----- + The original publication for the airPLS algorithm contained a typo in its + weighting scheme which omitted the use of absolute values within the exponential, + as indicated by the author: https://github.com/zmzhang/airPLS/issues/8. + References ---------- Zhang, Z.M., et al. Baseline correction using adaptive iteratively reweighted penalized least squares. Analyst, 2010, 135(5), 1138-1146. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from pybaselines import Baseline, utils + >>> x, y = utils.make_data() + >>> baseline_fitter = Baseline(x) + + The main parameter to vary is ``lam``, which controls the smoothness of the + baseline, with larger values giving smoother baselines. + + >>> fit_1, params_1 = baseline_fitter.airpls(y, lam=1e4) + >>> fit_2, params_2 = baseline_fitter.airpls(y, lam=1e6) + >>> plt.plot(x, y) + >>> plt.plot(x, fit_1, label='lam=1e4') + >>> plt.plot(x, fit_2, '--', label='lam=1e6') + >>> plt.legend() + >>> plt.show() + + For noisy data, ``airpls`` underestimates the baseline, which can be mitigated by + smoothing the input data before calling the method. 
+ + >>> _, y_noisy = utils.make_data(noise_std=0.3) + >>> y_smooth = utils.whittaker_smooth(y_noisy, lam=1e2) + >>> fit_3, params_3 = baseline_fitter.airpls(y_noisy, lam=1e6) + >>> fit_4, params_4 = baseline_fitter.airpls(y_smooth, lam=1e6) + >>> plt.plot(x, y_noisy) + >>> plt.plot(x, y_smooth, '--', label='smoothed data') + >>> plt.plot(x, fit_3, ':', label='noisy fit') + >>> plt.plot(x, fit_4, label='smoothed fit') + >>> plt.legend() + >>> plt.show() + """ y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) y_l1_norm = np.abs(y).sum() @@ -274,14 +477,33 @@ def airpls(self, data, lam=1e6, diff_order=2, max_iter=50, tol=1e-3, weights=Non @_Algorithm._register(sort_keys=('weights',)) def arpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None): - """ + r""" Asymmetrically reweighted penalized least squares smoothing (arPLS). + Iteratively calculates the baseline, v, by solving the linear equation + :math:`(W + \lambda D_d^{\mathsf{T}} D_d) v = W y`, where y is the input data, + :math:`D_d` is the finite difference matrix of order d, W is the diagonal matrix + of the weights, and :math:`\lambda` (``lam``) is the regularization parameter. + + Each iteration, the weights are updated following: + + .. math:: + + w_i = \frac + {1} + {1 + \exp{\left(\frac + {2(r_i - (-\mu^- + 2 \sigma^-))} + {\sigma^-} + \right)}} + + where :math:`r_i = y_i - v_i` and :math:`\mu^-` and :math:`\sigma^-` are the mean and + standard deviation, respectively, of the negative values in the residual vector, + ``y - v``. + Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5.
@@ -316,6 +538,24 @@ def arpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=None Baek, S.J., et al. Baseline correction using asymmetrically reweighted penalized least squares smoothing. Analyst, 2015, 140, 250-257. + Examples + -------- + >>> import matplotlib.pyplot as plt + >>> from pybaselines import Baseline, utils + >>> x, y = utils.make_data(noise_std=0.2) + >>> baseline_fitter = Baseline(x) + + The main parameter to vary is ``lam``, which controls the smoothness of the + baseline, with larger values giving smoother baselines. + + >>> fit_1, params_1 = baseline_fitter.arpls(y, lam=1e4) + >>> fit_2, params_2 = baseline_fitter.arpls(y, lam=1e6) + >>> plt.plot(x, y) + >>> plt.plot(x, fit_1, label='lam=1e4') + >>> plt.plot(x, fit_2, '--', label='lam=1e6') + >>> plt.legend() + >>> plt.show() + """ y, weight_array, whittaker_system = self._setup_whittaker(data, lam, diff_order, weights) tol_history = np.empty(max_iter + 1) @@ -346,8 +586,7 @@ def drpls(self, data, lam=1e5, eta=0.5, max_iter=50, tol=1e-3, weights=None, dif Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. @@ -442,8 +681,7 @@ def iarpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. 
@@ -510,8 +748,7 @@ def aspls(self, data, lam=1e5, diff_order=2, max_iter=100, tol=1e-3, Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. @@ -623,8 +860,7 @@ def psalsa(self, data, lam=1e5, p=0.5, k=None, diff_order=2, max_iter=50, tol=1e Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. @@ -718,8 +954,7 @@ def derpsalsa(self, data, lam=1e6, p=0.01, k=None, diff_order=2, max_iter=50, to Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e6. @@ -842,8 +1077,7 @@ def brpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, max_iter_2=5 Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. @@ -946,8 +1180,7 @@ def lsrpls(self, data, lam=1e5, diff_order=2, max_iter=50, tol=1e-3, weights=Non Parameters ---------- data : array-like, shape (N,) - The y-values of the measured data, with N data points. 
Must not - contain missing data (NaN) or Inf. + The y-values of the measured data. Must not contain missing data (NaN) or Inf. lam : float, optional The smoothing parameter. Larger values will create smoother baselines. Default is 1e5. diff --git a/pyproject.toml b/pyproject.toml index f7de4786..002a43d9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,11 +83,15 @@ test = [ "pytest>=6.0", # first version to work with pyproject.toml "ruff", ] +doctest = [ + "scipy-doctest", + "matplotlib", +] docs = [ "sphinx", "sphinx-copybutton", "sphinx-gallery>=0.16", # first version allowing strings to specify gallery sorting - "sphinx-rtd-theme", + "pydata-sphinx-theme", "matplotlib", "numpydoc", ] @@ -96,7 +100,7 @@ release = [ "bump-my-version", "twine", ] -dev = ["pybaselines[full, docs, test, release]"] +dev = ["pybaselines[full, docs, test, release, doctest]"] [tool.pytest.ini_options] diff --git a/requirements/requirements-documentation.txt b/requirements/requirements-documentation.txt index 18c77d8e..c574675e 100644 --- a/requirements/requirements-documentation.txt +++ b/requirements/requirements-documentation.txt @@ -7,14 +7,20 @@ pentapy==1.3.0 sphinx==8.2.3 sphinx-copybutton==0.5.2 sphinx-gallery==0.19.0 -sphinx-rtd-theme==3.0.2 +pydata-sphinx-theme==0.16.1 # Additional transitive dependencies, generated by pip-tools +accessible-pygments==0.0.5 + # via pydata-sphinx-theme alabaster==1.0.0 # via sphinx babel==2.17.0 - # via sphinx + # via + # pydata-sphinx-theme + # sphinx +beautifulsoup4==4.13.5 + # via pydata-sphinx-theme certifi==2025.1.31 # via requests charset-normalizer==3.4.1 @@ -27,8 +33,8 @@ cycler==0.12.1 # via matplotlib docutils==0.21.2 # via + # pydata-sphinx-theme # sphinx - # sphinx-rtd-theme fonttools==4.56.0 # via matplotlib idna==3.10 @@ -52,7 +58,9 @@ pillow==11.1.0 # matplotlib # sphinx-gallery pygments==2.19.1 - # via sphinx + # accessible-pygments + # pydata-sphinx-theme + # sphinx pyparsing==3.2.1 # via matplotlib 
python-dateutil==2.9.0.post0 @@ -65,14 +73,14 @@ six==1.17.0 # via python-dateutil snowballstemmer==2.2.0 # via sphinx +soupsieve==2.8 + # via beautifulsoup4 sphinxcontrib-applehelp==2.0.0 # via sphinx sphinxcontrib-devhelp==2.0.0 # via sphinx sphinxcontrib-htmlhelp==2.1.0 # via sphinx -sphinxcontrib-jquery==4.1 - # via sphinx-rtd-theme sphinxcontrib-jsmath==1.0.1 # via sphinx sphinxcontrib-qthelp==2.0.0 @@ -81,5 +89,9 @@ sphinxcontrib-serializinghtml==2.0.0 # via sphinx tabulate==0.9.0 # via numpydoc +typing-extensions==4.15.0 + # via + # beautifulsoup4 + # pydata-sphinx-theme urllib3==2.3.0 # via requests diff --git a/tests/conftest.py b/tests/conftest.py index 65ab1267..bef4bda5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -14,6 +14,17 @@ from .base_tests import get_data, get_data2d +pytest_plugins = [] +try: + # ignore if scipy-doctest is unavailable since it's not required for running non-doctest + # tests; if running doctests without scipy-doctest, the doctests will fail due to expecting no + # output from matplotlib's plotting routines + import scipy_doctest # noqa: F401 + pytest_plugins.append('scipy_doctest') +except ImportError: + pass + + def pytest_addoption(parser): """Adds additional pytest command line options.""" if hasattr(sys, '_is_gil_enabled'): # sys._is_gil_enabled added in Python 3.13 diff --git a/tests/test_utils.py b/tests/test_utils.py index 26b9893c..f1a1f6cb 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -831,23 +831,45 @@ def test_optimize_window(small_data2d, two_d): @pytest.mark.parametrize('data_size', (500, 1000, 1001)) -@pytest.mark.parametrize('baseline_type', ('exponential', 'gaussian', 'linear', 'sine')) -def test_make_data(data_size, baseline_type): - """Ensures basic functionality of the _make_data function.""" - x, y, baseline = utils._make_data(data_size, baseline_type) +@pytest.mark.parametrize( + 'baseline_type', ('exponential', 'gaussian', 'linear', 'sine', 'gaussian_small') +) 
+@pytest.mark.parametrize('signal_type', (1, 2)) +@pytest.mark.parametrize('return_baseline', (True, False)) +def test_make_data(data_size, baseline_type, signal_type, return_baseline): + """Ensures basic functionality of the make_data function.""" + output = utils.make_data( + data_size, bkg_type=baseline_type, signal_type=signal_type, return_baseline=return_baseline, + noise_std=0.2 + ) + if return_baseline: + assert len(output) == 3 + x, y, baseline = output + else: + assert len(output) == 2 + x, y = output + baseline = None + assert isinstance(x, np.ndarray) assert isinstance(y, np.ndarray) - assert isinstance(baseline, np.ndarray) - assert x.shape == (data_size,) assert y.shape == (data_size,) - assert baseline.shape == (data_size,) + + if baseline is not None: + assert isinstance(baseline, np.ndarray) + assert baseline.shape == (data_size,) def test_make_data_invalid_baseline(): """Ensures an error is raised when an invalid baseline type is specified.""" with pytest.raises(ValueError): - utils._make_data(100, 'aaaaa') + utils.make_data(100, bkg_type='aaaaa') + + +def test_make_data_invalid_signal(): + """Ensures an error is raised when an invalid signal type is specified.""" + with pytest.raises(ValueError): + utils.make_data(100, signal_type=3) @pytest.mark.parametrize('half_window', (1, 2, 22)) diff --git a/tools/logo_plot.py b/tools/logo_plot.py index 672ca0eb..9fb6d23a 100644 --- a/tools/logo_plot.py +++ b/tools/logo_plot.py @@ -1,7 +1,5 @@ # -*- coding: utf-8 -*- -"""Creates the logo plot. - -Creates the plot used for the logo; further edits are done in Inkscape to finalize the logo. +"""Creates the logo for pybaselines. 
Created on November 15, 2021 @@ -16,6 +14,7 @@ try: import matplotlib.pyplot as plt + from matplotlib import patheffects except ImportError: print('This file requires matplotlib to run') raise @@ -25,11 +24,15 @@ # assumes file is in pybaselines/tools image_directory = Path(__file__).parent - with plt.rc_context( - {'interactive': False, 'lines.linewidth': 2.5, - 'figure.dpi': 300} - ): - fig, ax = plt.subplots(tight_layout={'pad': 0.1}, frameon=False) + figure_dpi = 300 + with plt.rc_context({ + 'font.family': 'sans-serif', + 'font.sans-serif': 'arial', + }): + fig, ax = plt.subplots( + tight_layout={'pad': 0.1}, frameon=False, figsize=(1710 / figure_dpi, 473 / figure_dpi), + dpi=figure_dpi + ) x = np.linspace(1, 1000, 1000) signal = ( @@ -42,15 +45,31 @@ + utils.gaussian(x, 5, 880, 8) ) true_baseline = 2 + 1e-3 * x + utils.gaussian(x, 1, 600, 300) - noise = np.random.default_rng(1).normal(0, 0.05, x.size) + noise = np.random.default_rng(1).normal(0, 0.01, x.size) y = signal + true_baseline + noise baseline = Baseline().arpls(y, lam=1e7)[0] - blue = '#0952ff' + # for reference, see the matplotlib examples + # https://matplotlib.org/stable/gallery/text_labels_and_annotations/rainbow_text.html + # and https://matplotlib.org/stable/gallery/misc/patheffect_demo.html for how to make + # multicolored aligned text with borders + blue = '#137bff' pink = '#ff5255' - ax.plot(x, y, lw=1.5, color=blue) - ax.plot(x, baseline, lw=4, color=pink) + text_size = 52 + text_border = [patheffects.withStroke(linewidth=1.5, foreground='black')] + + ax.plot(x, y, lw=2.5, color=blue) + ax.plot(x, baseline, lw=3, color=pink) + + x_lims = ax.get_xlim() + ax.set_xlim(x_lims[0] - 200, x_lims[1] + 200) + ax.set_ylim(ax.get_ylim()[0] - 5) + text = ax.text(1, -1.8, 'py', color=blue, size=text_size, path_effects=text_border) + text = ax.annotate( + 'baselines', xycoords=text, xy=(1, 0), verticalalignment='bottom', size=text_size, + color=pink, path_effects=text_border + ) ax.set_yticks([]) 
ax.set_xticks([]) @@ -59,7 +78,10 @@ ax.spines['left'].set_visible(False) ax.spines['right'].set_visible(False) - # save as an svg so that it can be edited/scaled in inkskape without - # losing image quality - fig.savefig(image_directory.joinpath('logo_new.svg'), transparent=True) - plt.close(fig) + # No idea what's happening here, but for the first save, the scaling of the plot gets + # shrunk lengthwise such that it does not look like the displayed plot, but after saving + # again it looks correct... just overwrite the first and ignore whatever is causing this + fig.savefig(image_directory.joinpath('logo_new.png'), transparent=True) + fig.savefig(image_directory.joinpath('logo_new.png'), transparent=True) + + plt.show()