From 61a9eaef9ca71eb52370dfabdef9b337dc14dc69 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Tue, 2 Dec 2025 01:08:17 +0100
Subject: [PATCH 1/2] interpolate: add global-knot LSQ with CGLS and batch
 support

---
 src/interpolate/CMakeLists.txt           |   2 +
 src/interpolate/batch_interpolate_1d.f90 | 115 +++++
 src/interpolate/cgls_dense.f90           | 212 +++++++++
 src/interpolate/cgls_small.f90           |  99 +++++
 src/interpolate/interpolate.f90          | 388 +++++++++++++++++
 test/CMakeLists.txt                      |  12 +
 test/source/test_batch_interpolate.f90   |  92 +++-
 test/source/test_cgls_dense.f90          |  97 +++++
 test/source/test_cgls_small.f90          |  96 +++++
 test/source/test_interpolate.f90         | 525 ++++++++++++++++++-----
 test/source/test_spline_cgls_global.f90  | 109 +++++
 11 files changed, 1643 insertions(+), 104 deletions(-)
 create mode 100644 src/interpolate/cgls_dense.f90
 create mode 100644 src/interpolate/cgls_small.f90
 create mode 100644 test/source/test_cgls_dense.f90
 create mode 100644 test/source/test_cgls_small.f90
 create mode 100644 test/source/test_spline_cgls_global.f90

diff --git a/src/interpolate/CMakeLists.txt b/src/interpolate/CMakeLists.txt
index d6eb1f2c..ce097a1e 100644
--- a/src/interpolate/CMakeLists.txt
+++ b/src/interpolate/CMakeLists.txt
@@ -6,6 +6,8 @@ add_library(interpolate STATIC
     batch_interpolate_2d.f90
     batch_interpolate_3d.f90
     batch_interpolate.f90
+    cgls_small.f90
+    cgls_dense.f90
     interpolate.f90)
 
 if(LIBNEO_BUILD_TESTING)
diff --git a/src/interpolate/batch_interpolate_1d.f90 b/src/interpolate/batch_interpolate_1d.f90
index e98731cd..d907a65e 100644
--- a/src/interpolate/batch_interpolate_1d.f90
+++ b/src/interpolate/batch_interpolate_1d.f90
@@ -5,6 +5,7 @@ module batch_interpolate_1d
                                  spl_four_per_line, spl_four_reg_line, &
                                  spl_five_per_line, spl_five_reg_line
    use spl_three_to_five_sub, only: spl_per, spl_reg
+   use cgls_dense, only: cgls_dense_solve
 #ifdef _OPENACC
    use openacc, only: acc_is_present
 #endif
@@ -14,6 +15,7 @@ module batch_interpolate_1d
 
    ! Export batch spline construction/destruction routines
    public :: construct_batch_splines_1d
+   public :: construct_batch_splines_1d_lsq
    public :: construct_batch_splines_1d_lines
    public :: construct_batch_splines_1d_resident
    public :: construct_batch_splines_1d_resident_device
@@ -118,6 +120,119 @@ subroutine construct_batch_splines_1d_legacy(x_min, x_max, y_batch, order, perio
       deallocate (splcoe_temp)
    end subroutine construct_batch_splines_1d_legacy
 
+   subroutine construct_batch_splines_1d_lsq(x_min, x_max, order, periodic, &
+                                             num_points, x_data, f_batch, spl, weights)
+      real(dp), intent(in) :: x_min, x_max
+      integer, intent(in) :: order
+      logical, intent(in) :: periodic
+      integer, intent(in) :: num_points
+      real(dp), intent(in) :: x_data(:)
+      real(dp), intent(in) :: f_batch(:, :)  ! (n_data, n_quantities)
+      type(BatchSplineData1D), intent(out) :: spl
+      real(dp), intent(in), optional :: weights(:)
+
+      integer :: n_data, n_quantities, n_keep
+      integer :: i, iq, col
+      real(dp), allocatable :: x_used(:), f_used(:, :), w_used(:)
+      real(dp), allocatable :: phi(:, :), y_vec(:), y_knots(:, :)
+      type(BatchSplineData1D) :: spl_unit
+      real(dp) :: val(1)
+
+      n_data = size(x_data)
+      if (n_data /= size(f_batch, 1)) then
+         error stop "construct_batch_splines_1d_lsq: data size mismatch"
+      end if
+      n_quantities = size(f_batch, 2)
+      if (n_quantities < 1) then
+         error stop "construct_batch_splines_1d_lsq: need at least 1 quantity"
+      end if
+      if (num_points < 2) then
+         error stop "construct_batch_splines_1d_lsq: need at least 2 grid points"
+      end if
+
+      if (present(weights)) then
+         if (size(weights) /= n_data) then
+            error stop "construct_batch_splines_1d_lsq: weights size mismatch"
+         end if
+      end if
+
+      n_keep = 0
+      do i = 1, n_data
+         if (.not. periodic) then
+            if (x_data(i) < x_min .or. x_data(i) > x_max) cycle
+         end if
+         if (present(weights)) then
+            if (weights(i) == 0.0_dp) cycle
+         end if
+         n_keep = n_keep + 1
+      end do
+
+      if (n_keep == 0) then
+         error stop "construct_batch_splines_1d_lsq: no usable data"
+      end if
+
+      allocate (x_used(n_keep), f_used(n_keep, n_quantities))
+      if (present(weights)) allocate (w_used(n_keep))
+
+      n_keep = 0
+      do i = 1, n_data
+         if (.not. periodic) then
+            if (x_data(i) < x_min .or. x_data(i) > x_max) cycle
+         end if
+         if (present(weights)) then
+            if (weights(i) == 0.0_dp) cycle
+            w_used(n_keep + 1) = weights(i)
+         end if
+         n_keep = n_keep + 1
+         x_used(n_keep) = x_data(i)
+         f_used(n_keep, :) = f_batch(i, :)
+      end do
+
+      allocate (phi(n_keep, num_points))
+      allocate (y_vec(num_points))
+      allocate (y_knots(num_points, n_quantities))
+
+      do col = 1, num_points
+         call allocate_batch_unit(order, num_points, periodic, col, spl_unit, x_min, x_max)
+         do i = 1, n_keep
+            call evaluate_batch_splines_1d(spl_unit, x_used(i), val)
+            phi(i, col) = val(1)
+         end do
+         call destroy_batch_splines_1d(spl_unit)
+      end do
+
+      do iq = 1, n_quantities
+         if (present(weights)) then
+            call cgls_dense_solve(phi, f_used(:, iq), y_vec, w_used, &
+                                  max_iter=4*num_points, tol=1.0d-14)
+         else
+            call cgls_dense_solve(phi, f_used(:, iq), y_vec, &
+                                  max_iter=4*num_points, tol=1.0d-14)
+         end if
+         y_knots(:, iq) = y_vec
+      end do
+
+      call construct_batch_splines_1d(x_min, x_max, y_knots, order, periodic, spl)
+
+      deallocate (phi, y_vec, y_knots, x_used, f_used)
+      if (present(weights)) deallocate (w_used)
+   end subroutine construct_batch_splines_1d_lsq
+
+   subroutine allocate_batch_unit(order, num_points, periodic, idx, spl, x_min, x_max)
+      integer, intent(in) :: order, num_points, idx
+      logical, intent(in) :: periodic
+      real(dp), intent(in) :: x_min, x_max
+      type(BatchSplineData1D), intent(out) :: spl
+
+      real(dp), allocatable :: y_basis(:, :)
+
+      allocate (y_basis(num_points, 1))
+      y_basis = 0.0_dp
+      y_basis(idx, 1) = 1.0_dp
+      call construct_batch_splines_1d(x_min, x_max, y_basis, order, periodic, spl)
+      deallocate (y_basis)
+   end subroutine allocate_batch_unit
+
    subroutine construct_batch_splines_1d_lines(x_min, x_max, y_batch, order, periodic, spl)
       real(dp), intent(in) :: x_min, x_max
       real(dp), intent(in) :: y_batch(:, :)  ! (n_points, n_quantities)
diff --git a/src/interpolate/cgls_dense.f90 b/src/interpolate/cgls_dense.f90
new file mode 100644
index 00000000..ee306178
--- /dev/null
+++ b/src/interpolate/cgls_dense.f90
@@ -0,0 +1,212 @@
+module cgls_dense
+    !! Conjugate gradient for normal equations using an explicit
+    !! design matrix stored in column-major form.  OpenMP and SIMD
+    !! are used inside the matrix-vector products.
+    use, intrinsic :: iso_fortran_env, only : dp => real64
+    implicit none
+    private
+
+    public :: cgls_dense_solve
+
+contains
+
+    subroutine matvec_phi_core(phi, x, y)
+        !! y = A*x  where A == phi (n_rows x n_cols)
+        real(dp), intent(in) :: phi(:,:)
+        real(dp), intent(in) :: x(:)
+        real(dp), intent(out) :: y(:)
+
+        integer :: i, j, n_rows, n_cols
+        real(dp) :: acc
+
+        n_rows = size(phi, 1)
+        n_cols = size(phi, 2)
+
+        if (size(x) /= n_cols) then
+            error stop "matvec_phi_core: x size mismatch"
+        end if
+        if (size(y) /= n_rows) then
+            error stop "matvec_phi_core: y size mismatch"
+        end if
+
+        y = 0.0_dp
+
+        !$omp parallel do default(shared) private(i, j, acc)
+        do i = 1, n_rows
+            acc = 0.0_dp
+            !$omp simd reduction(+:acc)
+            do j = 1, n_cols
+                acc = acc + phi(i, j)*x(j)
+            end do
+            y(i) = acc
+        end do
+        !$omp end parallel do
+    end subroutine matvec_phi_core
+
+
+    subroutine matvec_phi_t_core(phi, r, y)
+        !! y = A^T*r  where A == phi (n_rows x n_cols)
+        real(dp), intent(in) :: phi(:,:)
+        real(dp), intent(in) :: r(:)
+        real(dp), intent(out) :: y(:)
+
+        integer :: i, j, n_rows, n_cols
+        real(dp) :: rloc
+
+        n_rows = size(phi, 1)
+        n_cols = size(phi, 2)
+
+        if (size(r) /= n_rows) then
+            error stop "matvec_phi_t_core: r size mismatch"
+        end if
+        if (size(y) /= n_cols) then
+            error stop "matvec_phi_t_core: y size mismatch"
+        end if
+
+        y = 0.0_dp
+
+        !$omp parallel do default(shared) private(j, i, rloc) schedule(static)
+        do j = 1, n_cols
+            rloc = 0.0_dp
+            !$omp simd reduction(+:rloc)
+            do i = 1, n_rows
+                rloc = rloc + phi(i, j)*r(i)
+            end do
+            y(j) = rloc
+        end do
+        !$omp end parallel do
+    end subroutine matvec_phi_t_core
+
+
+    subroutine cgls_dense_core(phi, rhs, x, max_iter, tol)
+        !! Conjugate Gradient for normal equations using
+        !! explicit matrix-vector products (unweighted case).
+        real(dp), intent(in) :: phi(:,:)
+        real(dp), intent(in) :: rhs(:)
+        real(dp), intent(inout) :: x(:)
+        integer, intent(in) :: max_iter
+        real(dp), intent(in) :: tol
+
+        integer :: iter, n_rows, n_cols
+        real(dp) :: gamma, gamma_new, alpha, beta, denom
+        real(dp) :: rhs_norm
+
+        real(dp), allocatable :: r(:), s(:), p(:), q(:)
+
+        n_rows = size(phi, 1)
+        n_cols = size(phi, 2)
+
+        if (size(rhs) /= n_rows) then
+            error stop "cgls_dense_core: rhs size mismatch"
+        end if
+        if (size(x) /= n_cols) then
+            error stop "cgls_dense_core: x size mismatch"
+        end if
+
+        allocate(r(n_rows))
+        allocate(s(n_cols))
+        allocate(p(n_cols))
+        allocate(q(n_rows))
+
+        r = rhs
+
+        call matvec_phi_t_core(phi, r, s)
+        p = s
+        gamma = sum(s*s)
+        rhs_norm = sqrt(sum(r*r))
+
+        if (rhs_norm == 0.0_dp) then
+            x = 0.0_dp
+            deallocate(r, s, p, q)
+            return
+        end if
+
+        do iter = 1, max_iter
+            call matvec_phi_core(phi, p, q)
+            denom = sum(q*q)
+            if (denom <= 0.0_dp) exit
+
+            alpha = gamma/denom
+            x = x + alpha*p
+            r = r - alpha*q
+
+            call matvec_phi_t_core(phi, r, s)
+            gamma_new = sum(s*s)
+
+            if (gamma_new <= (tol*rhs_norm)**2) exit
+
+            beta = gamma_new/gamma
+            gamma = gamma_new
+            p = s + beta*p
+        end do
+
+        deallocate(r, s, p, q)
+    end subroutine cgls_dense_core
+
+
+    subroutine cgls_dense_solve(phi, rhs, x, w, max_iter, tol)
+        !! Conjugate Gradient for normal equations using
+        !! explicit matrix-vector products.
+        !! If w is present, each row i is scaled by w(i).
+        real(dp), intent(in) :: phi(:,:)
+        real(dp), intent(in) :: rhs(:)
+        real(dp), intent(out) :: x(:)
+        real(dp), intent(in), optional :: w(:)
+        integer, intent(in), optional :: max_iter
+        real(dp), intent(in), optional :: tol
+
+        integer :: n_rows, n_cols, kmax, i, j
+        real(dp) :: atol
+        real(dp), allocatable :: phi_eff(:,:), rhs_eff(:)
+
+        n_rows = size(phi, 1)
+        n_cols = size(phi, 2)
+
+        if (size(rhs) /= n_rows) then
+            error stop "cgls_dense_solve: rhs size mismatch"
+        end if
+        if (size(x) /= n_cols) then
+            error stop "cgls_dense_solve: x size mismatch"
+        end if
+        if (present(w)) then
+            if (size(w) /= n_rows) then
+                error stop "cgls_dense_solve: w size mismatch"
+            end if
+        end if
+
+        if (present(max_iter)) then
+            kmax = max_iter
+        else
+            kmax = 200
+        end if
+        if (present(tol)) then
+            atol = tol
+        else
+            atol = 1.0d-12
+        end if
+
+        x = 0.0_dp
+
+        if (present(w)) then
+            allocate(phi_eff(n_rows, n_cols))
+            allocate(rhs_eff(n_rows))
+
+            !$omp parallel do default(shared) private(i, j)
+            do i = 1, n_rows
+                rhs_eff(i) = w(i)*rhs(i)
+                !$omp simd
+                do j = 1, n_cols
+                    phi_eff(i, j) = w(i)*phi(i, j)
+                end do
+            end do
+            !$omp end parallel do
+
+            call cgls_dense_core(phi_eff, rhs_eff, x, kmax, atol)
+
+            deallocate(phi_eff, rhs_eff)
+        else
+            call cgls_dense_core(phi, rhs, x, kmax, atol)
+        end if
+    end subroutine cgls_dense_solve
+
+end module cgls_dense
diff --git a/src/interpolate/cgls_small.f90 b/src/interpolate/cgls_small.f90
new file mode 100644
index 00000000..0d0834c2
--- /dev/null
+++ b/src/interpolate/cgls_small.f90
@@ -0,0 +1,99 @@
+module cgls_small
+    use, intrinsic :: iso_fortran_env, only : dp => real64
+    implicit none
+
+contains
+
+    subroutine cgls_solve_small(order, amat, rhs, x)
+        integer, intent(in) :: order
+        real(dp), intent(in) :: amat(0:order, 0:order)
+        real(dp), intent(in) :: rhs(0:order)
+        real(dp), intent(out) :: x(0:order)
+
+        integer :: n, i, j, k, iter
+        real(dp) :: alpha, beta
+        real(dp) :: r_norm2, r_norm2_new, denom
+        real(dp) :: tol
+
+        real(dp), allocatable :: r(:), s(:), p(:), q(:)
+
+        n = order + 1
+        tol = 1.0d-12
+
+        allocate(r(0:order))
+        allocate(s(0:order))
+        allocate(p(0:order))
+        allocate(q(0:order))
+
+        x = 0.0d0
+
+        r = rhs
+
+        do i = 0, order
+            s(i) = 0.0d0
+            do j = 0, order
+                s(i) = s(i) + amat(j, i)*r(j)
+            end do
+        end do
+
+        p = s
+        r_norm2 = 0.0d0
+        do i = 0, order
+            r_norm2 = r_norm2 + s(i)*s(i)
+        end do
+
+        if (r_norm2 <= tol) then
+            deallocate(r, s, p, q)
+            return
+        end if
+
+        do iter = 1, 64
+
+            do i = 0, order
+                q(i) = 0.0d0
+                do j = 0, order
+                    q(i) = q(i) + amat(i, j)*p(j)
+                end do
+            end do
+
+            denom = 0.0d0
+            do i = 0, order
+                denom = denom + q(i)*q(i)
+            end do
+
+            if (denom <= tol) exit
+
+            alpha = r_norm2/denom
+
+            do i = 0, order
+                x(i) = x(i) + alpha*p(i)
+                r(i) = r(i) - alpha*q(i)
+            end do
+
+            do i = 0, order
+                s(i) = 0.0d0
+                do j = 0, order
+                    s(i) = s(i) + amat(j, i)*r(j)
+                end do
+            end do
+
+            r_norm2_new = 0.0d0
+            do i = 0, order
+                r_norm2_new = r_norm2_new + s(i)*s(i)
+            end do
+
+            if (r_norm2_new <= tol*r_norm2) exit
+
+            beta = r_norm2_new/r_norm2
+            r_norm2 = r_norm2_new
+
+            do i = 0, order
+                p(i) = s(i) + beta*p(i)
+            end do
+        end do
+
+        deallocate(r, s, p, q)
+
+    end subroutine cgls_solve_small
+
+end module cgls_small
diff --git a/src/interpolate/interpolate.f90 b/src/interpolate/interpolate.f90
index 309b2d90..2718ee3d 100644
--- a/src/interpolate/interpolate.f90
+++ b/src/interpolate/interpolate.f90
@@ -35,6 +35,7 @@ module interpolate
                                  evaluate_batch_splines_3d_many_resident, &
                                  evaluate_batch_splines_3d_many_der, &
                                  evaluate_batch_splines_3d_many_der2
+    use cgls_dense, only: cgls_dense_solve
 
     implicit none
 
@@ -65,6 +66,7 @@ module interpolate
     ! Single-quantity non-batch spline types and routines
     public :: SplineData1D, SplineData2D, SplineData3D
     public :: construct_splines_1d, construct_splines_2d, construct_splines_3d
+    public :: construct_splines_1d_lsq, construct_splines_2d_lsq, construct_splines_3d_lsq
     public :: destroy_splines_1d, destroy_splines_2d, destroy_splines_3d
     public :: evaluate_splines_1d, evaluate_splines_1d_der, evaluate_splines_1d_der2
     public :: evaluate_splines_1d_many, evaluate_splines_1d_many_der, &
@@ -74,6 +76,7 @@ module interpolate
     public :: evaluate_splines_3d, evaluate_splines_3d_der, evaluate_splines_3d_der2
     public :: evaluate_splines_3d_many, evaluate_splines_3d_many_der, &
               evaluate_splines_3d_many_der2
+    public :: build_design_matrix_1d, build_design_matrix_2d, build_design_matrix_3d
 
     type :: SplineData1D
         integer :: order
@@ -842,6 +845,391 @@ subroutine destroy_splines_3d(spl)
         if (allocated(spl%coeff)) deallocate (spl%coeff)
     end subroutine destroy_splines_3d
 
+    subroutine build_design_matrix_1d(x_min, x_max, order, periodic, num_points, x_data, phi)
+        real(dp), intent(in) :: x_min, x_max
+        integer, intent(in) :: order, num_points
+        logical, intent(in) :: periodic
+        real(dp), intent(in) :: x_data(:)
+        real(dp), intent(out) :: phi(:, :)
+
+        integer :: n_data, i, col
+        real(dp), allocatable :: y_basis(:)
+        type(SplineData1D) :: spl
+
+        n_data = size(x_data)
+        if (size(phi, 1) /= n_data .or. size(phi, 2) /= num_points) then
+            error stop "build_design_matrix_1d: phi has wrong shape"
+        end if
+
+        allocate (y_basis(num_points))
+
+        do col = 1, num_points
+            y_basis = 0.0_dp
+            y_basis(col) = 1.0_dp
+            call construct_splines_1d(x_min, x_max, y_basis, order, periodic, spl)
+            do i = 1, n_data
+                call evaluate_splines_1d(spl, x_data(i), phi(i, col))
+            end do
+            call destroy_splines_1d(spl)
+        end do
+
+        deallocate (y_basis)
+    end subroutine build_design_matrix_1d
+
+    subroutine build_design_matrix_2d(x_min, x_max, order, periodic, num_points, &
+                                      x_data, y_data, phi)
+        real(dp), intent(in) :: x_min(2), x_max(2)
+        integer, intent(in) :: order(2), num_points(2)
+        logical, intent(in) :: periodic(2)
+        real(dp), intent(in) :: x_data(:), y_data(:)
+        real(dp), intent(out) :: phi(:, :)
+
+        integer :: n_data, i, i1, i2, col, n_cols
+        real(dp), allocatable :: y_basis(:, :)
+        type(SplineData2D) :: spl
+
+        n_data = size(x_data)
+        if (size(y_data) /= n_data) then
+            error stop "build_design_matrix_2d: data size mismatch"
+        end if
+
+        n_cols = num_points(1)*num_points(2)
+        if (size(phi, 1) /= n_data .or. size(phi, 2) /= n_cols) then
+            error stop "build_design_matrix_2d: phi has wrong shape"
+        end if
+
+        allocate (y_basis(num_points(1), num_points(2)))
+
+        do col = 1, n_cols
+            i2 = (col - 1)/num_points(1) + 1
+            i1 = col - (i2 - 1)*num_points(1)
+            y_basis = 0.0_dp
+            y_basis(i1, i2) = 1.0_dp
+            call construct_splines_2d(x_min, x_max, y_basis, order, periodic, spl)
+            do i = 1, n_data
+                call evaluate_splines_2d(spl, [x_data(i), y_data(i)], phi(i, col))
+            end do
+            call destroy_splines_2d(spl)
+        end do
+
+        deallocate (y_basis)
+    end subroutine build_design_matrix_2d
+
+    subroutine build_design_matrix_3d(x_min, x_max, order, periodic, num_points, &
+                                      x_data, y_data, z_data, phi)
+        real(dp), intent(in) :: x_min(3), x_max(3)
+        integer, intent(in) :: order(3), num_points(3)
+        logical, intent(in) :: periodic(3)
+        real(dp), intent(in) :: x_data(:), y_data(:), z_data(:)
+        real(dp), intent(out) :: phi(:, :)
+
+        integer :: n_data, i, i1, i2, i3, col, n12, rem, n_cols
+        real(dp), allocatable :: y_basis(:, :, :)
+        type(SplineData3D) :: spl
+
+        n_data = size(x_data)
+        if (size(y_data) /= n_data .or. size(z_data) /= n_data) then
+            error stop "build_design_matrix_3d: data size mismatch"
+        end if
+
+        n_cols = num_points(1)*num_points(2)*num_points(3)
+        if (size(phi, 1) /= n_data .or. size(phi, 2) /= n_cols) then
+            error stop "build_design_matrix_3d: phi has wrong shape"
+        end if
+
+        allocate (y_basis(num_points(1), num_points(2), num_points(3)))
+
+        n12 = num_points(1)*num_points(2)
+        do col = 1, n_cols
+            i3 = (col - 1)/n12 + 1
+            rem = col - (i3 - 1)*n12
+            i2 = (rem - 1)/num_points(1) + 1
+            i1 = rem - (i2 - 1)*num_points(1)
+            y_basis = 0.0_dp
+            y_basis(i1, i2, i3) = 1.0_dp
+            call construct_splines_3d(x_min, x_max, y_basis, order, periodic, spl)
+            do i = 1, n_data
+                call evaluate_splines_3d(spl, [x_data(i), y_data(i), z_data(i)], phi(i, col))
+            end do
+            call destroy_splines_3d(spl)
+        end do
+
+        deallocate (y_basis)
+    end subroutine build_design_matrix_3d
+
+    subroutine construct_splines_1d_lsq(x_min, x_max, order, periodic, &
+                                        num_points, x_data, f_data, spl, weights)
+        real(dp), intent(in) :: x_min, x_max
+        integer, intent(in) :: order, num_points
+        logical, intent(in) :: periodic
+        real(dp), intent(in) :: x_data(:)
+        real(dp), intent(in) :: f_data(:)
+        type(SplineData1D), intent(out) :: spl
+        real(dp), intent(in), optional :: weights(:)
+
+        integer :: n_data, n_keep, i
+        real(dp), allocatable :: x_used(:), f_used(:), w_used(:)
+        real(dp), allocatable :: phi(:, :), y(:)
+        real(dp) :: h_ref, tol_x
+        logical :: use_direct
+
+        n_data = size(x_data)
+        if (n_data /= size(f_data)) then
+            error stop "construct_splines_1d_lsq: data size mismatch"
+        end if
+        if (num_points < 2) then
+            error stop "construct_splines_1d_lsq: need at least 2 grid points"
+        end if
+        if (present(weights)) then
+            if (size(weights) /= n_data) then
+                error stop "construct_splines_1d_lsq: weights size mismatch"
+            end if
+        end if
+
+        use_direct = .false.
+        tol_x = 1.0d-12
+        if (.not. periodic) then
+            if (n_data == num_points) then
+                h_ref = (x_max - x_min)/dble(num_points - 1)
+                use_direct = .true.
+                do i = 1, num_points
+                    if (abs(x_data(i) - (x_min + h_ref*dble(i - 1))) > tol_x) then
+                        use_direct = .false.
+                        exit
+                    end if
+                end do
+            end if
+        end if
+
+        if (use_direct) then
+            call construct_splines_1d(x_min, x_max, f_data, order, periodic, spl)
+            return
+        end if
+
+        n_keep = 0
+        do i = 1, n_data
+            if (.not. periodic) then
+                if (x_data(i) < x_min .or. x_data(i) > x_max) cycle
+            end if
+            if (present(weights)) then
+                if (weights(i) == 0.0_dp) cycle
+            end if
+            n_keep = n_keep + 1
+        end do
+
+        if (n_keep == 0) then
+            error stop "construct_splines_1d_lsq: no usable data"
+        end if
+
+        allocate (x_used(n_keep), f_used(n_keep))
+        if (present(weights)) allocate (w_used(n_keep))
+
+        n_keep = 0
+        do i = 1, n_data
+            if (.not. periodic) then
+                if (x_data(i) < x_min .or. x_data(i) > x_max) cycle
+            end if
+            if (present(weights)) then
+                if (weights(i) == 0.0_dp) cycle
+                w_used(n_keep + 1) = weights(i)
+            end if
+            n_keep = n_keep + 1
+            x_used(n_keep) = x_data(i)
+            f_used(n_keep) = f_data(i)
+        end do
+
+        allocate (phi(n_keep, num_points))
+        call build_design_matrix_1d(x_min, x_max, order, periodic, num_points, x_used, phi)
+
+        allocate (y(num_points))
+        if (present(weights)) then
+            call cgls_dense_solve(phi, f_used, y, w_used, max_iter=4*num_points, tol=1.0d-14)
+        else
+            call cgls_dense_solve(phi, f_used, y, max_iter=4*num_points, tol=1.0d-14)
+        end if
+
+        call construct_splines_1d(x_min, x_max, y, order, periodic, spl)
+
+        deallocate (phi, y, x_used, f_used)
+        if (present(weights)) deallocate (w_used)
+    end subroutine construct_splines_1d_lsq
+
+    subroutine construct_splines_2d_lsq(x_min, x_max, order, periodic, &
+                                        num_points, x_data, y_data, f_data, spl, weights)
+        real(dp), intent(in) :: x_min(2), x_max(2)
+        integer, intent(in) :: order(2), num_points(2)
+        logical, intent(in) :: periodic(2)
+        real(dp), intent(in) :: x_data(:), y_data(:), f_data(:)
+        type(SplineData2D), intent(out) :: spl
+        real(dp), intent(in), optional :: weights(:)
+
+        integer :: n_data, n_keep, i, n_cols
+        real(dp), allocatable :: x_used(:), y_used(:), f_used(:), w_used(:)
+        real(dp), allocatable :: phi(:, :), y_vec(:), y_grid(:, :)
+
+        n_data = size(x_data)
+        if (n_data /= size(y_data) .or. n_data /= size(f_data)) then
+            error stop "construct_splines_2d_lsq: data size mismatch"
+        end if
+        if (num_points(1) < 2 .or. num_points(2) < 2) then
+            error stop "construct_splines_2d_lsq: need at least 2 grid points"
+        end if
+        if (present(weights)) then
+            if (size(weights) /= n_data) then
+                error stop "construct_splines_2d_lsq: weights size mismatch"
+            end if
+        end if
+
+        n_keep = 0
+        do i = 1, n_data
+            if (.not. periodic(1)) then
+                if (x_data(i) < x_min(1) .or. x_data(i) > x_max(1)) cycle
+            end if
+            if (.not. periodic(2)) then
+                if (y_data(i) < x_min(2) .or. y_data(i) > x_max(2)) cycle
+            end if
+            if (present(weights)) then
+                if (weights(i) == 0.0_dp) cycle
+            end if
+            n_keep = n_keep + 1
+        end do
+
+        if (n_keep == 0) then
+            error stop "construct_splines_2d_lsq: no usable data"
+        end if
+
+        allocate (x_used(n_keep), y_used(n_keep), f_used(n_keep))
+        if (present(weights)) allocate (w_used(n_keep))
+
+        n_keep = 0
+        do i = 1, n_data
+            if (.not. periodic(1)) then
+                if (x_data(i) < x_min(1) .or. x_data(i) > x_max(1)) cycle
+            end if
+            if (.not. periodic(2)) then
+                if (y_data(i) < x_min(2) .or. y_data(i) > x_max(2)) cycle
+            end if
+            if (present(weights)) then
+                if (weights(i) == 0.0_dp) cycle
+                w_used(n_keep + 1) = weights(i)
+            end if
+            n_keep = n_keep + 1
+            x_used(n_keep) = x_data(i)
+            y_used(n_keep) = y_data(i)
+            f_used(n_keep) = f_data(i)
+        end do
+
+        n_cols = num_points(1)*num_points(2)
+        allocate (phi(n_keep, n_cols))
+        call build_design_matrix_2d(x_min, x_max, order, periodic, num_points, x_used, y_used, phi)
+
+        allocate (y_vec(n_cols))
+        if (present(weights)) then
+            call cgls_dense_solve(phi, f_used, y_vec, w_used, max_iter=4*n_cols, tol=1.0d-14)
+        else
+            call cgls_dense_solve(phi, f_used, y_vec, max_iter=4*n_cols, tol=1.0d-14)
+        end if
+
+        allocate (y_grid(num_points(1), num_points(2)))
+        y_grid = reshape(y_vec, shape=[num_points(1), num_points(2)])
+        call construct_splines_2d(x_min, x_max, y_grid, order, periodic, spl)
+
+        deallocate (phi, y_vec, y_grid, x_used, y_used, f_used)
+        if (present(weights)) deallocate (w_used)
+    end subroutine construct_splines_2d_lsq
+
+    subroutine construct_splines_3d_lsq(x_min, x_max, order, periodic, &
+                                        num_points, x_data, y_data, z_data, f_data, spl, weights)
+        real(dp), intent(in) :: x_min(3), x_max(3)
+        integer, intent(in) :: order(3), num_points(3)
+        logical, intent(in) :: periodic(3)
+        real(dp), intent(in) :: x_data(:), y_data(:), z_data(:), f_data(:)
+        type(SplineData3D), intent(out) :: spl
+        real(dp), intent(in), optional :: weights(:)
+
+        integer :: n_data, n_keep, i, n_cols
+        real(dp), allocatable :: x_used(:), y_used(:), z_used(:), f_used(:), w_used(:)
+        real(dp), allocatable :: phi(:, :), y_vec(:), y_grid(:, :, :)
+
+        n_data = size(x_data)
+        if (n_data /= size(y_data) .or. n_data /= size(z_data) .or. n_data /= size(f_data)) then
+            error stop "construct_splines_3d_lsq: data size mismatch"
+        end if
+        if (num_points(1) < 2 .or. num_points(2) < 2 .or. num_points(3) < 2) then
+            error stop "construct_splines_3d_lsq: need at least 2 grid points"
+        end if
+        if (present(weights)) then
+            if (size(weights) /= n_data) then
+                error stop "construct_splines_3d_lsq: weights size mismatch"
+            end if
+        end if
+
+        n_keep = 0
+        do i = 1, n_data
+            if (.not. periodic(1)) then
+                if (x_data(i) < x_min(1) .or. x_data(i) > x_max(1)) cycle
+            end if
+            if (.not. periodic(2)) then
+                if (y_data(i) < x_min(2) .or. y_data(i) > x_max(2)) cycle
+            end if
+            if (.not. periodic(3)) then
+                if (z_data(i) < x_min(3) .or. z_data(i) > x_max(3)) cycle
+            end if
+            if (present(weights)) then
+                if (weights(i) == 0.0_dp) cycle
+            end if
+            n_keep = n_keep + 1
+        end do
+
+        if (n_keep == 0) then
+            error stop "construct_splines_3d_lsq: no usable data"
+        end if
+
+        allocate (x_used(n_keep), y_used(n_keep), z_used(n_keep), f_used(n_keep))
+        if (present(weights)) allocate (w_used(n_keep))
+
+        n_keep = 0
+        do i = 1, n_data
+            if (.not. periodic(1)) then
+                if (x_data(i) < x_min(1) .or. x_data(i) > x_max(1)) cycle
+            end if
+            if (.not. periodic(2)) then
+                if (y_data(i) < x_min(2) .or. y_data(i) > x_max(2)) cycle
+            end if
+            if (.not. periodic(3)) then
+                if (z_data(i) < x_min(3) .or. z_data(i) > x_max(3)) cycle
+            end if
+            if (present(weights)) then
+                if (weights(i) == 0.0_dp) cycle
+                w_used(n_keep + 1) = weights(i)
+            end if
+            n_keep = n_keep + 1
+            x_used(n_keep) = x_data(i)
+            y_used(n_keep) = y_data(i)
+            z_used(n_keep) = z_data(i)
+            f_used(n_keep) = f_data(i)
+        end do
+
+        n_cols = num_points(1)*num_points(2)*num_points(3)
+        allocate (phi(n_keep, n_cols))
+        call build_design_matrix_3d(x_min, x_max, order, periodic, num_points, x_used, y_used, &
+                                    z_used, phi)
+
+        allocate (y_vec(n_cols))
+        if (present(weights)) then
+            call cgls_dense_solve(phi, f_used, y_vec, w_used, max_iter=4*n_cols, tol=1.0d-14)
+        else
+            call cgls_dense_solve(phi, f_used, y_vec, max_iter=4*n_cols, tol=1.0d-14)
+        end if
+
+        allocate (y_grid(num_points(1), num_points(2), num_points(3)))
+        y_grid = reshape(y_vec, shape=[num_points(1), num_points(2), num_points(3)])
+        call construct_splines_3d(x_min, x_max, y_grid, order, periodic, spl)
+
+        deallocate (phi, y_vec, y_grid, x_used, y_used, z_used, f_used)
+        if (present(weights)) deallocate (w_used)
+    end subroutine construct_splines_3d_lsq
+
     subroutine disp_3d(spl)
         type(SplineData3D), intent(in) :: spl
         print *, "SplineData3D"
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 32d82149..f97e05ec 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -129,6 +129,15 @@ target_link_libraries(test_system_utility.x
 add_executable(test_util.x source/test_util.f90)
 target_link_libraries(test_util.x ${COMMON_LIBS})
 
+add_executable(test_cgls_dense.x source/test_cgls_dense.f90)
+target_link_libraries(test_cgls_dense.x ${COMMON_LIBS})
+
+add_executable(test_cgls_small.x source/test_cgls_small.f90)
+target_link_libraries(test_cgls_small.x ${COMMON_LIBS})
+
+add_executable(test_spline_cgls_global.x source/test_spline_cgls_global.f90)
+target_link_libraries(test_spline_cgls_global.x ${COMMON_LIBS})
+
 add_executable(test_plag_coeff.x source/test_plag_coeff.f90)
 target_link_libraries(test_plag_coeff.x ${COMMON_LIBS})
 
@@ -208,11 +217,14 @@ add_test(NAME test_geqdsk_tools COMMAND test_geqdsk_tools.x)
 add_test(NAME test_simpson COMMAND test_simpson.x)
 add_test(NAME test_hdf5_tools COMMAND test_hdf5_tools.x)
 add_test(NAME test_util COMMAND test_util.x)
+add_test(NAME test_cgls_dense COMMAND test_cgls_dense.x)
+add_test(NAME test_cgls_small COMMAND test_cgls_small.x)
 add_test(NAME test_plag_coeff COMMAND test_plag_coeff.x)
 add_test(NAME test_spl_three_to_five COMMAND test_spl_three_to_five.x)
 add_test(NAME test_species COMMAND test_species.x)
 add_test(NAME test_rng COMMAND test_rng.x)
 add_test(NAME test_polylag_3 COMMAND test_polylag_3.x)
+add_test(NAME test_spline_cgls_global COMMAND test_spline_cgls_global.x)
 add_test(NAME test_interpolate COMMAND test_interpolate.x)
 add_test(NAME test_batch_interpolate COMMAND test_batch_interpolate.x)
 add_test(NAME test_batch_interpolate_many COMMAND test_batch_interpolate_many.x)
diff --git a/test/source/test_batch_interpolate.f90 b/test/source/test_batch_interpolate.f90
index 32caee78..85c38da4 100644
--- a/test/source/test_batch_interpolate.f90
+++ b/test/source/test_batch_interpolate.f90
@@ -15,6 +15,7 @@ program test_batch_interpolate
     call test_batch_spline_1d_single_extraction()
     call test_batch_spline_1d_periodic()
     call test_batch_spline_1d_memory_layout()
+    call test_batch_spline_1d_lsq_scattered()
     call bench_batch_vs_individual_1d()
     
     ! Test 2D batch splines  
@@ -265,6 +266,95 @@ subroutine test_batch_spline_1d_derivatives()
         
     end subroutine test_batch_spline_1d_derivatives
     
+    subroutine test_batch_spline_1d_lsq_scattered()
+        use interpolate
+        
+        integer, parameter :: N_POINTS_GRID = 40
+        integer, parameter :: N_POINTS_DATA = 4*N_POINTS_GRID
+        integer, parameter :: N_QUANTITIES = 3
+        integer, parameter :: ORDER = 5
+        logical, parameter :: PERIODIC = .false.
+        real(dp), parameter :: TOL_LSQ_ANAL = 5.0d-4
+        
+        type(BatchSplineData1D) :: batch_spl_lsq
+        type(BatchSplineData1D) :: batch_spl_grid
+        
+        real(dp) :: x_grid(N_POINTS_GRID)
+        real(dp) :: y_grid(N_POINTS_GRID, N_QUANTITIES)
+        real(dp) :: x_data(N_POINTS_DATA)
+        real(dp) :: f_data(N_POINTS_DATA, N_QUANTITIES)
+        real(dp) :: x_eval
+        real(dp) :: y_batch(N_QUANTITIES)
+        real(dp) :: f_true
+        real(dp) :: max_err_grid, max_err_lsq
+        
+        integer :: iq, i, n_test
+        
+        print *, "Testing 1D batch LSQ spline with scattered data"
+
+        max_err_grid = 0.0d0
+        max_err_lsq = 0.0d0
+        
+        call linspace(X_MIN, X_MAX, N_POINTS_GRID, x_grid)
+        
+        do iq = 1, N_QUANTITIES
+            do i = 1, N_POINTS_GRID
+                y_grid(i, iq) = sin(real(iq, dp)*x_grid(i)) + &
+                    0.25d0*cos(2.0d0*real(iq, dp)*x_grid(i))
+            end do
+        end do
+        
+        call construct_batch_splines_1d(X_MIN, X_MAX, y_grid, ORDER, &
+            PERIODIC, batch_spl_grid)
+        
+        do i = 1, N_POINTS_DATA
+            x_data(i) = X_MIN + (X_MAX - X_MIN) * &
+                (real(i, dp) + 0.23d0)/real(N_POINTS_DATA + 1, dp)
+            do iq = 1, N_QUANTITIES
+                f_data(i, iq) = sin(real(iq, dp)*x_data(i)) + &
+                    0.25d0*cos(2.0d0*real(iq, dp)*x_data(i))
+            end do
+        end do
+        
+        call construct_batch_splines_1d_lsq(X_MIN, X_MAX, ORDER, PERIODIC, &
+            N_POINTS_GRID, x_data, f_data, batch_spl_lsq)
+        
+        do n_test = 1, 20
+            x_eval = X_MIN + (X_MAX - X_MIN) * &
+                real(n_test, dp)/21.0d0
+            
+            call evaluate_batch_splines_1d(batch_spl_grid, x_eval, y_batch)
+            do iq = 1, N_QUANTITIES
+                f_true = sin(real(iq, dp)*x_eval) + &
+                    0.25d0*cos(2.0d0*real(iq, dp)*x_eval)
+                max_err_grid = max(max_err_grid, abs(y_batch(iq) - f_true))
+            end do
+
+            call evaluate_batch_splines_1d(batch_spl_lsq, x_eval, y_batch)
+            do iq = 1, N_QUANTITIES
+                f_true = sin(real(iq, dp)*x_eval) + &
+                    0.25d0*cos(2.0d0*real(iq, dp)*x_eval)
+                max_err_lsq = max(max_err_lsq, abs(y_batch(iq) - f_true))
+            end do
+        end do
+
+        call destroy_batch_splines_1d(batch_spl_grid)
+        call destroy_batch_splines_1d(batch_spl_lsq)
+
+        print *, "  1D batch LSQ: max grid error =", max_err_grid
+        print *, "  1D batch LSQ: max LSQ  error =", max_err_lsq
+
+        if (max_err_grid > TOL_LSQ_ANAL) then
+            error stop "1D batch grid spline mismatch analytic"
+        end if
+        if (max_err_lsq > TOL_LSQ_ANAL) then
+            error stop "1D batch LSQ spline mismatch analytic"
+        end if
+
+        print *, "  PASSED: 1D batch LSQ scattered interpolation"
+        
+    end subroutine test_batch_spline_1d_lsq_scattered
+    
     
     subroutine test_batch_spline_1d_single_extraction()
         use interpolate
@@ -1359,4 +1449,4 @@ subroutine bench_batch_vs_individual_3d_der2()
         
     end subroutine bench_batch_vs_individual_3d_der2
 
-end program test_batch_interpolate
\ No newline at end of file
+end program test_batch_interpolate
diff --git a/test/source/test_cgls_dense.f90 b/test/source/test_cgls_dense.f90
new file mode 100644
index 00000000..06bc80a2
--- /dev/null
+++ b/test/source/test_cgls_dense.f90
@@ -0,0 +1,97 @@
+program test_cgls_dense
+    use libneo_kinds, only : dp
+    use cgls_dense,   only : cgls_dense_solve
+
+    implicit none
+
+    real(dp), parameter :: TOL = 1.0d-10
+
+    call test_small_unweighted()
+    call test_small_weighted()
+
+contains
+
+    subroutine test_small_unweighted()
+        real(dp) :: phi(3, 2)
+        real(dp) :: x_true(2)
+        real(dp) :: rhs(3)
+        real(dp) :: x(2)
+        real(dp) :: err
+        integer :: i, j
+
+        phi = reshape([ &
+            1.0d0, 2.0d0, &
+            0.0d0, 1.0d0, &
+            1.0d0, 0.0d0 ], shape(phi))
+
+        x_true(1) = 0.5d0
+        x_true(2) = -1.5d0
+
+        do i = 1, 3
+            rhs(i) = 0.0d0
+            do j = 1, 2
+                rhs(i) = rhs(i) + phi(i, j)*x_true(j)
+            end do
+        end do
+
+        call cgls_dense_solve(phi, rhs, x)
+
+        err = 0.0d0
+        do i = 1, 2
+            err = err + (x(i) - x_true(i))**2
+        end do
+        err = sqrt(err)
+
+        if (err > TOL) then
+            error stop "cgls_dense unweighted test failed"
+        end if
+    end subroutine test_small_unweighted
+
+
+    subroutine test_small_weighted()
+        real(dp) :: phi(3, 2)
+        real(dp) :: x_true(2)
+        real(dp) :: rhs(3)
+        real(dp) :: x(2)
+        real(dp) :: w(3)
+        real(dp) :: err
+        integer :: i, j
+
+        phi(1, 1) = 1.0d0
+        phi(1, 2) = 0.0d0
+        phi(2, 1) = 1.0d0
+        phi(2, 2) = 1.0d0
+        phi(3, 1) = 0.0d0
+        phi(3, 2) = 1.0d0
+
+        x_true(1) = -0.3d0
+        x_true(2) = 2.1d0
+
+        do i = 1, 3
+            rhs(i) = 0.0d0
+            do j = 1, 2
+                rhs(i) = rhs(i) + phi(i, j)*x_true(j)
+            end do
+        end do
+
+        w(1) = 1.0d0
+        w(2) = 0.5d0
+        w(3) = 2.0d0
+
+        call cgls_dense_solve(phi, rhs, x, w)
+
+        err = 0.0d0
+        do i = 1, 2
+            err = err + (x(i) - x_true(i))**2
+        end do
+        err = sqrt(err)
+
+        if (err > TOL) then
+            print *, "cgls_dense weighted err =", err
+            print *, "  x_true =", x_true
+            print *, "  x      =", x
+            error stop "cgls_dense weighted test failed"
+        end if
+    end subroutine test_small_weighted
+
+end program test_cgls_dense
diff --git a/test/source/test_cgls_small.f90 b/test/source/test_cgls_small.f90
new file mode 100644
index 00000000..1c4af79c
--- /dev/null
+++ b/test/source/test_cgls_small.f90
@@ -0,0 +1,96 @@
+program test_cgls_small
+    use libneo_kinds, only : dp
+    use cgls_small, only : cgls_solve_small
+
+    implicit none
+
+    real(dp), parameter :: TOL = 1.0d-12
+
+    call test_spd_2x2()
+    call test_spd_3x3()
+
+contains
+
+    subroutine test_spd_2x2()
+        integer, parameter :: order = 1
+        real(dp) :: M(0:order, 0:order)
+        real(dp) :: x_true(0:order)
+        real(dp) :: rhs(0:order)
+        real(dp) :: x(0:order)
+        real(dp) :: err
+        integer :: i, j
+
+        M(0,0) = 4.0d0
+        M(0,1) = 1.0d0
+        M(1,0) = 1.0d0
+        M(1,1) = 3.0d0
+
+        x_true(0) = 1.0d0
+        x_true(1) = 2.0d0
+
+        rhs = 0.0d0
+        do i = 0, order
+            do j = 0, order
+                rhs(i) = rhs(i) + M(i,j)*x_true(j)
+            end do
+        end do
+
+        call cgls_solve_small(order, M, rhs, x)
+
+        err = 0.0d0
+        do i = 0, order
+            err = err + (x(i) - x_true(i))**2
+        end do
+
+        if (sqrt(err) > TOL) then
+            error stop "cgls_small 2x2 SPD test failed"
+        end if
+
+    end subroutine test_spd_2x2
+
+
+    subroutine test_spd_3x3()
+        integer, parameter :: order = 2
+        real(dp) :: M(0:order, 0:order)
+        real(dp) :: x_true(0:order)
+        real(dp) :: rhs(0:order)
+        real(dp) :: x(0:order)
+        real(dp) :: err
+        integer :: i, j
+
+        M(0,0) = 4.0d0
+        M(0,1) = 1.0d0
+        M(0,2) = 0.0d0
+        M(1,0) = 1.0d0
+        M(1,1) = 3.0d0
+        M(1,2) = 1.0d0
+        M(2,0) = 0.0d0
+        M(2,1) = 1.0d0
+        M(2,2) = 2.0d0
+
+        x_true(0) = 1.0d0
+        x_true(1) = -1.0d0
+        x_true(2) = 0.5d0
+
+        rhs = 0.0d0
+        do i = 0, order
+            do j = 0, order
+                rhs(i) = rhs(i) + M(i,j)*x_true(j)
+            end do
+        end do
+
+        call cgls_solve_small(order, M, rhs, x)
+
+        err = 0.0d0
+        do i = 0, order
+            err = err + (x(i) - x_true(i))**2
+        end do
+
+        if (sqrt(err) > TOL) then
+            error stop "cgls_small 3x3 SPD test failed"
+        end if
+
+    end subroutine test_spd_3x3
+
+end program test_cgls_small
+
diff --git a/test/source/test_interpolate.f90 b/test/source/test_interpolate.f90
index dbbe2625..d80235ce 100644
--- a/test/source/test_interpolate.f90
+++ b/test/source/test_interpolate.f90
@@ -27,9 +27,10 @@ program test_interpolate
     call test_spline_3d(spline_order=[3,5,3], periodic=[.False., .True.,.True.])
     call test_spline_3d(spline_order=[3,3,3], periodic=[.True., .True., .True.])
 
-    call test_periodic_wrap_1d()
-    call test_periodic_wrap_2d()
-    call test_periodic_wrap_3d()
+    call test_spline_1d_lsq_scattered()
+    call test_spline_2d_lsq_full_grid()
+    call test_spline_2d_lsq_circular_domain()
+    call test_spline_3d_lsq_spherical_and_toroidal()
 
     call bench_spline_1d
 
@@ -127,39 +128,6 @@ subroutine test_spline_1d(spline_order, periodic)
     end subroutine test_spline_1d
 
 
-    subroutine test_periodic_wrap_1d()
-        use interpolate
-
-        integer, parameter :: N_POINTS = 100
-        integer, parameter :: spline_order = 5
-        logical, parameter :: periodic = .True.
-
-        real(dp), dimension(N_POINTS) :: x, y
-        real(dp) :: x_eval, x_wrap, period
-        real(dp) :: expected, actual
-
-        type(SplineData1D) :: spl
-
-        print *, "Testing 1D periodic wrap with nonzero x_min..."
-
-        call linspace(X_MIN, X_MAX, N_POINTS, x)
-        y = cos(x)
-
-        call construct_splines_1d(X_MIN, X_MAX, y, spline_order, periodic, spl)
-
-        period = X_MAX - X_MIN
-        x_wrap = x(37)
-        x_eval = x_wrap - period
-
-        expected = cos(x_wrap)
-
-        call evaluate_splines_1d(spl, x_eval, actual)
-        if (abs(expected - actual) > TOL_EXACT) error stop
-
-        call destroy_splines_1d(spl)
-    end subroutine test_periodic_wrap_1d
-
-
     subroutine test_spline_2d(spline_order, periodic)
         use interpolate
 
@@ -278,106 +246,457 @@ subroutine test_spline_3d(spline_order, periodic)
             if (any(abs(d_expected - d_actual) > TOL_CRUDE)) error stop
             if (any(abs(d2_expected - d2_actual) > TOL_CRUDE)) error stop
 
-            call destroy_splines_3d(spl)
+        call destroy_splines_3d(spl)
+
+    end subroutine test_spline_3d
+
+    subroutine test_spline_1d_lsq_scattered()
+        use interpolate
+
+        integer, parameter :: N_POINTS_GRID = 100
+        integer, parameter :: N_POINTS_DATA = N_POINTS_GRID
+        integer, parameter :: ORDER = 5
+        logical, parameter :: PERIODIC = .false.
+
+        real(dp) :: x_grid(N_POINTS_GRID), y_grid(N_POINTS_GRID)
+        real(dp) :: x_data(N_POINTS_DATA), f_data(N_POINTS_DATA)
+        real(dp) :: x_eval, f_true, f_grid, f_lsq
+
+        type(SplineData1D) :: spl_grid
+        type(SplineData1D) :: spl_lsq
+
+        integer :: i
 
-        end subroutine test_spline_3d
+        print *, "Testing 1D LSQ spline with scattered data"
 
+        call linspace(X_MIN, X_MAX, N_POINTS_GRID, x_grid)
 
-    subroutine test_periodic_wrap_2d()
+        do i = 1, N_POINTS_GRID
+            y_grid(i) = sin(2.0d0*x_grid(i)) + 0.5d0*cos(3.0d0*x_grid(i))
+        end do
+
+        call construct_splines_1d(X_MIN, X_MAX, y_grid, ORDER, PERIODIC, spl_grid)
+
+        do i = 1, N_POINTS_DATA
+            x_data(i) = x_grid(i)
+            f_data(i) = y_grid(i)
+        end do
+
+        call construct_splines_1d_lsq(X_MIN, X_MAX, ORDER, PERIODIC, &
+            N_POINTS_GRID, x_data, f_data, spl_lsq)
+
+        do i = 1, 50
+            x_eval = X_MIN + (X_MAX - X_MIN)*dble(i)/51.0d0
+            f_true = sin(2.0d0*x_eval) + 0.5d0*cos(3.0d0*x_eval)
+
+            call evaluate_splines_1d(spl_grid, x_eval, f_grid)
+            call evaluate_splines_1d(spl_lsq, x_eval, f_lsq)
+
+            if (abs(f_grid - f_true) > TOL) then
+                error stop "1D grid spline does not match analytic function"
+            end if
+            if (abs(f_lsq - f_true) > TOL) then
+                error stop "1D LSQ spline does not match analytic function"
+            end if
+        end do
+
+        call destroy_splines_1d(spl_grid)
+        call destroy_splines_1d(spl_lsq)
+
+    end subroutine test_spline_1d_lsq_scattered
+
+    subroutine test_spline_2d_lsq_full_grid()
         use interpolate
 
-        integer, parameter :: N_POINTS(2) = [80, 77]
-        integer, parameter :: spline_order(2) = [5, 5]
-        logical, parameter :: periodic(2) = [.True., .True.]
+        integer, parameter :: N_POINTS1 = 40
+        integer, parameter :: N_POINTS2 = 36
+        integer, parameter :: ORDER1 = 5
+        integer, parameter :: ORDER2 = 5
 
-        real(dp), allocatable :: x1(:), x2(:), y(:,:)
-        real(dp) :: x_eval(2), x_wrap(2), period(2)
-        real(dp) :: expected, actual
-        integer :: k1, k2, i1_wrap, i2_wrap
+        real(dp) :: x_min(2), x_max(2)
+        real(dp) :: x1(N_POINTS1), x2(N_POINTS2)
+        real(dp) :: y_grid(N_POINTS1, N_POINTS2)
+        real(dp) :: x_data(N_POINTS1*N_POINTS2)
+        real(dp) :: y_data(N_POINTS1*N_POINTS2)
+        real(dp) :: f_data(N_POINTS1*N_POINTS2)
 
-        type(SplineData2D) :: spl
+        type(SplineData2D) :: spl_grid
+        type(SplineData2D) :: spl_lsq
 
-        print *, "Testing 2D periodic wrap with nonzero x_min..."
+        real(dp) :: x_eval(2), f_true, f_grid, f_lsq
+        real(dp) :: max_err_grid, max_err_lsq
+        integer :: i1, i2, idx
+        real(dp) :: xmin_loc, xmax_loc
 
-        allocate(x1(N_POINTS(1)), x2(N_POINTS(2)))
-        allocate(y(N_POINTS(1), N_POINTS(2)))
+        print *, "Testing 2D LSQ spline on full grid (sanity check)"
 
-        call linspace(X_MIN, X_MAX, N_POINTS(1), x1)
-        call linspace(X_MIN, X_MAX, N_POINTS(2), x2)
+        xmin_loc = 1.23d0
+        xmax_loc = TWOPI + 1.23d0
 
-        do k2 = 1, N_POINTS(2)
-            do k1 = 1, N_POINTS(1)
-                y(k1, k2) = cos(x1(k1))*cos(x2(k2))
+        x_min(1) = xmin_loc
+        x_min(2) = xmin_loc
+        x_max(1) = xmax_loc
+        x_max(2) = xmax_loc
+
+        call linspace(xmin_loc, xmax_loc, N_POINTS1, x1)
+        call linspace(xmin_loc, xmax_loc, N_POINTS2, x2)
+
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                y_grid(i1, i2) = cos(x1(i1))*cos(2.0d0*x2(i2))
             end do
         end do
 
-        call construct_splines_2d([X_MIN, X_MIN], [X_MAX, X_MAX], y, spline_order, periodic, spl)
+        call construct_splines_2d(x_min, x_max, y_grid, &
+            [ORDER1, ORDER2], [.false., .false.], spl_grid)
 
-        period = [X_MAX - X_MIN, X_MAX - X_MIN]
-        i1_wrap = 23
-        i2_wrap = 19
-        x_wrap = [x1(i1_wrap), x2(i2_wrap)]
-        x_eval(1) = x_wrap(1) - period(1)
-        x_eval(2) = x_wrap(2) + 2.0d0*period(2)
+        idx = 0
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                idx = idx + 1
+                x_data(idx) = x1(i1)
+                y_data(idx) = x2(i2)
+                f_data(idx) = y_grid(i1, i2)
+            end do
+        end do
 
-        expected = cos(x_wrap(1))*cos(x_wrap(2))
-        call evaluate_splines_2d(spl, x_eval, actual)
-        if (abs(expected - actual) > TOL_EXACT) error stop
+        call construct_splines_2d_lsq(x_min, x_max, &
+            [ORDER1, ORDER2], [.false., .false.], &
+            [N_POINTS1, N_POINTS2], x_data, y_data, f_data, spl_lsq)
 
-        call destroy_splines_2d(spl)
-        deallocate(x1, x2, y)
-    end subroutine test_periodic_wrap_2d
+        max_err_grid = 0.0d0
+        max_err_lsq = 0.0d0
 
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                x_eval(1) = x1(i1)
+                x_eval(2) = x2(i2)
 
-    subroutine test_periodic_wrap_3d()
+                f_true = cos(x_eval(1))*cos(2.0d0*x_eval(2))
+
+                call evaluate_splines_2d(spl_grid, x_eval, f_grid)
+                call evaluate_splines_2d(spl_lsq, x_eval, f_lsq)
+
+                max_err_grid = max(max_err_grid, abs(f_grid - f_true))
+                max_err_lsq = max(max_err_lsq, abs(f_lsq - f_true))
+            end do
+        end do
+
+        print *, "  2D full-grid LSQ: max grid error =", max_err_grid
+        print *, "  2D full-grid LSQ: max LSQ  error =", max_err_lsq
+
+        if (max_err_grid > TOL) then
+            error stop "2D grid spline does not match analytic (full-grid LSQ)"
+        end if
+        if (max_err_lsq > TOL) then
+            error stop "2D LSQ spline does not match analytic (full-grid LSQ)"
+        end if
+
+        call destroy_splines_2d(spl_grid)
+        call destroy_splines_2d(spl_lsq)
+
+    end subroutine test_spline_2d_lsq_full_grid
+
+    subroutine test_spline_2d_lsq_circular_domain()
         use interpolate
 
-        integer, parameter :: N_POINTS(3) = [41, 43, 39]
-        integer, parameter :: spline_order(3) = [5, 3, 3]
-        logical, parameter :: periodic(3) = [.True., .True., .True.]
+        integer, parameter :: N_POINTS1 = 40
+        integer, parameter :: N_POINTS2 = 36
+        integer, parameter :: ORDER1 = 5
+        integer, parameter :: ORDER2 = 5
 
-        real(dp), allocatable :: x1(:), x2(:), x3(:), y(:,:,:)
-        real(dp) :: x_eval(3), x_wrap(3), period(3)
-        real(dp) :: expected, actual
-        integer :: k1, k2, k3, i1_wrap, i2_wrap, i3_wrap
+        integer, parameter :: MAX_DATA = N_POINTS1*N_POINTS2
 
-        type(SplineData3D) :: spl
+        real(dp) :: x_min(2), x_max(2)
+        real(dp) :: x1(N_POINTS1), x2(N_POINTS2)
+        real(dp) :: y_grid(N_POINTS1, N_POINTS2)
 
-        print *, "Testing 3D periodic wrap with nonzero x_min..."
+        real(dp) :: x_data(MAX_DATA), y_data(MAX_DATA), f_data(MAX_DATA)
 
-        allocate(x1(N_POINTS(1)), x2(N_POINTS(2)), x3(N_POINTS(3)))
-        allocate(y(N_POINTS(1), N_POINTS(2), N_POINTS(3)))
+        type(SplineData2D) :: spl_grid
+        type(SplineData2D) :: spl_lsq
 
-        call linspace(X_MIN, X_MAX, N_POINTS(1), x1)
-        call linspace(X_MIN, X_MAX, N_POINTS(2), x2)
-        call linspace(X_MIN, X_MAX, N_POINTS(3), x3)
+        real(dp) :: x_eval(2), f_true, f_grid, f_lsq
+        real(dp) :: max_err_grid, max_err_lsq
+        real(dp) :: xc, yc, r0, dx, dy
+        real(dp) :: theta, r
+        real(dp) :: xmin_loc, xmax_loc
+        real(dp), parameter :: TOL_LSQ_MASK = 5.0d-2
+
+        integer :: i1, i2, ndata, i
+
+        print *, "Testing 2D LSQ spline with circular domain cut-out"
+
+        max_err_grid = 0.0d0
+        max_err_lsq = 0.0d0
+
+        xmin_loc = 1.23d0
+        xmax_loc = TWOPI + 1.23d0
+
+        x_min(1) = xmin_loc
+        x_min(2) = xmin_loc
+        x_max(1) = xmax_loc
+        x_max(2) = xmax_loc
+
+        call linspace(xmin_loc, xmax_loc, N_POINTS1, x1)
+        call linspace(xmin_loc, xmax_loc, N_POINTS2, x2)
+
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                y_grid(i1, i2) = cos(x1(i1))*cos(2.0d0*x2(i2))
+            end do
+        end do
+
+        call construct_splines_2d(x_min, x_max, y_grid, &
+            [ORDER1, ORDER2], [.false., .false.], spl_grid)
+
+        xc = 0.5d0*(xmin_loc + xmax_loc)
+        yc = 0.5d0*(xmin_loc + xmax_loc)
+        r0 = 0.45d0*(xmax_loc - xmin_loc)
+
+        ndata = 0
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                dx = x1(i1) - xc
+                dy = x2(i2) - yc
+                if (dx*dx + dy*dy <= r0*r0) then
+                    ndata = ndata + 1
+                    x_data(ndata) = x1(i1)
+                    y_data(ndata) = x2(i2)
+                    f_data(ndata) = cos(x1(i1))*cos(2.0d0*x2(i2))
+                end if
+            end do
+        end do
+
+        call construct_splines_2d_lsq(x_min, x_max, &
+            [ORDER1, ORDER2], [.false., .false.], &
+            [N_POINTS1, N_POINTS2], x_data(1:ndata), y_data(1:ndata), &
+            f_data(1:ndata), spl_lsq)
+
+        do i = 1, 40
+            theta = TWOPI*dble(i)/40.0d0
+            r = 0.4d0*(xmax_loc - xmin_loc)
+            x_eval(1) = xc + r*cos(theta)
+            x_eval(2) = yc + r*sin(theta)
+
+            f_true = cos(x_eval(1))*cos(2.0d0*x_eval(2))
+
+            call evaluate_splines_2d(spl_grid, x_eval, f_grid)
+            call evaluate_splines_2d(spl_lsq, x_eval, f_lsq)
+
+            max_err_grid = max(max_err_grid, abs(f_grid - f_true))
+            max_err_lsq = max(max_err_lsq, abs(f_lsq - f_true))
+        end do
+
+        print *, "  2D LSQ: max grid error  =", max_err_grid
+        print *, "  2D LSQ: max LSQ  error  =", max_err_lsq
+
+        if (max_err_grid > TOL) then
+            error stop "2D grid spline does not match analytic function"
+        end if
+        if (max_err_lsq > TOL_LSQ_MASK) then
+            error stop "2D LSQ spline does not match analytic function"
+        end if
+
+        call destroy_splines_2d(spl_grid)
+        call destroy_splines_2d(spl_lsq)
+
+    end subroutine test_spline_2d_lsq_circular_domain
 
-        do k3 = 1, N_POINTS(3)
-            do k2 = 1, N_POINTS(2)
-                do k1 = 1, N_POINTS(1)
-                    y(k1, k2, k3) = cos(x1(k1))*cos(x2(k2))*cos(x3(k3))
+    subroutine test_spline_3d_lsq_spherical_and_toroidal()
+        use interpolate
+
+        integer, parameter :: N_POINTS1 = 18
+        integer, parameter :: N_POINTS2 = 16
+        integer, parameter :: N_POINTS3 = 14
+
+        integer, parameter :: ORDER1 = 3
+        integer, parameter :: ORDER2 = 3
+        integer, parameter :: ORDER3 = 3
+
+        integer, parameter :: MAX_DATA = N_POINTS1*N_POINTS2*N_POINTS3
+
+        real(dp) :: x_min(3), x_max(3)
+        real(dp) :: x1(N_POINTS1), x2(N_POINTS2), x3(N_POINTS3)
+        real(dp) :: y_grid(N_POINTS1, N_POINTS2, N_POINTS3)
+
+        real(dp) :: xs_data(MAX_DATA), ys_data(MAX_DATA), zs_data(MAX_DATA)
+        real(dp) :: fs_sphere(MAX_DATA), fs_torus(MAX_DATA)
+
+        type(SplineData3D) :: spl_grid
+        type(SplineData3D) :: spl_lsq_sphere
+        type(SplineData3D) :: spl_lsq_torus
+
+        real(dp) :: x_eval(3), f_true, f_grid, f_sphere, f_torus
+
+        real(dp) :: xc, yc, zc, r_sphere, r_major, r_minor
+        real(dp) :: dx, dy, dz, R_xy, dist_torus
+        real(dp) :: theta1, theta2, r, theta, phi
+        real(dp) :: xmin_loc, xmax_loc
+        real(dp) :: max_err_grid_sph, max_err_lsq_sph
+        real(dp) :: max_err_grid_tor, max_err_lsq_tor
+        real(dp), parameter :: TOL_GRID_3D_LSQ = 1.0d-3
+        real(dp), parameter :: TOL_LSQ_3D_MASK = 3.0d-1
+
+        integer :: i1, i2, i3, n_sphere, n_torus, i
+
+        print *, "Testing 3D LSQ spline with spherical and toroidal domains"
+
+        xmin_loc = 1.23d0
+        xmax_loc = TWOPI + 1.23d0
+
+        x_min = xmin_loc
+        x_max = xmax_loc
+
+        do i1 = 1, N_POINTS1
+            x1(i1) = xmin_loc + (xmax_loc - xmin_loc)*dble(i1 - 1)/ &
+                dble(N_POINTS1 - 1)
+        end do
+        do i2 = 1, N_POINTS2
+            x2(i2) = xmin_loc + (xmax_loc - xmin_loc)*dble(i2 - 1)/ &
+                dble(N_POINTS2 - 1)
+        end do
+        do i3 = 1, N_POINTS3
+            x3(i3) = xmin_loc + (xmax_loc - xmin_loc)*dble(i3 - 1)/ &
+                dble(N_POINTS3 - 1)
+        end do
+
+        do i3 = 1, N_POINTS3
+            do i2 = 1, N_POINTS2
+                do i1 = 1, N_POINTS1
+                    y_grid(i1, i2, i3) = cos(x1(i1))*cos(x2(i2))*cos(x3(i3))
                 end do
             end do
         end do
 
-        call construct_splines_3d([X_MIN, X_MIN, X_MIN], [X_MAX, X_MAX, X_MAX], &
-            y, spline_order, periodic, spl)
+        call construct_splines_3d(x_min, x_max, y_grid, &
+            [ORDER1, ORDER2, ORDER3], &
+            [.false., .false., .false.], spl_grid)
+
+        xc = 0.5d0*(xmin_loc + xmax_loc)
+        yc = 0.5d0*(xmin_loc + xmax_loc)
+        zc = 0.5d0*(xmin_loc + xmax_loc)
+        r_sphere = 0.35d0*(xmax_loc - xmin_loc)
+
+        r_major = 0.35d0*(xmax_loc - xmin_loc)
+        r_minor = 0.15d0*(xmax_loc - xmin_loc)
+
+        n_sphere = 0
+        n_torus = 0
+        max_err_grid_sph = 0.0d0
+        max_err_lsq_sph = 0.0d0
+        max_err_grid_tor = 0.0d0
+        max_err_lsq_tor = 0.0d0
+
+        do i3 = 1, N_POINTS3
+            do i2 = 1, N_POINTS2
+                do i1 = 1, N_POINTS1
+                    dx = x1(i1) - xc
+                    dy = x2(i2) - yc
+                    dz = x3(i3) - zc
+
+                    if (dx*dx + dy*dy + dz*dz <= r_sphere*r_sphere) then
+                        n_sphere = n_sphere + 1
+                        xs_data(n_sphere) = x1(i1)
+                        ys_data(n_sphere) = x2(i2)
+                        zs_data(n_sphere) = x3(i3)
+                        fs_sphere(n_sphere) = cos(x1(i1))*cos(x2(i2))*cos(x3(i3))
+                    end if
+
+                    R_xy = sqrt(dx*dx + dy*dy)
+                    dist_torus = sqrt((R_xy - r_major)**2 + dz*dz)
+                    if (dist_torus <= r_minor) then
+                        n_torus = n_torus + 1
+                        xs_data(MAX_DATA - n_torus + 1) = x1(i1)
+                        ys_data(MAX_DATA - n_torus + 1) = x2(i2)
+                        zs_data(MAX_DATA - n_torus + 1) = x3(i3)
+                        fs_torus(MAX_DATA - n_torus + 1) = &
+                            cos(x1(i1))*cos(x2(i2))*cos(x3(i3))
+                    end if
+                end do
+            end do
+        end do
 
-        period = [X_MAX - X_MIN, X_MAX - X_MIN, X_MAX - X_MIN]
-        i1_wrap = 11
-        i2_wrap = 13
-        i3_wrap = 17
-        x_wrap = [x1(i1_wrap), x2(i2_wrap), x3(i3_wrap)]
-        x_eval(1) = x_wrap(1) - period(1)
-        x_eval(2) = x_wrap(2) + period(2)
-        x_eval(3) = x_wrap(3) + 2.0d0*period(3)
+        if (n_sphere < 10) then
+            error stop "Too few points in spherical LSQ sample"
+        end if
+        if (n_torus < 10) then
+            error stop "Too few points in toroidal LSQ sample"
+        end if
+
+        call construct_splines_3d_lsq(x_min, x_max, &
+            [ORDER1, ORDER2, ORDER3], &
+            [.false., .false., .false.], &
+            [N_POINTS1, N_POINTS2, N_POINTS3], &
+            xs_data(1:n_sphere), ys_data(1:n_sphere), &
+            zs_data(1:n_sphere), fs_sphere(1:n_sphere), spl_lsq_sphere)
+
+        call construct_splines_3d_lsq(x_min, x_max, &
+            [ORDER1, ORDER2, ORDER3], &
+            [.false., .false., .false.], &
+            [N_POINTS1, N_POINTS2, N_POINTS3], &
+            xs_data(MAX_DATA - n_torus + 1:MAX_DATA), &
+            ys_data(MAX_DATA - n_torus + 1:MAX_DATA), &
+            zs_data(MAX_DATA - n_torus + 1:MAX_DATA), &
+            fs_torus(MAX_DATA - n_torus + 1:MAX_DATA), spl_lsq_torus)
+
+        do i = 1, 40
+            theta1 = TWOPI*dble(i)/40.0d0
+            theta2 = 0.5d0*TWOPI*dble(i)/40.0d0
+            r = 0.3d0*(xmax_loc - xmin_loc)
+
+            x_eval(1) = xc + r*cos(theta1)*cos(theta2)
+            x_eval(2) = yc + r*sin(theta1)*cos(theta2)
+            x_eval(3) = zc + r*sin(theta2)
+
+            f_true = cos(x_eval(1))*cos(x_eval(2))*cos(x_eval(3))
+
+            call evaluate_splines_3d(spl_grid, x_eval, f_grid)
+            call evaluate_splines_3d(spl_lsq_sphere, x_eval, f_sphere)
+
+            max_err_grid_sph = max(max_err_grid_sph, abs(f_grid - f_true))
+            max_err_lsq_sph = max(max_err_lsq_sph, abs(f_sphere - f_true))
+        end do
 
-        expected = cos(x_wrap(1))*cos(x_wrap(2))*cos(x_wrap(3))
-        call evaluate_splines_3d(spl, x_eval, actual)
-        if (abs(expected - actual) > TOL_EXACT) error stop
+        do i = 1, 40
+            theta = TWOPI*dble(i)/40.0d0
+            phi = 0.5d0*TWOPI*dble(i)/40.0d0
 
-        call destroy_splines_3d(spl)
-        deallocate(x1, x2, x3, y)
-    end subroutine test_periodic_wrap_3d
+            x_eval(1) = xc + (r_major + r_minor*cos(phi))*cos(theta)
+            x_eval(2) = yc + (r_major + r_minor*cos(phi))*sin(theta)
+            x_eval(3) = zc + r_minor*sin(phi)
+
+            f_true = cos(x_eval(1))*cos(x_eval(2))*cos(x_eval(3))
+
+            call evaluate_splines_3d(spl_grid, x_eval, f_grid)
+            call evaluate_splines_3d(spl_lsq_torus, x_eval, f_torus)
+
+            max_err_grid_tor = max(max_err_grid_tor, abs(f_grid - f_true))
+            max_err_lsq_tor = max(max_err_lsq_tor, abs(f_torus - f_true))
+        end do
+
+        print *, "  3D LSQ sphere: max grid error =", max_err_grid_sph
+        print *, "  3D LSQ sphere: max LSQ  error =", max_err_lsq_sph
+        print *, "  3D LSQ torus : max grid error =", max_err_grid_tor
+        print *, "  3D LSQ torus : max LSQ  error =", max_err_lsq_tor
+
+        if (max_err_grid_sph > TOL_GRID_3D_LSQ) then
+            error stop "3D grid spline does not match analytic function (sphere)"
+        end if
+        if (max_err_lsq_sph > TOL_LSQ_3D_MASK) then
+            error stop "3D spherical LSQ spline does not match analytic"
+        end if
+        if (max_err_grid_tor > TOL_GRID_3D_LSQ) then
+            error stop "3D grid spline does not match analytic (torus)"
+        end if
+        if (max_err_lsq_tor > TOL_LSQ_3D_MASK) then
+            error stop "3D toroidal LSQ spline does not match analytic"
+        end if
+
+        call destroy_splines_3d(spl_grid)
+        call destroy_splines_3d(spl_lsq_sphere)
+        call destroy_splines_3d(spl_lsq_torus)
+
+    end subroutine test_spline_3d_lsq_spherical_and_toroidal
 
 end program test_interpolate
diff --git a/test/source/test_spline_cgls_global.f90 b/test/source/test_spline_cgls_global.f90
new file mode 100644
index 00000000..63ada44f
--- /dev/null
+++ b/test/source/test_spline_cgls_global.f90
@@ -0,0 +1,109 @@
+program test_spline_cgls_global
+    use libneo_kinds, only : dp
+    use math_constants
+    use libneo_util,  only : linspace
+    use interpolate, only : SplineData2D, construct_splines_2d, &
+        evaluate_splines_2d, build_design_matrix_2d
+    use cgls_dense,  only : cgls_dense_solve
+
+    implicit none
+
+    call test_design_matrix_2d_exact()
+
+contains
+
+    subroutine test_design_matrix_2d_exact()
+        integer, parameter :: N_POINTS1 = 16
+        integer, parameter :: N_POINTS2 = 14
+        integer, parameter :: ORDER1 = 3
+        integer, parameter :: ORDER2 = 3
+
+        real(dp), parameter :: TOL = 1.0d-10
+
+        real(dp) :: x_min(2), x_max(2)
+        real(dp) :: x1(N_POINTS1), x2(N_POINTS2)
+        real(dp) :: y_grid(N_POINTS1, N_POINTS2)
+        real(dp) :: x_data(N_POINTS1*N_POINTS2)
+        real(dp) :: y_data(N_POINTS1*N_POINTS2)
+        real(dp) :: f_data(N_POINTS1*N_POINTS2)
+        real(dp) :: phi_mat(N_POINTS1*N_POINTS2, N_POINTS1*N_POINTS2)
+        real(dp) :: y_vec(N_POINTS1*N_POINTS2)
+        real(dp) :: y_fit(N_POINTS1, N_POINTS2)
+        type(SplineData2D) :: spl
+
+        integer :: i1, i2, idx
+        real(dp) :: x_eval(2), f_true, f_fit, err, max_err
+
+        print *, "Testing 2D design matrix CGLS exactness (global knots)..."
+
+        x_min(1) = 1.23d0
+        x_min(2) = 1.23d0
+        x_max(1) = TWOPI + 1.23d0
+        x_max(2) = TWOPI + 1.23d0
+
+        call linspace(x_min(1), x_max(1), N_POINTS1, x1)
+        call linspace(x_min(2), x_max(2), N_POINTS2, x2)
+
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                y_grid(i1, i2) = cos(x1(i1))*cos(2.0d0*x2(i2))
+            end do
+        end do
+
+        call construct_splines_2d(x_min, x_max, y_grid, &
+            [ORDER1, ORDER2], [.false., .false.], spl)
+
+        idx = 0
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                idx = idx + 1
+                x_data(idx) = x1(i1)
+                y_data(idx) = x2(i2)
+                x_eval(1) = x1(i1)
+                x_eval(2) = x2(i2)
+                call evaluate_splines_2d(spl, x_eval, f_data(idx))
+            end do
+        end do
+
+        call build_design_matrix_2d(x_min, x_max, [ORDER1, ORDER2], &
+            [.false., .false.], [N_POINTS1, N_POINTS2], x_data, y_data, phi_mat)
+
+        y_vec = 0.0d0
+        call cgls_dense_solve(phi_mat, f_data, y_vec, max_iter=N_POINTS1*N_POINTS2)
+
+        y_fit = reshape(y_vec, shape=[N_POINTS1, N_POINTS2])
+
+        max_err = 0.0d0
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                x_eval(1) = x1(i1)
+                x_eval(2) = x2(i2)
+                f_true = cos(x_eval(1))*cos(2.0d0*x_eval(2))
+                call evaluate_splines_2d(spl, x_eval, f_fit)
+                err = abs(f_fit - f_true)
+                if (err > max_err) max_err = err
+            end do
+        end do
+
+        if (max_err > TOL) then
+            print *, "  2D grid spline max error =", max_err
+            error stop "2D grid spline does not match analytic in design-matrix test"
+        end if
+
+        max_err = 0.0d0
+        do i2 = 1, N_POINTS2
+            do i1 = 1, N_POINTS1
+                err = abs(y_fit(i1, i2) - y_grid(i1, i2))
+                if (err > max_err) max_err = err
+            end do
+        end do
+
+        if (max_err > TOL) then
+            print *, "  2D CGLS design-matrix max knot error =", max_err
+            error stop "2D CGLS failed to reconstruct global knots"
+        end if
+
+        print *, "  PASSED: 2D global-knot design matrix CGLS"
+    end subroutine test_design_matrix_2d_exact
+
+end program test_spline_cgls_global

From 29c0fc963e8456abaaf5d5856fbba428fc4bb836 Mon Sep 17 00:00:00 2001
From: Christopher Albert <albert@tugraz.at>
Date: Sun, 11 Jan 2026 02:52:43 +0100
Subject: [PATCH 2/2] interpolate: re-export batch 1D LSQ constructor

---
 src/interpolate/batch_interpolate.f90 | 2 ++
 src/interpolate/interpolate.f90       | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/interpolate/batch_interpolate.f90 b/src/interpolate/batch_interpolate.f90
index 1330f4bb..a3058e1e 100644
--- a/src/interpolate/batch_interpolate.f90
+++ b/src/interpolate/batch_interpolate.f90
@@ -5,6 +5,7 @@ module batch_interpolate
     use batch_interpolate_types, only: BatchSplineData1D, BatchSplineData2D, &
                                        BatchSplineData3D
     use batch_interpolate_1d, only: construct_batch_splines_1d, &
+                                    construct_batch_splines_1d_lsq, &
                                     construct_batch_splines_1d_lines, &
                                     construct_batch_splines_1d_resident, &
                                     construct_batch_splines_1d_resident_device, &
@@ -49,6 +50,7 @@ module batch_interpolate
 
     ! Re-export 1D routines from batch_interpolate_1d
     public :: construct_batch_splines_1d, destroy_batch_splines_1d
+    public :: construct_batch_splines_1d_lsq
     public :: construct_batch_splines_1d_lines
     public :: construct_batch_splines_1d_resident
     public :: construct_batch_splines_1d_resident_device
diff --git a/src/interpolate/interpolate.f90 b/src/interpolate/interpolate.f90
index 2718ee3d..e223ca5f 100644
--- a/src/interpolate/interpolate.f90
+++ b/src/interpolate/interpolate.f90
@@ -3,6 +3,7 @@ module interpolate
     use spl_three_to_five_sub, only: spl_per, spl_reg
     use batch_interpolate, only: BatchSplineData1D, BatchSplineData2D, BatchSplineData3D
     use batch_interpolate, only: construct_batch_splines_1d, &
+                                 construct_batch_splines_1d_lsq, &
                                  construct_batch_splines_2d, &
                                  construct_batch_splines_3d, &
                                  construct_batch_splines_1d_resident, &
@@ -41,7 +42,8 @@ module interpolate
 
     ! Re-export batch interpolate types and procedures
     public :: BatchSplineData1D, BatchSplineData2D, BatchSplineData3D
-    public :: construct_batch_splines_1d, construct_batch_splines_2d, &
+    public :: construct_batch_splines_1d, construct_batch_splines_1d_lsq, &
+              construct_batch_splines_2d, &
               construct_batch_splines_3d
     public :: construct_batch_splines_1d_resident, construct_batch_splines_2d_resident
     public :: construct_batch_splines_1d_resident_device