itpplasma · krystophny · Dec 2, 2025 · Jan 11, 2026
diff --git a/src/interpolate/CMakeLists.txt b/src/interpolate/CMakeLists.txt
@@ -6,6 +6,8 @@ add_library(interpolate STATIC
     batch_interpolate_2d.f90
     batch_interpolate_3d.f90
     batch_interpolate.f90
+    cgls_small.f90
+    cgls_dense.f90
     interpolate.f90)
 
 if(LIBNEO_BUILD_TESTING)

diff --git a/src/interpolate/batch_interpolate.f90 b/src/interpolate/batch_interpolate.f90
@@ -5,6 +5,7 @@ module batch_interpolate
     use batch_interpolate_types, only: BatchSplineData1D, BatchSplineData2D, &
                                        BatchSplineData3D
     use batch_interpolate_1d, only: construct_batch_splines_1d, &
+                                    construct_batch_splines_1d_lsq, &
                                     construct_batch_splines_1d_lines, &
                                     construct_batch_splines_1d_resident, &
                                     construct_batch_splines_1d_resident_device, &
@@ -49,6 +50,7 @@ module batch_interpolate
 
     ! Re-export 1D routines from batch_interpolate_1d
     public :: construct_batch_splines_1d, destroy_batch_splines_1d
+    public :: construct_batch_splines_1d_lsq
     public :: construct_batch_splines_1d_lines
     public :: construct_batch_splines_1d_resident
     public :: construct_batch_splines_1d_resident_device

diff --git a/src/interpolate/batch_interpolate_1d.f90 b/src/interpolate/batch_interpolate_1d.f90
@@ -5,6 +5,7 @@ module batch_interpolate_1d
                                  spl_four_per_line, spl_four_reg_line, &
                                  spl_five_per_line, spl_five_reg_line
    use spl_three_to_five_sub, only: spl_per, spl_reg
+   use cgls_dense, only: cgls_dense_solve
 #ifdef _OPENACC
    use openacc, only: acc_is_present
 #endif
@@ -14,6 +15,7 @@ module batch_interpolate_1d
 
    ! Export batch spline construction/destruction routines
    public :: construct_batch_splines_1d
+   public :: construct_batch_splines_1d_lsq
    public :: construct_batch_splines_1d_lines
    public :: construct_batch_splines_1d_resident
    public :: construct_batch_splines_1d_resident_device
@@ -118,6 +120,119 @@ subroutine construct_batch_splines_1d_legacy(x_min, x_max, y_batch, order, perio
       deallocate (splcoe_temp)
    end subroutine construct_batch_splines_1d_legacy
 
+   subroutine construct_batch_splines_1d_lsq(x_min, x_max, order, periodic, &
+                                             num_points, x_data, f_batch, spl, weights)
+      real(dp), intent(in) :: x_min, x_max
+      integer, intent(in) :: order
+      logical, intent(in) :: periodic
+      integer, intent(in) :: num_points
+      real(dp), intent(in) :: x_data(:)
+      real(dp), intent(in) :: f_batch(:, :)  ! (n_data, n_quantities)
+      type(BatchSplineData1D), intent(out) :: spl
+      real(dp), intent(in), optional :: weights(:)
+
+      integer :: n_data, n_quantities, n_keep
+      integer :: i, iq, col
+      real(dp), allocatable :: x_used(:), f_used(:, :), w_used(:)
+      real(dp), allocatable :: phi(:, :), y_vec(:), y_knots(:, :)
+      type(BatchSplineData1D) :: spl_unit
+      real(dp) :: val(1)
+
+      n_data = size(x_data)
+      if (n_data /= size(f_batch, 1)) then
+         error stop "construct_batch_splines_1d_lsq: data size mismatch"
+      end if
+      n_quantities = size(f_batch, 2)
+      if (n_quantities < 1) then
+         error stop "construct_batch_splines_1d_lsq: need at least 1 quantity"
+      end if
+      if (num_points < 2) then
+         error stop "construct_batch_splines_1d_lsq: need at least 2 grid points"
+      end if
+
+      if (present(weights)) then
+         if (size(weights) /= n_data) then
+            error stop "construct_batch_splines_1d_lsq: weights size mismatch"
+         end if
+      end if
+
+      n_keep = 0
+      do i = 1, n_data
+         if (.not. periodic) then
+            if (x_data(i) < x_min .or. x_data(i) > x_max) cycle
+         end if
+         if (present(weights)) then
+            if (weights(i) == 0.0_dp) cycle
+         end if
+         n_keep = n_keep + 1
+      end do
+
+      if (n_keep == 0) then
+         error stop "construct_batch_splines_1d_lsq: no usable data"
+      end if
+
+      allocate (x_used(n_keep), f_used(n_keep, n_quantities))
+      if (present(weights)) allocate (w_used(n_keep))
+
+      n_keep = 0
+      do i = 1, n_data
+         if (.not. periodic) then
+            if (x_data(i) < x_min .or. x_data(i) > x_max) cycle
+         end if
+         if (present(weights)) then
+            if (weights(i) == 0.0_dp) cycle
+            w_used(n_keep + 1) = weights(i)
+         end if
+         n_keep = n_keep + 1
+         x_used(n_keep) = x_data(i)
+         f_used(n_keep, :) = f_batch(i, :)
+      end do
+
+      allocate (phi(n_keep, num_points))
+      allocate (y_vec(num_points))
+      allocate (y_knots(num_points, n_quantities))
+
+      do col = 1, num_points
+         call allocate_batch_unit(order, num_points, periodic, col, spl_unit, x_min, x_max)
+         do i = 1, n_keep
+            call evaluate_batch_splines_1d(spl_unit, x_used(i), val)
+            phi(i, col) = val(1)
+         end do
+         call destroy_batch_splines_1d(spl_unit)
+      end do
+
+      do iq = 1, n_quantities
+         if (present(weights)) then
+            call cgls_dense_solve(phi, f_used(:, iq), y_vec, w_used, &
+                                  max_iter=4*num_points, tol=1.0d-14)
+         else
+            call cgls_dense_solve(phi, f_used(:, iq), y_vec, &
+                                  max_iter=4*num_points, tol=1.0d-14)
+         end if
+         y_knots(:, iq) = y_vec
+      end do
+
+      call construct_batch_splines_1d(x_min, x_max, y_knots, order, periodic, spl)
+
+      deallocate (phi, y_vec, y_knots, x_used, f_used)
+      if (present(weights)) deallocate (w_used)
+   end subroutine construct_batch_splines_1d_lsq
+
+   subroutine allocate_batch_unit(order, num_points, periodic, idx, spl, x_min, x_max)
+      integer, intent(in) :: order, num_points, idx
+      logical, intent(in) :: periodic
+      real(dp), intent(in) :: x_min, x_max
+      type(BatchSplineData1D), intent(out) :: spl
+
+      real(dp), allocatable :: y_basis(:, :)
+
+      allocate (y_basis(num_points, 1))
+      y_basis = 0.0_dp
+      y_basis(idx, 1) = 1.0_dp
+      call construct_batch_splines_1d(x_min, x_max, y_basis, order, periodic, spl)
+      deallocate (y_basis)
+   end subroutine allocate_batch_unit
+
    subroutine construct_batch_splines_1d_lines(x_min, x_max, y_batch, order, periodic, spl)
       real(dp), intent(in) :: x_min, x_max
       real(dp), intent(in) :: y_batch(:, :)  ! (n_points, n_quantities)

diff --git a/src/interpolate/cgls_dense.f90 b/src/interpolate/cgls_dense.f90
@@ -0,0 +1,212 @@
+module cgls_dense
+    !! Conjugate gradient for normal equations using an explicit
+    !! design matrix stored in column-major form.  OpenMP and SIMD
+    !! are used inside the matrix-vector products.
+    use, intrinsic :: iso_fortran_env, only : dp => real64
+    implicit none
+    private
+
+    public :: cgls_dense_solve
+
+contains
+
+    subroutine matvec_phi_core(phi, x, y)
+        !! y = A*x  where A == phi (n_rows x n_cols)
+        real(dp), intent(in) :: phi(:,:)
+        real(dp), intent(in) :: x(:)
+        real(dp), intent(out) :: y(:)
+
+        integer :: i, j, n_rows, n_cols
+        real(dp) :: acc
+
+        n_rows = size(phi, 1)
+        n_cols = size(phi, 2)
+
+        if (size(x) /= n_cols) then
+            error stop "matvec_phi_core: x size mismatch"
+        end if
+        if (size(y) /= n_rows) then
+            error stop "matvec_phi_core: y size mismatch"
+        end if
+
+        y = 0.0_dp
+
+        !$omp parallel do default(shared) private(i, j, acc)
+        do i = 1, n_rows
+            acc = 0.0_dp
+            !$omp simd reduction(+:acc)
+            do j = 1, n_cols
+                acc = acc + phi(i, j)*x(j)
+            end do
+            y(i) = acc
+        end do
+        !$omp end parallel do
+    end subroutine matvec_phi_core
+
+
+    subroutine matvec_phi_t_core(phi, r, y)
+        !! y = A^T*r  where A == phi (n_rows x n_cols)
+        real(dp), intent(in) :: phi(:,:)
+        real(dp), intent(in) :: r(:)
+        real(dp), intent(out) :: y(:)
+
+        integer :: i, j, n_rows, n_cols
+        real(dp) :: rloc
+
+        n_rows = size(phi, 1)
+        n_cols = size(phi, 2)
+
+        if (size(r) /= n_rows) then
+            error stop "matvec_phi_t_core: r size mismatch"
+        end if
+        if (size(y) /= n_cols) then
+            error stop "matvec_phi_t_core: y size mismatch"
+        end if
+
+        y = 0.0_dp
+
+        !$omp parallel do default(shared) private(j, i, rloc) schedule(static)
+        do j = 1, n_cols
+            rloc = 0.0_dp
+            !$omp simd reduction(+:rloc)
+            do i = 1, n_rows
+                rloc = rloc + phi(i, j)*r(i)
+            end do
+            y(j) = rloc
+        end do
+        !$omp end parallel do
+    end subroutine matvec_phi_t_core
+
+
+    subroutine cgls_dense_core(phi, rhs, x, max_iter, tol)
+        !! Conjugate Gradient for normal equations using
+        !! explicit matrix-vector products (unweighted case).
+        real(dp), intent(in) :: phi(:,:)
+        real(dp), intent(in) :: rhs(:)
+        real(dp), intent(inout) :: x(:)
+        integer, intent(in) :: max_iter
+        real(dp), intent(in) :: tol
+
+        integer :: iter, n_rows, n_cols
+        real(dp) :: gamma, gamma_new, alpha, beta, denom
+        real(dp) :: rhs_norm
+
+        real(dp), allocatable :: r(:), s(:), p(:), q(:)
+
+        n_rows = size(phi, 1)
+        n_cols = size(phi, 2)
+
+        if (size(rhs) /= n_rows) then
+            error stop "cgls_dense_core: rhs size mismatch"
+        end if
+        if (size(x) /= n_cols) then
+            error stop "cgls_dense_core: x size mismatch"
+        end if
+
+        allocate(r(n_rows))
+        allocate(s(n_cols))
+        allocate(p(n_cols))
+        allocate(q(n_rows))
+
+        r = rhs
+
+        call matvec_phi_t_core(phi, r, s)
+        p = s
+        gamma = sum(s*s)
+        rhs_norm = sqrt(sum(r*r))
+
+        if (rhs_norm == 0.0_dp) then
+            x = 0.0_dp
+            deallocate(r, s, p, q)
+            return
+        end if
+
+        do iter = 1, max_iter
+            call matvec_phi_core(phi, p, q)
+            denom = sum(q*q)
+            if (denom <= 0.0_dp) exit
+
+            alpha = gamma/denom
+            x = x + alpha*p
+            r = r - alpha*q
+
+            call matvec_phi_t_core(phi, r, s)
+            gamma_new = sum(s*s)
+
+            if (gamma_new <= (tol*rhs_norm)**2) exit
+
+            beta = gamma_new/gamma
+            gamma = gamma_new
+            p = s + beta*p
+        end do
+
+        deallocate(r, s, p, q)
+    end subroutine cgls_dense_core
+
+
+    subroutine cgls_dense_solve(phi, rhs, x, w, max_iter, tol)
+        !! Conjugate Gradient for normal equations using
+        !! explicit matrix-vector products.
+        !! If w is present, each row i is scaled by w(i).
+        real(dp), intent(in) :: phi(:,:)
+        real(dp), intent(in) :: rhs(:)
+        real(dp), intent(out) :: x(:)
+        real(dp), intent(in), optional :: w(:)
+        integer, intent(in), optional :: max_iter
+        real(dp), intent(in), optional :: tol
+
+        integer :: n_rows, n_cols, kmax, i, j
+        real(dp) :: atol
+        real(dp), allocatable :: phi_eff(:,:), rhs_eff(:)
+
+        n_rows = size(phi, 1)
+        n_cols = size(phi, 2)
+
+        if (size(rhs) /= n_rows) then
+            error stop "cgls_dense_solve: rhs size mismatch"
+        end if
+        if (size(x) /= n_cols) then
+            error stop "cgls_dense_solve: x size mismatch"
+        end if
+        if (present(w)) then
+            if (size(w) /= n_rows) then
+                error stop "cgls_dense_solve: w size mismatch"
+            end if
+        end if
+
+        if (present(max_iter)) then
+            kmax = max_iter
+        else
+            kmax = 200
+        end if
+        if (present(tol)) then
+            atol = tol
+        else
+            atol = 1.0d-12
+        end if
+
+        x = 0.0_dp
+
+        if (present(w)) then
+            allocate(phi_eff(n_rows, n_cols))
+            allocate(rhs_eff(n_rows))
+
+            !$omp parallel do default(shared) private(i, j)
+            do i = 1, n_rows
+                rhs_eff(i) = w(i)*rhs(i)
+                !$omp simd
+                do j = 1, n_cols
+                    phi_eff(i, j) = w(i)*phi(i, j)
+                end do
+            end do
+            !$omp end parallel do
+
+            call cgls_dense_core(phi_eff, rhs_eff, x, kmax, atol)
+
+            deallocate(phi_eff, rhs_eff)
+        else
+            call cgls_dense_core(phi, rhs, x, kmax, atol)
+        end if
+    end subroutine cgls_dense_solve
+
+end module cgls_dense