diff --git a/.gitignore b/.gitignore
index b9749d00..88fd62a5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,5 +66,6 @@ lib/
 .venv
 dist/
 test/**/*.png
+bench_neo_bspline_*.png
 version.f90
 Testing/Temporary/CTestCostData.txt
diff --git a/src/interpolate/CMakeLists.txt b/src/interpolate/CMakeLists.txt
index d6eb1f2c..9c8eefc1 100644
--- a/src/interpolate/CMakeLists.txt
+++ b/src/interpolate/CMakeLists.txt
@@ -1,6 +1,10 @@
 add_library(interpolate STATIC
     ../spl_three_to_five.f90
     acc_spline_build_order5.f90
+    neo_bspline_base.f90
+    neo_bspline_interp.f90
+    neo_bspline_lsq.f90
+    neo_bspline.f90
     batch_interpolate_types.f90
     batch_interpolate_1d.f90
     batch_interpolate_2d.f90
diff --git a/src/interpolate/neo_bspline.f90 b/src/interpolate/neo_bspline.f90
new file mode 100644
index 00000000..5263f5a3
--- /dev/null
+++ b/src/interpolate/neo_bspline.f90
@@ -0,0 +1,11 @@
+module neo_bspline
+    !! Umbrella module for B-spline functionality.
+    !! Re-exports types and procedures from submodules:
+    !!   - neo_bspline_base: Core types, init, eval, basis functions
+    !!   - neo_bspline_interp: Direct interpolation (collocation solve)
+    !!   - neo_bspline_lsq: Least-squares fitting (CGLS solver)
+    use neo_bspline_base
+    use neo_bspline_interp
+    use neo_bspline_lsq
+    implicit none
+end module neo_bspline
diff --git a/src/interpolate/neo_bspline_base.f90 b/src/interpolate/neo_bspline_base.f90
new file mode 100644
index 00000000..02057c29
--- /dev/null
+++ b/src/interpolate/neo_bspline_base.f90
@@ -0,0 +1,213 @@
+module neo_bspline_base
+    !! Core B-spline types and basis function evaluation.
+    !! No external dependencies beyond intrinsics.
+    use, intrinsic :: iso_fortran_env, only: dp => real64
+    implicit none
+    private
+
+    type, public :: bspline_1d
+        integer :: degree
+        integer :: n_ctrl
+        real(dp), allocatable :: knots(:)
+        real(dp) :: x_min, x_max
+    end type bspline_1d
+
+    type, public :: bspline_2d
+        type(bspline_1d) :: sx, sy
+    end type bspline_2d
+
+    type, public :: bspline_3d
+        type(bspline_1d) :: sx, sy, sz
+    end type bspline_3d
+
+    public :: bspline_1d_init_uniform, bspline_2d_init_uniform, bspline_3d_init_uniform
+    public :: bspline_1d_eval, bspline_2d_eval, bspline_3d_eval
+    public :: find_span, basis_funs
+
+contains
+
+    subroutine bspline_1d_init_uniform(spl, degree, n_ctrl, x_min, x_max)
+        type(bspline_1d), intent(out) :: spl
+        integer, intent(in) :: degree, n_ctrl
+        real(dp), intent(in) :: x_min, x_max
+        integer :: p, n, m, i
+        real(dp) :: h
+
+        p = degree
+        n = n_ctrl
+        m = n + p + 1
+        spl%degree = p
+        spl%n_ctrl = n
+        spl%x_min = x_min
+        spl%x_max = x_max
+        allocate(spl%knots(m))
+        h = (x_max - x_min)/real(n - p, dp)
+        do i = 1, m
+            if (i <= p + 1) then
+                spl%knots(i) = x_min
+            else if (i >= m - p) then
+                spl%knots(i) = x_max
+            else
+                spl%knots(i) = x_min + real(i - (p + 1), dp)*h
+            end if
+        end do
+    end subroutine bspline_1d_init_uniform
+
+
+    subroutine bspline_2d_init_uniform(spl, degree, n_ctrl, x_min, x_max)
+        type(bspline_2d), intent(out) :: spl
+        integer, intent(in) :: degree(2), n_ctrl(2)
+        real(dp), intent(in) :: x_min(2), x_max(2)
+
+        call bspline_1d_init_uniform(spl%sx, degree(1), n_ctrl(1), x_min(1), &
+            x_max(1))
+        call bspline_1d_init_uniform(spl%sy, degree(2), n_ctrl(2), x_min(2), &
+            x_max(2))
+    end subroutine bspline_2d_init_uniform
+
+
+    subroutine bspline_3d_init_uniform(spl, degree, n_ctrl, x_min, x_max)
+        type(bspline_3d), intent(out) :: spl
+        integer, intent(in) :: degree(3), n_ctrl(3)
+        real(dp), intent(in) :: x_min(3), x_max(3)
+
+        call bspline_1d_init_uniform(spl%sx, degree(1), n_ctrl(1), x_min(1), &
+            x_max(1))
+        call bspline_1d_init_uniform(spl%sy, degree(2), n_ctrl(2), x_min(2), &
+            x_max(2))
+        call bspline_1d_init_uniform(spl%sz, degree(3), n_ctrl(3), x_min(3), &
+            x_max(3))
+    end subroutine bspline_3d_init_uniform
+
+
+    subroutine find_span(spl, x, span)
+        type(bspline_1d), intent(in) :: spl
+        real(dp), intent(in) :: x
+        integer, intent(out) :: span
+        integer :: low, high, mid, p, n, m
+        real(dp) :: xx
+
+        p = spl%degree
+        n = spl%n_ctrl
+        m = n + p + 1
+        xx = min(max(x, spl%x_min), spl%x_max)
+        if (xx >= spl%knots(m - p)) then
+            span = n
+            return
+        end if
+        low = p + 1
+        high = n + 1
+        do
+            mid = (low + high)/2
+            if (xx < spl%knots(mid)) then
+                high = mid
+            else if (xx >= spl%knots(mid + 1)) then
+                low = mid
+            else
+                span = mid
+                exit
+            end if
+        end do
+    end subroutine find_span
+
+
+    subroutine basis_funs(spl, span, x, N)
+        type(bspline_1d), intent(in) :: spl
+        integer, intent(in) :: span
+        real(dp), intent(in) :: x
+        real(dp), intent(out) :: N(0:)
+        integer :: p, j, r
+        real(dp) :: left(0:spl%degree), right(0:spl%degree), saved, temp
+
+        p = spl%degree
+        N(0) = 1.0_dp
+        do j = 1, p
+            left(j) = x - spl%knots(span + 1 - j)
+            right(j) = spl%knots(span + j) - x
+            saved = 0.0_dp
+            do r = 0, j - 1
+                temp = N(r)/(right(r + 1) + left(j - r))
+                N(r) = saved + right(r + 1)*temp
+                saved = left(j - r)*temp
+            end do
+            N(j) = saved
+        end do
+    end subroutine basis_funs
+
+
+    subroutine bspline_1d_eval(spl, coeff, x, y)
+        type(bspline_1d), intent(in) :: spl
+        real(dp), intent(in) :: coeff(:)
+        real(dp), intent(in) :: x
+        real(dp), intent(out) :: y
+        integer :: span, j, p
+        real(dp) :: N(0:spl%degree)
+
+        p = spl%degree
+        call find_span(spl, x, span)
+        call basis_funs(spl, span, x, N)
+        y = 0.0_dp
+        !$omp simd reduction(+:y)
+        do j = 0, p
+            y = y + N(j)*coeff(span - p + j)
+        end do
+    end subroutine bspline_1d_eval
+
+
+    subroutine bspline_2d_eval(spl, coeff, x, y)
+        type(bspline_2d), intent(in) :: spl
+        real(dp), intent(in) :: coeff(:,:)
+        real(dp), intent(in) :: x(2)
+        real(dp), intent(out) :: y
+        integer :: spanx, spany, px, py, a, b, ix, iy
+        real(dp) :: Nx(0:spl%sx%degree), Ny(0:spl%sy%degree)
+
+        px = spl%sx%degree
+        py = spl%sy%degree
+        call find_span(spl%sx, x(1), spanx)
+        call basis_funs(spl%sx, spanx, x(1), Nx)
+        call find_span(spl%sy, x(2), spany)
+        call basis_funs(spl%sy, spany, x(2), Ny)
+        y = 0.0_dp
+        do a = 0, px
+            ix = spanx - px + a
+            do b = 0, py
+                iy = spany - py + b
+                y = y + Nx(a)*Ny(b)*coeff(ix, iy)
+            end do
+        end do
+    end subroutine bspline_2d_eval
+
+
+    subroutine bspline_3d_eval(spl, coeff, x, y)
+        type(bspline_3d), intent(in) :: spl
+        real(dp), intent(in) :: coeff(:,:,:)
+        real(dp), intent(in) :: x(3)
+        real(dp), intent(out) :: y
+        integer :: spanx, spany, spanz, px, py, pz, a, b, c, ix, iy, iz
+        real(dp) :: Nx(0:spl%sx%degree), Ny(0:spl%sy%degree)
+        real(dp) :: Nz(0:spl%sz%degree)
+
+        px = spl%sx%degree
+        py = spl%sy%degree
+        pz = spl%sz%degree
+        call find_span(spl%sx, x(1), spanx)
+        call basis_funs(spl%sx, spanx, x(1), Nx)
+        call find_span(spl%sy, x(2), spany)
+        call basis_funs(spl%sy, spany, x(2), Ny)
+        call find_span(spl%sz, x(3), spanz)
+        call basis_funs(spl%sz, spanz, x(3), Nz)
+        y = 0.0_dp
+        do a = 0, px
+            ix = spanx - px + a
+            do b = 0, py
+                iy = spany - py + b
+                do c = 0, pz
+                    iz = spanz - pz + c
+                    y = y + Nx(a)*Ny(b)*Nz(c)*coeff(ix, iy, iz)
+                end do
+            end do
+        end do
+    end subroutine bspline_3d_eval
+
+end module neo_bspline_base
diff --git a/src/interpolate/neo_bspline_interp.f90 b/src/interpolate/neo_bspline_interp.f90
new file mode 100644
index 00000000..9be28a60
--- /dev/null
+++ b/src/interpolate/neo_bspline_interp.f90
@@ -0,0 +1,218 @@
+module neo_bspline_interp
+    !! Direct B-spline interpolation via collocation.
+    !! Solves the linear system A*c = f where A_ij = B_j(x_i).
+    !! Uses separable tensor-product structure for efficiency in 2D/3D.
+    use, intrinsic :: iso_fortran_env, only: dp => real64
+    use neo_bspline_base, only: bspline_1d, bspline_2d, bspline_3d, &
+        find_span, basis_funs
+    implicit none
+    private
+
+    interface
+        subroutine dgbsv(n, kl, ku, nrhs, ab, ldab, ipiv, b, ldb, info)
+            import :: dp
+            integer :: n, kl, ku, nrhs, ldab, ldb, info
+            integer :: ipiv(*)
+            real(dp) :: ab(ldab, *), b(ldb, *)
+        end subroutine dgbsv
+    end interface
+
+    public :: bspline_1d_interp, bspline_2d_interp, bspline_3d_interp
+
+contains
+
+    subroutine bspline_1d_interp(spl, x_data, f_data, coeff)
+        !! Direct 1D B-spline interpolation: solve collocation system A*c = f.
+        type(bspline_1d), intent(in) :: spl
+        real(dp), intent(in) :: x_data(:)
+        real(dp), intent(in) :: f_data(:)
+        real(dp), intent(out) :: coeff(:)
+
+        integer :: n, p, i, j, span, info, kl, ku, ldab
+        real(dp), allocatable :: ab(:,:), work_f(:), Nvals(:)
+        integer, allocatable :: ipiv(:)
+
+        n = size(x_data)
+        p = spl%degree
+        kl = p
+        ku = p
+        ldab = 2*kl + ku + 1
+        allocate(ab(ldab, n), work_f(n), ipiv(n), Nvals(0:p))
+        ab = 0.0_dp
+
+        do i = 1, n
+            call find_span(spl, x_data(i), span)
+            call basis_funs(spl, span, x_data(i), Nvals)
+            do j = 0, p
+                ab(kl + ku + 1 + j, span - p + j) = Nvals(j)
+            end do
+        end do
+
+        work_f = f_data
+        call dgbsv(n, kl, ku, 1, ab, ldab, ipiv, work_f, n, info)
+        if (info /= 0) then
+            coeff = 0.0_dp
+            return
+        end if
+        coeff = work_f
+    end subroutine bspline_1d_interp
+
+
+    subroutine bspline_2d_interp(spl, x1, x2, f_grid, coeff)
+        !! Direct 2D B-spline interpolation using separable tensor product.
+        !! Solves dimension-by-dimension: first along x1, then along x2.
+        type(bspline_2d), intent(in) :: spl
+        real(dp), intent(in) :: x1(:), x2(:)
+        real(dp), intent(in) :: f_grid(:,:)
+        real(dp), intent(out) :: coeff(:,:)
+
+        integer :: n1, n2, i, j, span, p1, p2, info, pmax
+        real(dp), allocatable :: A1(:,:), A2(:,:), temp(:,:), work(:), Nvals(:)
+        integer, allocatable :: ipiv(:)
+
+        n1 = size(x1)
+        n2 = size(x2)
+        p1 = spl%sx%degree
+        p2 = spl%sy%degree
+        pmax = max(p1, p2)
+        allocate(A1(n1, n1), A2(n2, n2), temp(n1, n2), ipiv(max(n1, n2)))
+        allocate(work(max(n1, n2)), Nvals(0:pmax))
+
+        A1 = 0.0_dp
+        do i = 1, n1
+            call find_span(spl%sx, x1(i), span)
+            call basis_funs(spl%sx, span, x1(i), Nvals(0:p1))
+            do j = 0, p1
+                A1(i, span - p1 + j) = Nvals(j)
+            end do
+        end do
+
+        A2 = 0.0_dp
+        do i = 1, n2
+            call find_span(spl%sy, x2(i), span)
+            call basis_funs(spl%sy, span, x2(i), Nvals(0:p2))
+            do j = 0, p2
+                A2(i, span - p2 + j) = Nvals(j)
+            end do
+        end do
+
+        temp = f_grid
+        call dgetrf(n1, n1, A1, n1, ipiv, info)
+        if (info /= 0) then
+            coeff = 0.0_dp
+            return
+        end if
+        do j = 1, n2
+            work(1:n1) = temp(:, j)
+            call dgetrs('N', n1, 1, A1, n1, ipiv, work, n1, info)
+            temp(:, j) = work(1:n1)
+        end do
+
+        call dgetrf(n2, n2, A2, n2, ipiv, info)
+        if (info /= 0) then
+            coeff = 0.0_dp
+            return
+        end if
+        do i = 1, n1
+            work(1:n2) = temp(i, :)
+            call dgetrs('N', n2, 1, A2, n2, ipiv, work, n2, info)
+            coeff(i, :) = work(1:n2)
+        end do
+    end subroutine bspline_2d_interp
+
+
+    subroutine bspline_3d_interp(spl, x1, x2, x3, f_grid, coeff)
+        !! Direct 3D B-spline interpolation using separable tensor product.
+        !! Solves dimension-by-dimension: x1 -> x2 -> x3.
+        type(bspline_3d), intent(in) :: spl
+        real(dp), intent(in) :: x1(:), x2(:), x3(:)
+        real(dp), intent(in) :: f_grid(:,:,:)
+        real(dp), intent(out) :: coeff(:,:,:)
+
+        integer :: n1, n2, n3, i, j, k, span, p1, p2, p3, info, pmax, nmax
+        real(dp), allocatable :: A1(:,:), A2(:,:), A3(:,:), temp(:,:,:)
+        real(dp), allocatable :: Nvals(:)
+        integer, allocatable :: ipiv1(:), ipiv2(:), ipiv3(:)
+
+        n1 = size(x1)
+        n2 = size(x2)
+        n3 = size(x3)
+        p1 = spl%sx%degree
+        p2 = spl%sy%degree
+        p3 = spl%sz%degree
+        pmax = max(p1, p2, p3)
+        nmax = max(n1, n2, n3)
+        allocate(A1(n1, n1), A2(n2, n2), A3(n3, n3))
+        allocate(temp(n1, n2, n3), ipiv1(n1), ipiv2(n2), ipiv3(n3))
+        allocate(Nvals(0:pmax))
+
+        A1 = 0.0_dp
+        do i = 1, n1
+            call find_span(spl%sx, x1(i), span)
+            call basis_funs(spl%sx, span, x1(i), Nvals(0:p1))
+            do j = 0, p1
+                A1(i, span - p1 + j) = Nvals(j)
+            end do
+        end do
+
+        A2 = 0.0_dp
+        do i = 1, n2
+            call find_span(spl%sy, x2(i), span)
+            call basis_funs(spl%sy, span, x2(i), Nvals(0:p2))
+            do j = 0, p2
+                A2(i, span - p2 + j) = Nvals(j)
+            end do
+        end do
+
+        A3 = 0.0_dp
+        do i = 1, n3
+            call find_span(spl%sz, x3(i), span)
+            call basis_funs(spl%sz, span, x3(i), Nvals(0:p3))
+            do j = 0, p3
+                A3(i, span - p3 + j) = Nvals(j)
+            end do
+        end do
+
+        temp = f_grid
+
+        call dgetrf(n1, n1, A1, n1, ipiv1, info)
+        if (info /= 0) then
+            coeff = 0.0_dp
+            return
+        end if
+        call dgetrf(n2, n2, A2, n2, ipiv2, info)
+        if (info /= 0) then
+            coeff = 0.0_dp
+            return
+        end if
+        call dgetrf(n3, n3, A3, n3, ipiv3, info)
+        if (info /= 0) then
+            coeff = 0.0_dp
+            return
+        end if
+
+        !$omp parallel do collapse(2) private(j, k) schedule(static)
+        do k = 1, n3
+            do j = 1, n2
+                call dgetrs('N', n1, 1, A1, n1, ipiv1, temp(:, j, k), n1, info)
+            end do
+        end do
+
+        !$omp parallel do collapse(2) private(i, k) schedule(static)
+        do k = 1, n3
+            do i = 1, n1
+                call dgetrs('N', n2, 1, A2, n2, ipiv2, temp(i, :, k), n2, info)
+            end do
+        end do
+
+        !$omp parallel do collapse(2) private(i, j) schedule(static)
+        do j = 1, n2
+            do i = 1, n1
+                call dgetrs('N', n3, 1, A3, n3, ipiv3, temp(i, j, :), n3, info)
+            end do
+        end do
+
+        coeff = temp
+    end subroutine bspline_3d_interp
+
+end module neo_bspline_interp
diff --git a/src/interpolate/neo_bspline_lsq.f90 b/src/interpolate/neo_bspline_lsq.f90
new file mode 100644
index 00000000..611a506d
--- /dev/null
+++ b/src/interpolate/neo_bspline_lsq.f90
@@ -0,0 +1,516 @@
+module neo_bspline_lsq
+    !! Least-squares B-spline fitting via matrix-free CGLS.
+    !! Exploits regular grid structure with separable basis caching.
+    use, intrinsic :: iso_fortran_env, only: dp => real64
+    use neo_bspline_base, only: bspline_1d, bspline_2d, bspline_3d, &
+        find_span, basis_funs
+    implicit none
+    private
+
+    type :: grid_cache_1d
+        integer :: ng1, px
+        integer, allocatable :: span1(:)
+        real(dp), allocatable :: B1(:,:)
+    end type grid_cache_1d
+
+    type :: grid_cache_2d
+        integer :: ng1, ng2, px, py
+        integer, allocatable :: span1(:), span2(:)
+        real(dp), allocatable :: B1(:,:), B2(:,:)
+    end type grid_cache_2d
+
+    type :: grid_cache_3d
+        integer :: ng1, ng2, ng3, px, py, pz
+        integer, allocatable :: span1(:), span2(:), span3(:)
+        real(dp), allocatable :: B1(:,:), B2(:,:), B3(:,:)
+    end type grid_cache_3d
+
+    public :: bspline_1d_lsq_cgls, bspline_2d_lsq_cgls, bspline_3d_lsq_cgls
+
+contains
+
+    subroutine precompute_grid_1d(spl, x1, cache)
+        type(bspline_1d), intent(in) :: spl
+        real(dp), intent(in) :: x1(:)
+        type(grid_cache_1d), intent(out) :: cache
+        integer :: i, n1, px
+
+        n1 = size(x1)
+        px = spl%degree
+        cache%ng1 = n1
+        cache%px = px
+        allocate(cache%span1(n1), cache%B1(0:px, n1))
+
+        !$omp parallel do simd private(i) schedule(simd:static)
+        do i = 1, n1
+            call find_span(spl, x1(i), cache%span1(i))
+        end do
+
+        !$omp parallel do private(i) schedule(static)
+        do i = 1, n1
+            call basis_funs(spl, cache%span1(i), x1(i), cache%B1(:, i))
+        end do
+    end subroutine precompute_grid_1d
+
+
+    subroutine precompute_grid_2d(spl, x1, x2, cache)
+        type(bspline_2d), intent(in) :: spl
+        real(dp), intent(in) :: x1(:), x2(:)
+        type(grid_cache_2d), intent(out) :: cache
+        integer :: i, n1, n2, px, py
+
+        n1 = size(x1)
+        n2 = size(x2)
+        px = spl%sx%degree
+        py = spl%sy%degree
+        cache%ng1 = n1
+        cache%ng2 = n2
+        cache%px = px
+        cache%py = py
+        allocate(cache%span1(n1), cache%span2(n2))
+        allocate(cache%B1(0:px, n1), cache%B2(0:py, n2))
+
+        !$omp parallel do simd private(i) schedule(simd:static)
+        do i = 1, n1
+            call find_span(spl%sx, x1(i), cache%span1(i))
+        end do
+        !$omp parallel do private(i) schedule(static)
+        do i = 1, n1
+            call basis_funs(spl%sx, cache%span1(i), x1(i), cache%B1(:, i))
+        end do
+
+        !$omp parallel do simd private(i) schedule(simd:static)
+        do i = 1, n2
+            call find_span(spl%sy, x2(i), cache%span2(i))
+        end do
+        !$omp parallel do private(i) schedule(static)
+        do i = 1, n2
+            call basis_funs(spl%sy, cache%span2(i), x2(i), cache%B2(:, i))
+        end do
+    end subroutine precompute_grid_2d
+
+
+    subroutine precompute_grid_3d(spl, x1, x2, x3, cache)
+        type(bspline_3d), intent(in) :: spl
+        real(dp), intent(in) :: x1(:), x2(:), x3(:)
+        type(grid_cache_3d), intent(out) :: cache
+        integer :: i, n1, n2, n3, px, py, pz
+
+        n1 = size(x1)
+        n2 = size(x2)
+        n3 = size(x3)
+        px = spl%sx%degree
+        py = spl%sy%degree
+        pz = spl%sz%degree
+        cache%ng1 = n1
+        cache%ng2 = n2
+        cache%ng3 = n3
+        cache%px = px
+        cache%py = py
+        cache%pz = pz
+        allocate(cache%span1(n1), cache%span2(n2), cache%span3(n3))
+        allocate(cache%B1(0:px, n1), cache%B2(0:py, n2), cache%B3(0:pz, n3))
+
+        !$omp parallel do simd private(i) schedule(simd:static)
+        do i = 1, n1
+            call find_span(spl%sx, x1(i), cache%span1(i))
+        end do
+        !$omp parallel do private(i) schedule(static)
+        do i = 1, n1
+            call basis_funs(spl%sx, cache%span1(i), x1(i), cache%B1(:, i))
+        end do
+
+        !$omp parallel do simd private(i) schedule(simd:static)
+        do i = 1, n2
+            call find_span(spl%sy, x2(i), cache%span2(i))
+        end do
+        !$omp parallel do private(i) schedule(static)
+        do i = 1, n2
+            call basis_funs(spl%sy, cache%span2(i), x2(i), cache%B2(:, i))
+        end do
+
+        !$omp parallel do simd private(i) schedule(simd:static)
+        do i = 1, n3
+            call find_span(spl%sz, x3(i), cache%span3(i))
+        end do
+        !$omp parallel do private(i) schedule(static)
+        do i = 1, n3
+            call basis_funs(spl%sz, cache%span3(i), x3(i), cache%B3(:, i))
+        end do
+    end subroutine precompute_grid_3d
+
+
+    subroutine apply_A_1d(cache, coeff, f)
+        type(grid_cache_1d), intent(in) :: cache
+        real(dp), intent(in) :: coeff(:)
+        real(dp), intent(out) :: f(:)
+        integer :: i, a, ix, n1, px, s1
+
+        n1 = cache%ng1
+        px = cache%px
+        !$omp parallel do simd private(i, a, ix, s1) schedule(simd:static)
+        do i = 1, n1
+            s1 = cache%span1(i)
+            f(i) = 0.0_dp
+            do a = 0, px
+                ix = s1 - px + a
+                f(i) = f(i) + cache%B1(a, i)*coeff(ix)
+            end do
+        end do
+    end subroutine apply_A_1d
+
+
+    subroutine apply_AT_1d(cache, r, g, n_ctrl)
+        type(grid_cache_1d), intent(in) :: cache
+        real(dp), intent(in) :: r(:)
+        real(dp), intent(out) :: g(:)
+        integer, intent(in) :: n_ctrl
+        integer :: i, a, ix, n1, px, s1
+        real(dp), allocatable :: g_local(:)
+
+        n1 = cache%ng1
+        px = cache%px
+        g = 0.0_dp
+
+        !$omp parallel private(g_local, i, a, ix, s1) default(shared)
+        allocate(g_local(n_ctrl))
+        g_local = 0.0_dp
+        !$omp do schedule(static)
+        do i = 1, n1
+            s1 = cache%span1(i)
+            do a = 0, px
+                ix = s1 - px + a
+                g_local(ix) = g_local(ix) + cache%B1(a, i)*r(i)
+            end do
+        end do
+        !$omp end do
+        !$omp critical
+        g = g + g_local
+        !$omp end critical
+        deallocate(g_local)
+        !$omp end parallel
+    end subroutine apply_AT_1d
+
+
+    subroutine apply_A_2d(cache, coeff, f)
+        type(grid_cache_2d), intent(in) :: cache
+        real(dp), intent(in) :: coeff(:,:)
+        real(dp), intent(out) :: f(:,:)
+        integer :: i1, i2, a, b, ix, iy, n1, n2, px, py, s1, s2
+        real(dp) :: val
+
+        n1 = cache%ng1
+        n2 = cache%ng2
+        px = cache%px
+        py = cache%py
+
+        !$omp parallel do collapse(2) private(i1, i2, a, b, ix, iy, s1, s2, val) &
+        !$omp& schedule(static)
+        do i2 = 1, n2
+            do i1 = 1, n1
+                s1 = cache%span1(i1)
+                s2 = cache%span2(i2)
+                val = 0.0_dp
+                do a = 0, px
+                    ix = s1 - px + a
+                    do b = 0, py
+                        iy = s2 - py + b
+                        val = val + cache%B1(a, i1)*cache%B2(b, i2)*coeff(ix, iy)
+                    end do
+                end do
+                f(i1, i2) = val
+            end do
+        end do
+    end subroutine apply_A_2d
+
+
+    subroutine apply_AT_2d(cache, r, g, nx, ny)
+        type(grid_cache_2d), intent(in) :: cache
+        real(dp), intent(in) :: r(:,:)
+        real(dp), intent(out) :: g(:,:)
+        integer, intent(in) :: nx, ny
+        integer :: i1, i2, a, b, ix, iy, n1, n2, px, py, s1, s2
+        real(dp), allocatable :: g_local(:,:)
+
+        n1 = cache%ng1
+        n2 = cache%ng2
+        px = cache%px
+        py = cache%py
+        g = 0.0_dp
+
+        !$omp parallel private(g_local, i1, i2, a, b, ix, iy, s1, s2) default(shared)
+        allocate(g_local(nx, ny))
+        g_local = 0.0_dp
+        !$omp do collapse(2) schedule(static)
+        do i2 = 1, n2
+            do i1 = 1, n1
+                s1 = cache%span1(i1)
+                s2 = cache%span2(i2)
+                do a = 0, px
+                    ix = s1 - px + a
+                    do b = 0, py
+                        iy = s2 - py + b
+                        g_local(ix, iy) = g_local(ix, iy) + &
+                            cache%B1(a, i1)*cache%B2(b, i2)*r(i1, i2)
+                    end do
+                end do
+            end do
+        end do
+        !$omp end do
+        !$omp critical
+        g = g + g_local
+        !$omp end critical
+        deallocate(g_local)
+        !$omp end parallel
+    end subroutine apply_AT_2d
+
+
+    subroutine apply_A_3d(cache, coeff, f)
+        type(grid_cache_3d), intent(in) :: cache
+        real(dp), intent(in) :: coeff(:,:,:)
+        real(dp), intent(out) :: f(:,:,:)
+        integer :: i1, i2, i3, a, b, c, ix, iy, iz
+        integer :: n1, n2, n3, px, py, pz, s1, s2, s3
+        real(dp) :: val, Nab
+
+        n1 = cache%ng1
+        n2 = cache%ng2
+        n3 = cache%ng3
+        px = cache%px
+        py = cache%py
+        pz = cache%pz
+
+        !$omp parallel do collapse(3) &
+        !$omp& private(i1, i2, i3, a, b, c, ix, iy, iz, s1, s2, s3, val, Nab) &
+        !$omp& schedule(static)
+        do i3 = 1, n3
+            do i2 = 1, n2
+                do i1 = 1, n1
+                    s1 = cache%span1(i1)
+                    s2 = cache%span2(i2)
+                    s3 = cache%span3(i3)
+                    val = 0.0_dp
+                    do a = 0, px
+                        ix = s1 - px + a
+                        do b = 0, py
+                            iy = s2 - py + b
+                            Nab = cache%B1(a, i1)*cache%B2(b, i2)
+                            !$omp simd reduction(+:val)
+                            do c = 0, pz
+                                iz = s3 - pz + c
+                                val = val + Nab*cache%B3(c, i3)*coeff(ix, iy, iz)
+                            end do
+                        end do
+                    end do
+                    f(i1, i2, i3) = val
+                end do
+            end do
+        end do
+    end subroutine apply_A_3d
+
+
+    subroutine apply_AT_3d(cache, r, g, nx, ny, nz)
+        type(grid_cache_3d), intent(in) :: cache
+        real(dp), intent(in) :: r(:,:,:)
+        real(dp), intent(out) :: g(:,:,:)
+        integer, intent(in) :: nx, ny, nz
+        integer :: i1, i2, i3, a, b, c, ix, iy, iz
+        integer :: n1, n2, n3, px, py, pz, s1, s2, s3
+        real(dp) :: ri, Nab
+        real(dp), allocatable :: g_local(:,:,:)
+
+        n1 = cache%ng1
+        n2 = cache%ng2
+        n3 = cache%ng3
+        px = cache%px
+        py = cache%py
+        pz = cache%pz
+        g = 0.0_dp
+
+        !$omp parallel &
+        !$omp& private(g_local, i1, i2, i3, a, b, c, ix, iy, iz, s1, s2, s3, ri, Nab) &
+        !$omp& default(shared)
+        allocate(g_local(nx, ny, nz))
+        g_local = 0.0_dp
+        !$omp do collapse(3) schedule(static)
+        do i3 = 1, n3
+            do i2 = 1, n2
+                do i1 = 1, n1
+                    s1 = cache%span1(i1)
+                    s2 = cache%span2(i2)
+                    s3 = cache%span3(i3)
+                    ri = r(i1, i2, i3)
+                    do a = 0, px
+                        ix = s1 - px + a
+                        do b = 0, py
+                            iy = s2 - py + b
+                            Nab = cache%B1(a, i1)*cache%B2(b, i2)*ri
+                            do c = 0, pz
+                                iz = s3 - pz + c
+                                g_local(ix, iy, iz) = g_local(ix, iy, iz) + &
+                                    Nab*cache%B3(c, i3)
+                            end do
+                        end do
+                    end do
+                end do
+            end do
+        end do
+        !$omp end do
+        !$omp critical
+        g = g + g_local
+        !$omp end critical
+        deallocate(g_local)
+        !$omp end parallel
+    end subroutine apply_AT_3d
+
+
+    subroutine bspline_1d_lsq_cgls(spl, x_data, f_data, coeff, max_iter, tol)
+        type(bspline_1d), intent(in) :: spl
+        real(dp), intent(in) :: x_data(:)
+        real(dp), intent(in) :: f_data(:)
+        real(dp), intent(inout) :: coeff(:)
+        integer, intent(in), optional :: max_iter
+        real(dp), intent(in), optional :: tol
+
+        type(grid_cache_1d) :: cache
+        integer :: n_data, n_ctrl, k, kmax
+        real(dp) :: atol, gamma, gamma_new, alpha, beta, denom, rhs_norm
+        real(dp), allocatable :: r(:), q(:), s(:), p(:)
+
+        n_data = size(x_data)
+        n_ctrl = spl%n_ctrl
+        kmax = 200
+        atol = 1.0d-10
+        if (present(max_iter)) kmax = max_iter
+        if (present(tol)) atol = tol
+
+        call precompute_grid_1d(spl, x_data, cache)
+
+        allocate(r(n_data), q(n_data), s(n_ctrl), p(n_ctrl))
+        coeff = 0.0_dp
+        r = f_data
+        call apply_AT_1d(cache, r, s, n_ctrl)
+        p = s
+        gamma = dot_product(s, s)
+        rhs_norm = sqrt(dot_product(f_data, f_data))
+        if (rhs_norm == 0.0_dp) return
+
+        do k = 1, kmax
+            call apply_A_1d(cache, p, q)
+            denom = dot_product(q, q)
+            if (denom <= 0.0_dp) exit
+            alpha = gamma/denom
+            coeff = coeff + alpha*p
+            r = r - alpha*q
+            call apply_AT_1d(cache, r, s, n_ctrl)
+            gamma_new = dot_product(s, s)
+            if (gamma_new <= (atol*rhs_norm)**2) exit
+            beta = gamma_new/gamma
+            gamma = gamma_new
+            p = s + beta*p
+        end do
+    end subroutine bspline_1d_lsq_cgls
+
+
+    subroutine bspline_2d_lsq_cgls(spl, x1, x2, f_grid, coeff, max_iter, tol)
+        type(bspline_2d), intent(in) :: spl
+        real(dp), intent(in) :: x1(:), x2(:)
+        real(dp), intent(in) :: f_grid(:,:)
+        real(dp), intent(inout) :: coeff(:,:)
+        integer, intent(in), optional :: max_iter
+        real(dp), intent(in), optional :: tol
+
+        type(grid_cache_2d) :: cache
+        integer :: n1, n2, nx, ny, k, kmax
+        real(dp) :: atol, gamma, gamma_new, alpha, beta, denom, rhs_norm
+        real(dp), allocatable :: r(:,:), q(:,:), s(:,:), p(:,:)
+
+        n1 = size(x1)
+        n2 = size(x2)
+        nx = spl%sx%n_ctrl
+        ny = spl%sy%n_ctrl
+        kmax = 400
+        atol = 1.0d-10
+        if (present(max_iter)) kmax = max_iter
+        if (present(tol)) atol = tol
+
+        call precompute_grid_2d(spl, x1, x2, cache)
+
+        allocate(r(n1, n2), q(n1, n2), s(nx, ny), p(nx, ny))
+        coeff = 0.0_dp
+        r = f_grid
+        call apply_AT_2d(cache, r, s, nx, ny)
+        p = s
+        gamma = sum(s*s)
+        rhs_norm = sqrt(sum(f_grid*f_grid))
+        if (rhs_norm == 0.0_dp) return
+
+        do k = 1, kmax
+            call apply_A_2d(cache, p, q)
+            denom = sum(q*q)
+            if (denom <= 0.0_dp) exit
+            alpha = gamma/denom
+            coeff = coeff + alpha*p
+            r = r - alpha*q
+            call apply_AT_2d(cache, r, s, nx, ny)
+            gamma_new = sum(s*s)
+            if (gamma_new <= (atol*rhs_norm)**2) exit
+            beta = gamma_new/gamma
+            gamma = gamma_new
+            p = s + beta*p
+        end do
+    end subroutine bspline_2d_lsq_cgls
+
+
+    subroutine bspline_3d_lsq_cgls(spl, x1, x2, x3, f_grid, coeff, max_iter, tol)
+        type(bspline_3d), intent(in) :: spl
+        real(dp), intent(in) :: x1(:), x2(:), x3(:)
+        real(dp), intent(in) :: f_grid(:,:,:)
+        real(dp), intent(inout) :: coeff(:,:,:)
+        integer, intent(in), optional :: max_iter
+        real(dp), intent(in), optional :: tol
+
+        type(grid_cache_3d) :: cache
+        integer :: n1, n2, n3, nx, ny, nz, k, kmax
+        real(dp) :: atol, gamma, gamma_new, alpha, beta, denom, rhs_norm
+        real(dp), allocatable :: r(:,:,:), q(:,:,:), s(:,:,:), p(:,:,:)
+
+        n1 = size(x1)
+        n2 = size(x2)
+        n3 = size(x3)
+        nx = spl%sx%n_ctrl
+        ny = spl%sy%n_ctrl
+        nz = spl%sz%n_ctrl
+        kmax = 400
+        atol = 1.0d-10
+        if (present(max_iter)) kmax = max_iter
+        if (present(tol)) atol = tol
+
+        call precompute_grid_3d(spl, x1, x2, x3, cache)
+
+        allocate(r(n1, n2, n3), q(n1, n2, n3), s(nx, ny, nz), p(nx, ny, nz))
+        coeff = 0.0_dp
+        r = f_grid
+        call apply_AT_3d(cache, r, s, nx, ny, nz)
+        p = s
+        gamma = sum(s*s)
+        rhs_norm = sqrt(sum(f_grid*f_grid))
+        if (rhs_norm == 0.0_dp) return
+
+        do k = 1, kmax
+            call apply_A_3d(cache, p, q)
+            denom = sum(q*q)
+            if (denom <= 0.0_dp) exit
+            alpha = gamma/denom
+            coeff = coeff + alpha*p
+            r = r - alpha*q
+            call apply_AT_3d(cache, r, s, nx, ny, nz)
+            gamma_new = sum(s*s)
+            if (gamma_new <= (atol*rhs_norm)**2) exit
+            beta = gamma_new/gamma
+            gamma = gamma_new
+            p = s + beta*p
+        end do
+    end subroutine bspline_3d_lsq_cgls
+
+end module neo_bspline_lsq
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 32d82149..1eebd904 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -144,6 +144,12 @@ target_link_libraries(test_rng.x ${COMMON_LIBS})
 add_executable(test_polylag_3.x source/test_polylag_3.f90)
 target_link_libraries(test_polylag_3.x ${COMMON_LIBS} ${MAGFIE_LIB})
 
+add_executable(test_neo_bspline_1d.x source/test_neo_bspline_1d.f90)
+target_link_libraries(test_neo_bspline_1d.x ${COMMON_LIBS})
+add_executable(test_neo_bspline_2d.x source/test_neo_bspline_2d.f90)
+target_link_libraries(test_neo_bspline_2d.x ${COMMON_LIBS})
+add_executable(test_neo_bspline_3d.x source/test_neo_bspline_3d.f90)
+target_link_libraries(test_neo_bspline_3d.x ${COMMON_LIBS})
 add_executable(test_interpolate.x source/test_interpolate.f90)
 target_link_libraries(test_interpolate.x ${COMMON_LIBS})
 
@@ -197,6 +203,9 @@ target_link_libraries(test_nctools_nc_get3.x
   ${COMMON_LIBS}
 )
 
+add_executable(bench_neo_bspline_vs_interpolate.x
+  source/bench_neo_bspline_vs_interpolate.f90)
+target_link_libraries(bench_neo_bspline_vs_interpolate.x ${COMMON_LIBS})
 
 ## Standard tests according to standard libneo Fortran test convention.
 
@@ -213,6 +222,9 @@ add_test(NAME test_spl_three_to_five COMMAND test_spl_three_to_five.x)
 add_test(NAME test_species COMMAND test_species.x)
 add_test(NAME test_rng COMMAND test_rng.x)
 add_test(NAME test_polylag_3 COMMAND test_polylag_3.x)
+add_test(NAME test_neo_bspline_1d COMMAND test_neo_bspline_1d.x)
+add_test(NAME test_neo_bspline_2d COMMAND test_neo_bspline_2d.x)
+add_test(NAME test_neo_bspline_3d COMMAND test_neo_bspline_3d.x)
 add_test(NAME test_interpolate COMMAND test_interpolate.x)
 add_test(NAME test_batch_interpolate COMMAND test_batch_interpolate.x)
 add_test(NAME test_batch_interpolate_many COMMAND test_batch_interpolate_many.x)
diff --git a/test/scripts/plot_neo_bspline_1d.py b/test/scripts/plot_neo_bspline_1d.py
new file mode 100644
index 00000000..4c8a9582
--- /dev/null
+++ b/test/scripts/plot_neo_bspline_1d.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+import argparse
+import pathlib
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Plot 1D neo_bspline LSQ fit vs analytic reference."
+    )
+    parser.add_argument(
+        "--data",
+        type=pathlib.Path,
+        default=pathlib.Path("bspline_1d_lsq.dat"),
+        help="Path to data file written by test_neo_bspline_1d.x",
+    )
+    parser.add_argument(
+        "--output",
+        type=pathlib.Path,
+        default=pathlib.Path("bspline_1d_lsq.png"),
+        help="Output PNG file for the plot.",
+    )
+    args = parser.parse_args()
+
+    data = np.loadtxt(args.data)
+    x = data[:, 0]
+    f_true = data[:, 1]
+    f_fit = data[:, 2]
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.plot(x, f_true, label="analytic", color="k", linewidth=1.5)
+    ax.plot(x, f_fit, label="neo_bspline fit", color="C0", linestyle="--")
+    ax.set_xlabel("x")
+    ax.set_ylabel("f(x)")
+    ax.set_title("neo_bspline 1D LSQ fit")
+    ax.legend()
+    fig.tight_layout()
+    fig.savefig(args.output, dpi=150)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/test/scripts/plot_neo_bspline_2d_circle.py b/test/scripts/plot_neo_bspline_2d_circle.py
new file mode 100644
index 00000000..0c1d109b
--- /dev/null
+++ b/test/scripts/plot_neo_bspline_2d_circle.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python3
+import argparse
+import pathlib
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Plot 2D neo_bspline LSQ fit vs analytic along a circle."
+    )
+    parser.add_argument(
+        "--data",
+        type=pathlib.Path,
+        default=pathlib.Path("bspline_2d_lsq_circle.dat"),
+        help="Path to circle data file written by test_neo_bspline_2d.x",
+    )
+    parser.add_argument(
+        "--output",
+        type=pathlib.Path,
+        default=pathlib.Path("bspline_2d_lsq_circle.png"),
+        help="Output PNG file.",
+    )
+    args = parser.parse_args()
+
+    data = np.loadtxt(args.data)
+    theta = data[:, 0]
+    f_true = data[:, 3]
+    f_fit = data[:, 4]
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.plot(theta, f_true, label="analytic", color="k", linewidth=1.5)
+    ax.plot(theta, f_fit, label="neo_bspline fit", color="C1", linestyle="--")
+    ax.set_xlabel("theta")
+    ax.set_ylabel("f(theta)")
+    ax.set_title("neo_bspline 2D LSQ fit along circle")
+    ax.legend()
+    fig.tight_layout()
+    fig.savefig(args.output, dpi=150)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/test/scripts/plot_neo_bspline_2d_grid.py b/test/scripts/plot_neo_bspline_2d_grid.py
new file mode 100644
index 00000000..bcaf4895
--- /dev/null
+++ b/test/scripts/plot_neo_bspline_2d_grid.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python3
+import argparse
+import pathlib
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Plot 2D neo_bspline LSQ fit error on full grid."
+    )
+    parser.add_argument(
+        "--data",
+        type=pathlib.Path,
+        default=pathlib.Path("bspline_2d_lsq_grid.dat"),
+        help="Path to grid data file written by test_neo_bspline_2d.x",
+    )
+    parser.add_argument(
+        "--output",
+        type=pathlib.Path,
+        default=pathlib.Path("bspline_2d_lsq_grid.png"),
+        help="Output PNG file.",
+    )
+    args = parser.parse_args()
+
+    data = np.loadtxt(args.data)
+    x = data[:, 0]
+    y = data[:, 1]
+    f_true = data[:, 2]
+    f_fit = data[:, 3]
+    err = f_fit - f_true
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    sc = ax.scatter(x, y, c=err, cmap="coolwarm", s=8, edgecolor="none")
+    ax.set_xlabel("x")
+    ax.set_ylabel("y")
+    ax.set_title("neo_bspline 2D LSQ full-grid error (fit - analytic)")
+    cbar = fig.colorbar(sc, ax=ax)
+    cbar.set_label("error")
+    fig.tight_layout()
+    fig.savefig(args.output, dpi=150)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/test/scripts/plot_neo_bspline_3d_slice.py b/test/scripts/plot_neo_bspline_3d_slice.py
new file mode 100644
index 00000000..b7437d62
--- /dev/null
+++ b/test/scripts/plot_neo_bspline_3d_slice.py
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+import argparse
+import pathlib
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Plot 3D neo_bspline LSQ fit error on a mid-plane slice.",
+    )
+    parser.add_argument(
+        "--data",
+        type=pathlib.Path,
+        default=pathlib.Path("bspline_3d_lsq_grid.dat"),
+        help="Path to 3D grid data file written by test_neo_bspline_3d.x",
+    )
+    parser.add_argument(
+        "--output",
+        type=pathlib.Path,
+        default=pathlib.Path("bspline_3d_lsq_slice.png"),
+        help="Output PNG file.",
+    )
+    args = parser.parse_args()
+
+    data = np.loadtxt(args.data)
+    x = data[:, 0]
+    y = data[:, 1]
+    z = data[:, 2]
+    f_true = data[:, 3]
+    f_fit = data[:, 4]
+    err = f_fit - f_true
+
+    z_levels = np.unique(z)
+    z_mid = z_levels[len(z_levels) // 2]
+    mask = np.isclose(z, z_mid)
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    sc = ax.scatter(x[mask], y[mask], c=err[mask], cmap="coolwarm", s=8, edgecolor="none")
+    ax.set_xlabel("x")
+    ax.set_ylabel("y")
+    ax.set_title(f"neo_bspline 3D LSQ mid-plane error (z = {z_mid:.3f})")
+    cbar = fig.colorbar(sc, ax=ax)
+    cbar.set_label("error")
+    fig.tight_layout()
+    fig.savefig(args.output, dpi=150)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/test/scripts/plot_neo_bspline_benchmarks.py b/test/scripts/plot_neo_bspline_benchmarks.py
new file mode 100644
index 00000000..b1e9b568
--- /dev/null
+++ b/test/scripts/plot_neo_bspline_benchmarks.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+import argparse
+import pathlib
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def load_bench(
+    path: pathlib.Path,
+) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
+    data = np.loadtxt(path)
+    n = data[:, 0]
+    t_create_interp = data[:, 1]
+    t_eval_interp = data[:, 2]
+    t_create_bs = data[:, 3]
+    t_eval_bs = data[:, 4]
+    t_create_dir = data[:, 5]
+    t_eval_dir = data[:, 6]
+    return n, t_create_interp, t_eval_interp, t_create_bs, t_eval_bs, t_create_dir, t_eval_dir
+
+
+def plot_dim_create(
+    path: pathlib.Path,
+    title: str,
+    output: pathlib.Path,
+) -> None:
+    n, t_ci, _, t_cb, _, t_cd, _ = load_bench(path)
+
+    # ignore skipped cases (-1) for direct interpolation
+    mask_dir = t_cd > 0
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.loglog(n, t_ci, "o-", label="interp create")
+    ax.loglog(n, t_cb, "s-", label="neo_bspline create")
+    if np.any(mask_dir):
+        ax.loglog(n[mask_dir], t_cd[mask_dir], "^-", label="neo_bspline direct create")
+
+    ax.set_xlabel("number of data points")
+    ax.set_ylabel("time [s]")
+    ax.set_title(title)
+    ax.legend()
+    ax.grid(True, which="both", ls=":")
+
+    fig.tight_layout()
+    fig.savefig(output, dpi=150)
+
+
+def plot_dim_eval(
+    path: pathlib.Path,
+    title: str,
+    output: pathlib.Path,
+) -> None:
+    n, _, t_ei, _, t_eb, _, t_ed = load_bench(path)
+
+    mask_dir = t_ed > 0
+
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.loglog(n, t_ei, "o--", label="interp eval")
+    ax.loglog(n, t_eb, "s--", label="neo_bspline eval")
+    if np.any(mask_dir):
+        ax.loglog(n[mask_dir], t_ed[mask_dir], "^--", label="neo_bspline direct eval")
+
+    ax.set_xlabel("number of data points")
+    ax.set_ylabel("time [s]")
+    ax.set_title(title)
+    ax.legend()
+    ax.grid(True, which="both", ls=":")
+
+    fig.tight_layout()
+    fig.savefig(output, dpi=150)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Plot runtime benchmarks: interpolate vs neo_bspline.",
+    )
+    parser.add_argument(
+        "--data-dir",
+        type=pathlib.Path,
+        default=pathlib.Path("."),
+        help="Directory with bench_neo_bspline_*.dat files.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=pathlib.Path,
+        default=pathlib.Path("."),
+        help="Directory for output PNG files.",
+    )
+    args = parser.parse_args()
+
+    args.output_dir.mkdir(parents=True, exist_ok=True)
+
+    plot_dim_create(
+        args.data_dir / "bench_neo_bspline_1d.dat",
+        "1D create: interpolate vs neo_bspline",
+        args.output_dir / "bench_neo_bspline_1d_create.png",
+    )
+    plot_dim_eval(
+        args.data_dir / "bench_neo_bspline_1d.dat",
+        "1D eval: interpolate vs neo_bspline",
+        args.output_dir / "bench_neo_bspline_1d_eval.png",
+    )
+    plot_dim_create(
+        args.data_dir / "bench_neo_bspline_2d.dat",
+        "2D create: interpolate vs neo_bspline",
+        args.output_dir / "bench_neo_bspline_2d_create.png",
+    )
+    plot_dim_eval(
+        args.data_dir / "bench_neo_bspline_2d.dat",
+        "2D eval: interpolate vs neo_bspline",
+        args.output_dir / "bench_neo_bspline_2d_eval.png",
+    )
+    plot_dim_create(
+        args.data_dir / "bench_neo_bspline_3d.dat",
+        "3D create: interpolate vs neo_bspline",
+        args.output_dir / "bench_neo_bspline_3d_create.png",
+    )
+    plot_dim_eval(
+        args.data_dir / "bench_neo_bspline_3d.dat",
+        "3D eval: interpolate vs neo_bspline",
+        args.output_dir / "bench_neo_bspline_3d_eval.png",
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/source/bench_neo_bspline_vs_interpolate.f90 b/test/source/bench_neo_bspline_vs_interpolate.f90
new file mode 100644
index 00000000..9db7407f
--- /dev/null
+++ b/test/source/bench_neo_bspline_vs_interpolate.f90
@@ -0,0 +1,421 @@
+program bench_neo_bspline_vs_interpolate
+    use, intrinsic :: iso_fortran_env, only: dp_bench => real64
+    use, intrinsic :: iso_c_binding, only: c_int, c_long
+    use interpolate
+    use neo_bspline
+
+    implicit none
+
+    integer, parameter :: N_CASE = 9
+    integer, parameter :: N_TOTAL_LIST(N_CASE) = &
+        [10, 30, 100, 300, 1000, 3000, 10000, 30000, 100000]
+    real(dp_bench), parameter :: PI_BENCH = acos(-1.0_dp_bench)
+    real(dp_bench), parameter :: TWOPI_BENCH = 2.0_dp_bench*PI_BENCH
+
+    type, bind(C) :: timespec
+        integer(c_long) :: tv_sec
+        integer(c_long) :: tv_nsec
+    end type timespec
+
+!> clock id for clock_gettime; macOS uses 6 for CLOCK_MONOTONIC, Linux uses 1.
+#ifdef __APPLE__
+    integer(c_int), parameter :: CLOCK_MONOTONIC = 6_c_int
+#else
+    integer(c_int), parameter :: CLOCK_MONOTONIC = 1_c_int
+#endif
+
+    interface
+        function c_clock_gettime(clock_id, tp) bind(C, name="clock_gettime")
+            import :: c_int, timespec
+            integer(c_int), value :: clock_id
+            type(timespec) :: tp
+            integer(c_int) :: c_clock_gettime
+        end function c_clock_gettime
+    end interface
+
+    call run_bench_1d()
+    call run_bench_2d()
+    call run_bench_3d()
+
+contains
+
+    subroutine bench_time_now(t)
+        real(dp_bench), intent(out) :: t
+        type(timespec) :: ts
+        integer(c_int) :: ierr
+
+        ierr = c_clock_gettime(CLOCK_MONOTONIC, ts)
+        if (ierr == 0_c_int) then
+            t = real(ts%tv_sec, dp_bench) &
+                + real(ts%tv_nsec, dp_bench)*1.0e-9_dp_bench
+        else
+            ! Fallback for platforms where CLOCK_MONOTONIC is unavailable.
+            call cpu_time(t)
+        end if
+    end subroutine bench_time_now
+
+    subroutine run_bench_1d()
+        type(SplineData1D) :: spl_interp
+        type(bspline_1d) :: spl_bs, spl_bs_dir
+        integer :: i, k, n, degree_bs
+        real(dp_bench) :: t0, t1
+        real(dp_bench) :: t_create_interp, t_eval_interp
+        real(dp_bench) :: t_create_lsq, t_eval_lsq
+        real(dp_bench) :: t_create_dir, t_eval_dir
+        real(dp_bench), allocatable :: x(:), f(:), coeff_lsq(:), coeff_dir(:)
+        real(dp_bench) :: x_min, x_max, y
+        integer :: unit, n_ctrl_lsq
+
+        x_min = 1.23_dp_bench
+        x_max = x_min + TWOPI_BENCH
+
+        open(newunit=unit, file="bench_neo_bspline_1d.dat", &
+            status="replace", action="write", form="formatted")
+
+        do k = 1, N_CASE
+            n = N_TOTAL_LIST(k)
+            if (n < 8) cycle
+
+            allocate(x(n), f(n))
+            do i = 1, n
+                x(i) = x_min + (x_max - x_min)*real(i - 1, dp_bench) &
+                    /real(n - 1, dp_bench)
+                f(i) = cos(2.0_dp_bench*x(i)) + 0.5_dp_bench*sin(3.0_dp_bench*x(i))
+            end do
+
+            ! 1) interpolate module: construct + evaluate
+            call bench_time_now(t0)
+            call construct_splines_1d(x_min, x_max, f, 5, .false., spl_interp)
+            call bench_time_now(t1)
+            t_create_interp = t1 - t0
+
+            call bench_time_now(t0)
+            do i = 1, n
+                call evaluate_splines_1d(spl_interp, x(i), y)
+            end do
+            call bench_time_now(t1)
+            t_eval_interp = t1 - t0
+
+            ! 2) neo_bspline LSQ: construct + evaluate
+            degree_bs = 5
+            n_ctrl_lsq = min(40, n/2)
+            n_ctrl_lsq = max(degree_bs + 1, n_ctrl_lsq)
+
+            allocate(coeff_lsq(n_ctrl_lsq))
+            coeff_lsq = 0.0_dp_bench
+
+            call bench_time_now(t0)
+            call bspline_1d_init_uniform(spl_bs, degree_bs, n_ctrl_lsq, x_min, &
+                x_max)
+            call bspline_1d_lsq_cgls(spl_bs, x, f, coeff_lsq, max_iter=400, &
+                tol=1.0d-10)
+            call bench_time_now(t1)
+            t_create_lsq = t1 - t0
+
+            call bench_time_now(t0)
+            do i = 1, n
+                call bspline_1d_eval(spl_bs, coeff_lsq, x(i), y)
+            end do
+            call bench_time_now(t1)
+            t_eval_lsq = t1 - t0
+
+            ! 3) neo_bspline direct interp: construct + evaluate (skip large N)
+            allocate(coeff_dir(n))
+
+            call bench_time_now(t0)
+            call bspline_1d_init_uniform(spl_bs_dir, degree_bs, n, x_min, x_max)
+            call bspline_1d_interp(spl_bs_dir, x, f, coeff_dir)
+            call bench_time_now(t1)
+            t_create_dir = t1 - t0
+
+            call bench_time_now(t0)
+            do i = 1, n
+                call bspline_1d_eval(spl_bs_dir, coeff_dir, x(i), y)
+            end do
+            call bench_time_now(t1)
+            t_eval_dir = t1 - t0
+
+            deallocate(coeff_dir)
+
+            write(unit,'(i10,6es16.8)') n, t_create_interp, t_eval_interp, &
+                t_create_lsq, t_eval_lsq, t_create_dir, t_eval_dir
+
+            deallocate(x, f, coeff_lsq)
+            call destroy_splines_1d(spl_interp)
+        end do
+
+        close(unit)
+    end subroutine run_bench_1d
+
+
+    subroutine run_bench_2d()
+        type(SplineData2D) :: spl_interp
+        type(bspline_2d) :: spl_bs, spl_bs_dir
+        integer :: i1, i2, k
+        integer :: n_total, n_side, n1, n2
+        integer :: degree(2), n_ctrl_lsq(2)
+        real(dp_bench) :: t0, t1
+        real(dp_bench) :: t_create_interp, t_eval_interp
+        real(dp_bench) :: t_create_lsq, t_eval_lsq
+        real(dp_bench) :: t_create_dir, t_eval_dir
+        real(dp_bench), allocatable :: x1(:), x2(:)
+        real(dp_bench), allocatable :: f_grid(:,:)
+        real(dp_bench), allocatable :: coeff_lsq(:,:), coeff_dir(:,:)
+        real(dp_bench) :: x_min(2), x_max(2)
+        real(dp_bench) :: x(2), y
+        integer :: unit, n_data
+
+        x_min = [1.23_dp_bench, 1.23_dp_bench]
+        x_max = x_min + [TWOPI_BENCH, TWOPI_BENCH]
+
+        open(newunit=unit, file="bench_neo_bspline_2d.dat", &
+            status="replace", action="write", form="formatted")
+
+        degree = [3, 3]
+
+        do k = 1, N_CASE
+            n_total = N_TOTAL_LIST(k)
+            n_side = int(sqrt(real(n_total, dp_bench)))
+            if (n_side < degree(1) + 2) n_side = degree(1) + 2
+
+            n1 = n_side
+            n2 = n_side
+            n_data = n1*n2
+
+            allocate(x1(n1), x2(n2), f_grid(n1, n2))
+
+            do i1 = 1, n1
+                x1(i1) = x_min(1) + (x_max(1) - x_min(1))* &
+                    real(i1 - 1, dp_bench)/real(n1 - 1, dp_bench)
+            end do
+            do i2 = 1, n2
+                x2(i2) = x_min(2) + (x_max(2) - x_min(2))* &
+                    real(i2 - 1, dp_bench)/real(n2 - 1, dp_bench)
+            end do
+
+            do i2 = 1, n2
+                do i1 = 1, n1
+                    f_grid(i1, i2) = cos(x1(i1))*cos(2.0_dp_bench*x2(i2))
+                end do
+            end do
+
+            ! 1) interpolate module: construct + evaluate
+            call bench_time_now(t0)
+            call construct_splines_2d(x_min, x_max, f_grid, [5, 5], &
+                [.false., .false.], spl_interp)
+            call bench_time_now(t1)
+            t_create_interp = t1 - t0
+
+            call bench_time_now(t0)
+            do i2 = 1, n2
+                do i1 = 1, n1
+                    x = [x1(i1), x2(i2)]
+                    call evaluate_splines_2d(spl_interp, x, y)
+                end do
+            end do
+            call bench_time_now(t1)
+            t_eval_interp = t1 - t0
+
+            ! 2) neo_bspline LSQ: construct + evaluate
+            n_ctrl_lsq(1) = min(16, n1/2)
+            n_ctrl_lsq(2) = min(16, n2/2)
+            n_ctrl_lsq(1) = max(degree(1) + 1, n_ctrl_lsq(1))
+            n_ctrl_lsq(2) = max(degree(2) + 1, n_ctrl_lsq(2))
+
+            allocate(coeff_lsq(n_ctrl_lsq(1), n_ctrl_lsq(2)))
+            coeff_lsq = 0.0_dp_bench
+
+            call bench_time_now(t0)
+            call bspline_2d_init_uniform(spl_bs, degree, n_ctrl_lsq, x_min, x_max)
+            call bspline_2d_lsq_cgls(spl_bs, x1, x2, f_grid, coeff_lsq, &
+                max_iter=800, tol=1.0d-10)
+            call bench_time_now(t1)
+            t_create_lsq = t1 - t0
+
+            call bench_time_now(t0)
+            do i2 = 1, n2
+                do i1 = 1, n1
+                    x = [x1(i1), x2(i2)]
+                    call bspline_2d_eval(spl_bs, coeff_lsq, x, y)
+                end do
+            end do
+            call bench_time_now(t1)
+            t_eval_lsq = t1 - t0
+
+            ! 3) neo_bspline direct interp: construct + evaluate (skip large N)
+            allocate(coeff_dir(n1, n2))
+
+            call bench_time_now(t0)
+            call bspline_2d_init_uniform(spl_bs_dir, degree, [n1, n2], x_min, &
+                x_max)
+            call bspline_2d_interp(spl_bs_dir, x1, x2, f_grid, coeff_dir)
+            call bench_time_now(t1)
+            t_create_dir = t1 - t0
+
+            call bench_time_now(t0)
+            do i2 = 1, n2
+                do i1 = 1, n1
+                    x = [x1(i1), x2(i2)]
+                    call bspline_2d_eval(spl_bs_dir, coeff_dir, x, y)
+                end do
+            end do
+            call bench_time_now(t1)
+            t_eval_dir = t1 - t0
+
+            deallocate(coeff_dir)
+
+            write(unit,'(i10,6es16.8)') n_data, t_create_interp, t_eval_interp, &
+                t_create_lsq, t_eval_lsq, t_create_dir, t_eval_dir
+
+            deallocate(x1, x2, f_grid, coeff_lsq)
+            call destroy_splines_2d(spl_interp)
+        end do
+
+        close(unit)
+    end subroutine run_bench_2d
+
+
+    subroutine run_bench_3d()
+        type(SplineData3D) :: spl_interp
+        type(bspline_3d) :: spl_bs, spl_bs_dir
+        integer :: i1, i2, i3, k
+        integer :: n_total, n_side, n1, n2, n3
+        integer :: degree(3), n_ctrl_lsq(3)
+        real(dp_bench) :: t0, t1
+        real(dp_bench) :: t_create_interp, t_eval_interp
+        real(dp_bench) :: t_create_lsq, t_eval_lsq
+        real(dp_bench) :: t_create_dir, t_eval_dir
+        real(dp_bench), allocatable :: x1(:), x2(:), x3(:)
+        real(dp_bench), allocatable :: f3d(:,:,:)
+        real(dp_bench), allocatable :: coeff_lsq(:,:,:), coeff_dir(:,:,:)
+        real(dp_bench) :: x_min(3), x_max(3)
+        real(dp_bench) :: x(3), y
+        integer :: unit, n_data
+
+        x_min = [1.23_dp_bench, 1.23_dp_bench, 1.23_dp_bench]
+        x_max = x_min + [TWOPI_BENCH, TWOPI_BENCH, TWOPI_BENCH]
+
+        open(newunit=unit, file="bench_neo_bspline_3d.dat", &
+            status="replace", action="write", form="formatted")
+
+        degree = [3, 3, 3]
+
+        do k = 1, N_CASE
+            n_total = N_TOTAL_LIST(k)
+            n_side = int(real(n_total, dp_bench)**(1.0_dp_bench/3.0_dp_bench))
+            if (n_side < degree(1) + 2) n_side = degree(1) + 2
+
+            n1 = n_side
+            n2 = n_side
+            n3 = n_side
+            n_data = n1*n2*n3
+
+            allocate(x1(n1), x2(n2), x3(n3))
+            allocate(f3d(n1, n2, n3))
+
+            do i1 = 1, n1
+                x1(i1) = x_min(1) + (x_max(1) - x_min(1))* &
+                    real(i1 - 1, dp_bench)/real(n1 - 1, dp_bench)
+            end do
+            do i2 = 1, n2
+                x2(i2) = x_min(2) + (x_max(2) - x_min(2))* &
+                    real(i2 - 1, dp_bench)/real(n2 - 1, dp_bench)
+            end do
+            do i3 = 1, n3
+                x3(i3) = x_min(3) + (x_max(3) - x_min(3))* &
+                    real(i3 - 1, dp_bench)/real(n3 - 1, dp_bench)
+            end do
+
+            do i3 = 1, n3
+                do i2 = 1, n2
+                    do i1 = 1, n1
+                        f3d(i1, i2, i3) = cos(x1(i1))*cos(2.0_dp_bench*x2(i2)) &
+                            *cos(3.0_dp_bench*x3(i3))
+                    end do
+                end do
+            end do
+
+            ! 1) interpolate module: construct + evaluate
+            call bench_time_now(t0)
+            call construct_splines_3d(x_min, x_max, f3d, [5, 5, 5], &
+                [.false., .false., .false.], spl_interp)
+            call bench_time_now(t1)
+            t_create_interp = t1 - t0
+
+            call bench_time_now(t0)
+            do i3 = 1, n3
+                do i2 = 1, n2
+                    do i1 = 1, n1
+                        x = [x1(i1), x2(i2), x3(i3)]
+                        call evaluate_splines_3d(spl_interp, x, y)
+                    end do
+                end do
+            end do
+            call bench_time_now(t1)
+            t_eval_interp = t1 - t0
+
+            ! 2) neo_bspline LSQ: construct + evaluate
+            n_ctrl_lsq(1) = min(16, n1/2)
+            n_ctrl_lsq(2) = min(16, n2/2)
+            n_ctrl_lsq(3) = min(16, n3/2)
+            n_ctrl_lsq(1) = max(degree(1) + 1, n_ctrl_lsq(1))
+            n_ctrl_lsq(2) = max(degree(2) + 1, n_ctrl_lsq(2))
+            n_ctrl_lsq(3) = max(degree(3) + 1, n_ctrl_lsq(3))
+
+            allocate(coeff_lsq(n_ctrl_lsq(1), n_ctrl_lsq(2), n_ctrl_lsq(3)))
+            coeff_lsq = 0.0_dp_bench
+
+            call bench_time_now(t0)
+            call bspline_3d_init_uniform(spl_bs, degree, n_ctrl_lsq, x_min, x_max)
+            call bspline_3d_lsq_cgls(spl_bs, x1, x2, x3, f3d, coeff_lsq, &
+                max_iter=800, tol=1.0d-10)
+            call bench_time_now(t1)
+            t_create_lsq = t1 - t0
+
+            call bench_time_now(t0)
+            do i3 = 1, n3
+                do i2 = 1, n2
+                    do i1 = 1, n1
+                        x = [x1(i1), x2(i2), x3(i3)]
+                        call bspline_3d_eval(spl_bs, coeff_lsq, x, y)
+                    end do
+                end do
+            end do
+            call bench_time_now(t1)
+            t_eval_lsq = t1 - t0
+
+            ! 3) neo_bspline direct interp: construct + evaluate (skip large N)
+            allocate(coeff_dir(n1, n2, n3))
+
+            call bench_time_now(t0)
+            call bspline_3d_init_uniform(spl_bs_dir, degree, [n1, n2, n3], &
+                x_min, x_max)
+            call bspline_3d_interp(spl_bs_dir, x1, x2, x3, f3d, coeff_dir)
+            call bench_time_now(t1)
+            t_create_dir = t1 - t0
+
+            call bench_time_now(t0)
+            do i3 = 1, n3
+                do i2 = 1, n2
+                    do i1 = 1, n1
+                        x = [x1(i1), x2(i2), x3(i3)]
+                        call bspline_3d_eval(spl_bs_dir, coeff_dir, x, y)
+                    end do
+                end do
+            end do
+            call bench_time_now(t1)
+            t_eval_dir = t1 - t0
+
+            deallocate(coeff_dir)
+
+            write(unit,'(i10,6es16.8)') n_data, t_create_interp, t_eval_interp, &
+                t_create_lsq, t_eval_lsq, t_create_dir, t_eval_dir
+
+            deallocate(x1, x2, x3, f3d, coeff_lsq)
+            if (n_data <= 10000) call destroy_splines_3d(spl_interp)
+        end do
+
+        close(unit)
+    end subroutine run_bench_3d
+
+end program bench_neo_bspline_vs_interpolate
diff --git a/test/source/test_neo_bspline_1d.f90 b/test/source/test_neo_bspline_1d.f90
new file mode 100644
index 00000000..dab6895a
--- /dev/null
+++ b/test/source/test_neo_bspline_1d.f90
@@ -0,0 +1,180 @@
+program test_neo_bspline_1d
+    use libneo_kinds, only : dp
+    use math_constants
+    use libneo_util,  only : linspace
+    use neo_bspline
+
+    implicit none
+
+    real(dp), parameter :: TOL_L2 = 5.0d-7
+    real(dp), parameter :: X_MIN = 1.23d0
+    real(dp), parameter :: X_MAX = TWOPI + 1.23d0
+
+    call test_bspline_1d_partition_unity()
+    call test_bspline_1d_lsq_cos()
+    call test_bspline_1d_lsq_batch()
+
+contains
+
+    subroutine test_bspline_1d_partition_unity()
+        type(bspline_1d) :: spl
+        integer, parameter :: DEGREE = 3
+        integer, parameter :: N_CTRL = 20
+        integer, parameter :: N_SAMPLE = 200
+
+        real(dp) :: x_eval(N_SAMPLE)
+        real(dp) :: sum_basis, x, temp
+        real(dp) :: max_err
+        real(dp) :: c(N_CTRL)
+        integer :: i, j
+
+        print *, "Testing neo_bspline 1D partition of unity"
+
+        call bspline_1d_init_uniform(spl, DEGREE, N_CTRL, X_MIN, X_MAX)
+        call linspace(X_MIN, X_MAX, N_SAMPLE, x_eval)
+
+        max_err = 0.0d0
+        do i = 1, N_SAMPLE
+            x = x_eval(i)
+            sum_basis = 0.0d0
+            do j = 1, N_CTRL
+                c = 0.0d0
+                c(j) = 1.0d0
+                call bspline_1d_eval(spl, c, x, temp)
+                sum_basis = sum_basis + temp
+            end do
+            max_err = max(max_err, abs(sum_basis - 1.0d0))
+        end do
+
+        print *, "  max |sum_j N_j(x) - 1| =", max_err
+
+        if (max_err > 1.0d-10) then
+            error stop "neo_bspline 1D partition of unity violated"
+        end if
+
+    end subroutine test_bspline_1d_partition_unity
+
+
+    subroutine test_bspline_1d_lsq_cos()
+        type(bspline_1d) :: spl
+        integer, parameter :: DEGREE = 5
+        integer, parameter :: N_CTRL = 60
+        integer, parameter :: N_DATA = 400
+        integer, parameter :: N_PLOT = 201
+
+        real(dp) :: x_data(N_DATA), f_data(N_DATA)
+        real(dp) :: coeff(N_CTRL)
+        real(dp) :: x_plot(N_PLOT), f_true(N_PLOT), f_fit(N_PLOT)
+        real(dp) :: err2
+        integer :: i
+
+        print *, "Testing neo_bspline 1D LSQ CGLS (cos)"
+
+        call bspline_1d_init_uniform(spl, DEGREE, N_CTRL, X_MIN, X_MAX)
+
+        ! Scattered data in [X_MIN, X_MAX]
+        do i = 1, N_DATA
+            x_data(i) = X_MIN + (X_MAX - X_MIN) * real(i, dp) / real(N_DATA + 1, dp)
+            x_data(i) = x_data(i) + 0.1d0 * sin(3.0d0*real(i, dp))
+            if (x_data(i) < X_MIN) x_data(i) = X_MIN
+            if (x_data(i) > X_MAX) x_data(i) = X_MAX
+            f_data(i) = cos(2.0d0*x_data(i)) + 0.5d0*sin(3.0d0*x_data(i))
+        end do
+
+        coeff = 0.0_dp
+        call bspline_1d_lsq_cgls(spl, x_data, f_data, coeff, max_iter=400, tol=1.0d-10)
+
+        call linspace(X_MIN, X_MAX, N_PLOT, x_plot)
+        do i = 1, N_PLOT
+            f_true(i) = cos(2.0d0*x_plot(i)) + 0.5d0*sin(3.0d0*x_plot(i))
+            call bspline_1d_eval(spl, coeff, x_plot(i), f_fit(i))
+        end do
+
+        err2 = sum((f_fit - f_true)**2)/real(N_PLOT, dp)
+        print *, "  LSQ L2 error =", sqrt(err2)
+
+        if (sqrt(err2) > TOL_L2) then
+            error stop "neo_bspline 1D LSQ error too large"
+        end if
+
+        call write_plot_data("bspline_1d_lsq.dat", N_PLOT, x_plot, f_true, f_fit)
+
+    end subroutine test_bspline_1d_lsq_cos
+
+
+    subroutine test_bspline_1d_lsq_batch()
+        type(bspline_1d) :: spl
+        integer, parameter :: DEGREE = 5
+        integer, parameter :: N_CTRL = 60
+        integer, parameter :: N_DATA = 400
+        integer, parameter :: N_PLOT = 201
+        integer, parameter :: N_RHS = 2
+
+        real(dp) :: x_data(N_DATA)
+        real(dp) :: f_data(N_DATA)
+        real(dp) :: coeff(N_CTRL, N_RHS)
+        real(dp) :: x_plot(N_PLOT)
+        real(dp) :: f_true(N_PLOT, N_RHS)
+        real(dp) :: f_fit(N_PLOT, N_RHS)
+        real(dp) :: err2(N_RHS)
+        integer :: i, k
+
+        print *, "Testing neo_bspline 1D LSQ CGLS (batched)"
+
+        call bspline_1d_init_uniform(spl, DEGREE, N_CTRL, X_MIN, X_MAX)
+
+        do i = 1, N_DATA
+            x_data(i) = X_MIN + (X_MAX - X_MIN)*real(i, dp)/real(N_DATA + 1, dp)
+            x_data(i) = x_data(i) + 0.1d0*sin(3.0d0*real(i, dp))
+            if (x_data(i) < X_MIN) x_data(i) = X_MIN
+            if (x_data(i) > X_MAX) x_data(i) = X_MAX
+        end do
+
+        do k = 1, N_RHS
+            do i = 1, N_DATA
+                if (k == 1) then
+                    f_data(i) = cos(2.0d0*x_data(i)) + 0.5d0*sin(3.0d0*x_data(i))
+                else
+                    f_data(i) = sin(1.5d0*x_data(i)) + 0.3d0*cos(4.0d0*x_data(i))
+                end if
+            end do
+            coeff(:, k) = 0.0_dp
+            call bspline_1d_lsq_cgls(spl, x_data, f_data, coeff(:, k), &
+                max_iter=400, tol=1.0d-10)
+        end do
+
+        call linspace(X_MIN, X_MAX, N_PLOT, x_plot)
+        do i = 1, N_PLOT
+            f_true(i, 1) = cos(2.0d0*x_plot(i)) + 0.5d0*sin(3.0d0*x_plot(i))
+            f_true(i, 2) = sin(1.5d0*x_plot(i)) + 0.3d0*cos(4.0d0*x_plot(i))
+            do k = 1, N_RHS
+                call bspline_1d_eval(spl, coeff(:, k), x_plot(i), f_fit(i, k))
+            end do
+        end do
+
+        do k = 1, N_RHS
+            err2(k) = sum((f_fit(:, k) - f_true(:, k))**2)/real(N_PLOT, dp)
+            print *, "  LSQ L2 error (rhs =", k, ") =", sqrt(err2(k))
+            if (sqrt(err2(k)) > TOL_L2) then
+                error stop "neo_bspline 1D batched LSQ error too large"
+            end if
+        end do
+
+    end subroutine test_bspline_1d_lsq_batch
+
+
+    subroutine write_plot_data(fname, n, x, f_true, f_fit)
+        character(*), intent(in) :: fname
+        integer, intent(in) :: n
+        real(dp), intent(in) :: x(n), f_true(n), f_fit(n)
+        integer :: iunit, i
+
+        open(newunit=iunit, file=fname, status="replace", action="write", &
+            form="formatted")
+        do i = 1, n
+            write(iunit,'(3es24.16)') x(i), f_true(i), f_fit(i)
+        end do
+        close(iunit)
+    end subroutine write_plot_data
+
+end program test_neo_bspline_1d
diff --git a/test/source/test_neo_bspline_2d.f90 b/test/source/test_neo_bspline_2d.f90
new file mode 100644
index 00000000..d7d1188b
--- /dev/null
+++ b/test/source/test_neo_bspline_2d.f90
@@ -0,0 +1,186 @@
+program test_neo_bspline_2d
+    use libneo_kinds, only: dp
+    use math_constants
+    use libneo_util, only: linspace
+    use neo_bspline
+
+    implicit none
+
+    real(dp), parameter :: X_MIN = 1.23d0
+    real(dp), parameter :: X_MAX = TWOPI + 1.23d0
+    real(dp), parameter :: TOL_L2_FULL = 5.0d-4
+
+    call test_bspline_2d_lsq_full_grid()
+    call test_bspline_2d_interp_exact()
+    call test_bspline_2d_lsq_full_grid_batch()
+
+contains
+
+    subroutine test_bspline_2d_lsq_full_grid()
+        type(bspline_2d) :: spl
+        integer, parameter :: DEGREE(2) = [3, 3]
+        integer, parameter :: N_CTRL(2) = [32, 28]
+        integer, parameter :: N_GRID1 = 40
+        integer, parameter :: N_GRID2 = 36
+
+        real(dp) :: x1(N_GRID1), x2(N_GRID2)
+        real(dp) :: f_grid(N_GRID1, N_GRID2)
+        real(dp) :: coeff(N_CTRL(1), N_CTRL(2))
+
+        integer :: i1, i2
+        real(dp) :: x(2), f_true, f_fit
+        real(dp) :: err2
+
+        print *, "Testing neo_bspline 2D LSQ CGLS (full grid)"
+
+        call bspline_2d_init_uniform(spl, DEGREE, N_CTRL, [X_MIN, X_MIN], &
+            [X_MAX, X_MAX])
+
+        call linspace(X_MIN, X_MAX, N_GRID1, x1)
+        call linspace(X_MIN, X_MAX, N_GRID2, x2)
+
+        do i2 = 1, N_GRID2
+            do i1 = 1, N_GRID1
+                f_grid(i1, i2) = cos(x1(i1))*cos(2.0d0*x2(i2))
+            end do
+        end do
+
+        coeff = 0.0_dp
+        call bspline_2d_lsq_cgls(spl, x1, x2, f_grid, coeff, &
+            max_iter=800, tol=1.0d-10)
+
+        err2 = 0.0d0
+        open(unit=20, file="bspline_2d_lsq_grid.dat", status="replace", &
+            action="write")
+        do i2 = 1, N_GRID2
+            do i1 = 1, N_GRID1
+                x(1) = x1(i1)
+                x(2) = x2(i2)
+                f_true = cos(x(1))*cos(2.0d0*x(2))
+                call bspline_2d_eval(spl, coeff, x, f_fit)
+                err2 = err2 + (f_fit - f_true)**2
+                write(20,'(4es24.16)') x(1), x(2), f_true, f_fit
+            end do
+        end do
+        close(20)
+
+        err2 = err2/real(N_GRID1*N_GRID2, dp)
+        print *, "  2D LSQ full-grid L2 error =", sqrt(err2)
+
+        if (sqrt(err2) > TOL_L2_FULL) then
+            error stop "neo_bspline 2D LSQ full-grid error too large"
+        end if
+
+    end subroutine test_bspline_2d_lsq_full_grid
+
+
+    subroutine test_bspline_2d_interp_exact()
+        type(bspline_2d) :: spl
+        integer, parameter :: DEGREE(2) = [3, 3]
+        integer, parameter :: N1 = 18
+        integer, parameter :: N2 = 14
+
+        real(dp) :: x1(N1), x2(N2)
+        real(dp) :: f_grid(N1, N2)
+        real(dp) :: coeff(N1, N2)
+        real(dp) :: f_fit, f_true, x(2)
+        real(dp) :: err_max
+        integer :: i1, i2
+
+        print *, "Testing neo_bspline 2D direct interpolation (collocation)"
+
+        call bspline_2d_init_uniform(spl, DEGREE, [N1, N2], [X_MIN, X_MIN], &
+            [X_MAX, X_MAX])
+
+        call linspace(X_MIN, X_MAX, N1, x1)
+        call linspace(X_MIN, X_MAX, N2, x2)
+
+        do i2 = 1, N2
+            do i1 = 1, N1
+                f_grid(i1, i2) = cos(x1(i1))*cos(2.0d0*x2(i2))
+            end do
+        end do
+
+        call bspline_2d_interp(spl, x1, x2, f_grid, coeff)
+
+        err_max = 0.0d0
+        do i2 = 1, N2
+            do i1 = 1, N1
+                x = [x1(i1), x2(i2)]
+                f_true = cos(x(1))*cos(2.0d0*x(2))
+                call bspline_2d_eval(spl, coeff, x, f_fit)
+                err_max = max(err_max, abs(f_fit - f_true))
+            end do
+        end do
+
+        print *, "  2D direct interp max error =", err_max
+        if (err_max > 1.0d-10) then
+            error stop "neo_bspline 2D direct interpolation error too large"
+        end if
+    end subroutine test_bspline_2d_interp_exact
+
+
+    subroutine test_bspline_2d_lsq_full_grid_batch()
+        type(bspline_2d) :: spl
+        integer, parameter :: DEGREE(2) = [3, 3]
+        integer, parameter :: N_CTRL(2) = [32, 28]
+        integer, parameter :: N_GRID1 = 40
+        integer, parameter :: N_GRID2 = 36
+        integer, parameter :: N_RHS = 2
+
+        real(dp) :: x1(N_GRID1), x2(N_GRID2)
+        real(dp) :: f_grid(N_GRID1, N_GRID2)
+        real(dp) :: coeff(N_CTRL(1), N_CTRL(2), N_RHS)
+
+        real(dp) :: x(2), f_true(N_RHS), f_fit
+        real(dp) :: err2(N_RHS)
+        integer :: i1, i2, k
+
+        print *, "Testing neo_bspline 2D LSQ CGLS (full grid, batched)"
+
+        call bspline_2d_init_uniform(spl, DEGREE, N_CTRL, [X_MIN, X_MIN], &
+            [X_MAX, X_MAX])
+
+        call linspace(X_MIN, X_MAX, N_GRID1, x1)
+        call linspace(X_MIN, X_MAX, N_GRID2, x2)
+
+        do k = 1, N_RHS
+            do i2 = 1, N_GRID2
+                do i1 = 1, N_GRID1
+                    if (k == 1) then
+                        f_grid(i1, i2) = cos(x1(i1))*cos(2.0d0*x2(i2))
+                    else
+                        f_grid(i1, i2) = sin(x1(i1))*cos(3.0d0*x2(i2))
+                    end if
+                end do
+            end do
+            coeff(:, :, k) = 0.0_dp
+            call bspline_2d_lsq_cgls(spl, x1, x2, f_grid, coeff(:, :, k), &
+                max_iter=800, tol=1.0d-10)
+        end do
+
+        err2 = 0.0d0
+        do i2 = 1, N_GRID2
+            do i1 = 1, N_GRID1
+                x(1) = x1(i1)
+                x(2) = x2(i2)
+                f_true(1) = cos(x(1))*cos(2.0d0*x(2))
+                f_true(2) = sin(x(1))*cos(3.0d0*x(2))
+                do k = 1, N_RHS
+                    call bspline_2d_eval(spl, coeff(:, :, k), x, f_fit)
+                    err2(k) = err2(k) + (f_fit - f_true(k))**2
+                end do
+            end do
+        end do
+
+        err2 = err2/real(N_GRID1*N_GRID2, dp)
+        do k = 1, N_RHS
+            print *, "  2D LSQ full-grid L2 error (rhs =", k, ") =", sqrt(err2(k))
+            if (sqrt(err2(k)) > TOL_L2_FULL) then
+                error stop "neo_bspline 2D batched LSQ full-grid error too large"
+            end if
+        end do
+
+    end subroutine test_bspline_2d_lsq_full_grid_batch
+
+end program test_neo_bspline_2d
diff --git a/test/source/test_neo_bspline_3d.f90 b/test/source/test_neo_bspline_3d.f90
new file mode 100644
index 00000000..f38b4d25
--- /dev/null
+++ b/test/source/test_neo_bspline_3d.f90
@@ -0,0 +1,155 @@
+program test_neo_bspline_3d
+    use libneo_kinds, only: dp
+    use math_constants
+    use libneo_util, only: linspace
+    use neo_bspline
+
+    implicit none
+
+    real(dp), parameter :: X_MIN = 1.23d0
+    real(dp), parameter :: X_MAX = TWOPI + 1.23d0
+    real(dp), parameter :: TOL_L2_3D = 2.0d-3
+
+    call test_bspline_3d_lsq_full_grid()
+    call test_bspline_3d_lsq_batch()
+
+contains
+
+    subroutine test_bspline_3d_lsq_full_grid()
+        type(bspline_3d) :: spl
+        integer, parameter :: DEGREE(3) = [3, 3, 3]
+        integer, parameter :: N_CTRL(3) = [24, 20, 16]
+        integer, parameter :: N_GRID1 = 18
+        integer, parameter :: N_GRID2 = 16
+        integer, parameter :: N_GRID3 = 14
+
+        real(dp) :: x1(N_GRID1), x2(N_GRID2), x3(N_GRID3)
+        real(dp) :: f_grid(N_GRID1, N_GRID2, N_GRID3)
+        real(dp) :: coeff(N_CTRL(1), N_CTRL(2), N_CTRL(3))
+
+        real(dp) :: x(3), f_true, f_fit
+        real(dp) :: err2
+        integer :: i1, i2, i3
+
+        print *, "Testing neo_bspline 3D LSQ CGLS (full grid)"
+
+        call bspline_3d_init_uniform(spl, DEGREE, N_CTRL, &
+            [X_MIN, X_MIN, X_MIN], [X_MAX, X_MAX, X_MAX])
+
+        call linspace(X_MIN, X_MAX, N_GRID1, x1)
+        call linspace(X_MIN, X_MAX, N_GRID2, x2)
+        call linspace(X_MIN, X_MAX, N_GRID3, x3)
+
+        do i3 = 1, N_GRID3
+            do i2 = 1, N_GRID2
+                do i1 = 1, N_GRID1
+                    f_grid(i1, i2, i3) = cos(x1(i1))*cos(2.0d0*x2(i2)) &
+                        *cos(3.0d0*x3(i3))
+                end do
+            end do
+        end do
+
+        coeff = 0.0_dp
+        call bspline_3d_lsq_cgls(spl, x1, x2, x3, f_grid, coeff, &
+            max_iter=800, tol=1.0d-10)
+
+        err2 = 0.0d0
+        open(unit=30, file="bspline_3d_lsq_grid.dat", status="replace", &
+            action="write")
+        do i3 = 1, N_GRID3
+            do i2 = 1, N_GRID2
+                do i1 = 1, N_GRID1
+                    x(1) = x1(i1)
+                    x(2) = x2(i2)
+                    x(3) = x3(i3)
+                    f_true = cos(x(1))*cos(2.0d0*x(2))*cos(3.0d0*x(3))
+                    call bspline_3d_eval(spl, coeff, x, f_fit)
+                    err2 = err2 + (f_fit - f_true)**2
+                    write(30,'(5es24.16)') x(1), x(2), x(3), f_true, f_fit
+                end do
+            end do
+        end do
+        close(30)
+
+        err2 = err2/real(N_GRID1*N_GRID2*N_GRID3, dp)
+        print *, "  3D LSQ full-grid L2 error =", sqrt(err2)
+
+        if (sqrt(err2) > TOL_L2_3D) then
+            error stop "neo_bspline 3D LSQ full-grid error too large"
+        end if
+
+    end subroutine test_bspline_3d_lsq_full_grid
+
+    subroutine test_bspline_3d_lsq_batch()
+        type(bspline_3d) :: spl
+        integer, parameter :: DEGREE(3) = [3, 3, 3]
+        integer, parameter :: N_CTRL(3) = [24, 20, 16]
+        integer, parameter :: N_GRID1 = 18
+        integer, parameter :: N_GRID2 = 16
+        integer, parameter :: N_GRID3 = 14
+        integer, parameter :: N_RHS = 2
+
+        real(dp) :: x1(N_GRID1), x2(N_GRID2), x3(N_GRID3)
+        real(dp) :: f_grid(N_GRID1, N_GRID2, N_GRID3)
+        real(dp) :: coeff(N_CTRL(1), N_CTRL(2), N_CTRL(3), N_RHS)
+
+        real(dp) :: x(3), f_true(N_RHS), f_fit
+        real(dp) :: err2(N_RHS)
+        integer :: i1, i2, i3, k
+
+        print *, "Testing neo_bspline 3D LSQ CGLS (batched)"
+
+        call bspline_3d_init_uniform(spl, DEGREE, N_CTRL, &
+            [X_MIN, X_MIN, X_MIN], [X_MAX, X_MAX, X_MAX])
+
+        call linspace(X_MIN, X_MAX, N_GRID1, x1)
+        call linspace(X_MIN, X_MAX, N_GRID2, x2)
+        call linspace(X_MIN, X_MAX, N_GRID3, x3)
+
+        do k = 1, N_RHS
+            do i3 = 1, N_GRID3
+                do i2 = 1, N_GRID2
+                    do i1 = 1, N_GRID1
+                        if (k == 1) then
+                            f_grid(i1, i2, i3) = cos(x1(i1))*cos(2.0d0*x2(i2)) &
+                                *cos(3.0d0*x3(i3))
+                        else
+                            f_grid(i1, i2, i3) = sin(x1(i1))*cos(2.0d0*x2(i2)) &
+                                *cos(1.5d0*x3(i3))
+                        end if
+                    end do
+                end do
+            end do
+            coeff(:, :, :, k) = 0.0_dp
+            call bspline_3d_lsq_cgls(spl, x1, x2, x3, f_grid, &
+                coeff(:, :, :, k), max_iter=800, tol=1.0d-10)
+        end do
+
+        err2 = 0.0d0
+        do i3 = 1, N_GRID3
+            do i2 = 1, N_GRID2
+                do i1 = 1, N_GRID1
+                    x(1) = x1(i1)
+                    x(2) = x2(i2)
+                    x(3) = x3(i3)
+                    f_true(1) = cos(x(1))*cos(2.0d0*x(2))*cos(3.0d0*x(3))
+                    f_true(2) = sin(x(1))*cos(2.0d0*x(2))*cos(1.5d0*x(3))
+                    do k = 1, N_RHS
+                        call bspline_3d_eval(spl, coeff(:, :, :, k), x, f_fit)
+                        err2(k) = err2(k) + (f_fit - f_true(k))**2
+                    end do
+                end do
+            end do
+        end do
+
+        err2 = err2/real(N_GRID1*N_GRID2*N_GRID3, dp)
+        do k = 1, N_RHS
+            print *, "  3D LSQ full-grid L2 error (rhs =", k, ") =", sqrt(err2(k))
+            if (sqrt(err2(k)) > TOL_L2_3D) then
+                error stop "neo_bspline 3D batched LSQ full-grid error too large"
+            end if
+        end do
+
+    end subroutine test_bspline_3d_lsq_batch
+
+end program test_neo_bspline_3d