Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/interpolate/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ add_library(interpolate STATIC
batch_interpolate_2d.f90
batch_interpolate_3d.f90
batch_interpolate.f90
cgls_small.f90
cgls_dense.f90
interpolate.f90)

if(LIBNEO_BUILD_TESTING)
Expand Down
2 changes: 2 additions & 0 deletions src/interpolate/batch_interpolate.f90
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module batch_interpolate
use batch_interpolate_types, only: BatchSplineData1D, BatchSplineData2D, &
BatchSplineData3D
use batch_interpolate_1d, only: construct_batch_splines_1d, &
construct_batch_splines_1d_lsq, &
construct_batch_splines_1d_lines, &
construct_batch_splines_1d_resident, &
construct_batch_splines_1d_resident_device, &
Expand Down Expand Up @@ -49,6 +50,7 @@ module batch_interpolate

! Re-export 1D routines from batch_interpolate_1d
public :: construct_batch_splines_1d, destroy_batch_splines_1d
public :: construct_batch_splines_1d_lsq
public :: construct_batch_splines_1d_lines
public :: construct_batch_splines_1d_resident
public :: construct_batch_splines_1d_resident_device
Expand Down
115 changes: 115 additions & 0 deletions src/interpolate/batch_interpolate_1d.f90
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ module batch_interpolate_1d
spl_four_per_line, spl_four_reg_line, &
spl_five_per_line, spl_five_reg_line
use spl_three_to_five_sub, only: spl_per, spl_reg
use cgls_dense, only: cgls_dense_solve
#ifdef _OPENACC
use openacc, only: acc_is_present
#endif
Expand All @@ -14,6 +15,7 @@ module batch_interpolate_1d

! Export batch spline construction/destruction routines
public :: construct_batch_splines_1d
public :: construct_batch_splines_1d_lsq
public :: construct_batch_splines_1d_lines
public :: construct_batch_splines_1d_resident
public :: construct_batch_splines_1d_resident_device
Expand Down Expand Up @@ -118,6 +120,119 @@ subroutine construct_batch_splines_1d_legacy(x_min, x_max, y_batch, order, perio
deallocate (splcoe_temp)
end subroutine construct_batch_splines_1d_legacy

subroutine construct_batch_splines_1d_lsq(x_min, x_max, order, periodic, &
num_points, x_data, f_batch, spl, weights)
real(dp), intent(in) :: x_min, x_max
integer, intent(in) :: order
logical, intent(in) :: periodic
integer, intent(in) :: num_points
real(dp), intent(in) :: x_data(:)
real(dp), intent(in) :: f_batch(:, :) ! (n_data, n_quantities)
type(BatchSplineData1D), intent(out) :: spl
real(dp), intent(in), optional :: weights(:)

integer :: n_data, n_quantities, n_keep
integer :: i, iq, col
real(dp), allocatable :: x_used(:), f_used(:, :), w_used(:)
real(dp), allocatable :: phi(:, :), y_vec(:), y_knots(:, :)
type(BatchSplineData1D) :: spl_unit
real(dp) :: val(1)

n_data = size(x_data)
if (n_data /= size(f_batch, 1)) then
error stop "construct_batch_splines_1d_lsq: data size mismatch"
end if
n_quantities = size(f_batch, 2)
if (n_quantities < 1) then
error stop "construct_batch_splines_1d_lsq: need at least 1 quantity"
end if
if (num_points < 2) then
error stop "construct_batch_splines_1d_lsq: need at least 2 grid points"
end if

if (present(weights)) then
if (size(weights) /= n_data) then
error stop "construct_batch_splines_1d_lsq: weights size mismatch"
end if
end if

n_keep = 0
do i = 1, n_data
if (.not. periodic) then
if (x_data(i) < x_min .or. x_data(i) > x_max) cycle
end if
if (present(weights)) then
if (weights(i) == 0.0_dp) cycle
end if
n_keep = n_keep + 1
end do

if (n_keep == 0) then
error stop "construct_batch_splines_1d_lsq: no usable data"
end if

allocate (x_used(n_keep), f_used(n_keep, n_quantities))
if (present(weights)) allocate (w_used(n_keep))

n_keep = 0
do i = 1, n_data
if (.not. periodic) then
if (x_data(i) < x_min .or. x_data(i) > x_max) cycle
end if
if (present(weights)) then
if (weights(i) == 0.0_dp) cycle
w_used(n_keep + 1) = weights(i)
end if
n_keep = n_keep + 1
x_used(n_keep) = x_data(i)
f_used(n_keep, :) = f_batch(i, :)
end do

allocate (phi(n_keep, num_points))
allocate (y_vec(num_points))
allocate (y_knots(num_points, n_quantities))

do col = 1, num_points
call allocate_batch_unit(order, num_points, periodic, col, spl_unit, x_min, x_max)
do i = 1, n_keep
call evaluate_batch_splines_1d(spl_unit, x_used(i), val)
phi(i, col) = val(1)
end do
call destroy_batch_splines_1d(spl_unit)
end do

do iq = 1, n_quantities
if (present(weights)) then
call cgls_dense_solve(phi, f_used(:, iq), y_vec, w_used, &
max_iter=4*num_points, tol=1.0d-14)
else
call cgls_dense_solve(phi, f_used(:, iq), y_vec, &
max_iter=4*num_points, tol=1.0d-14)
end if
y_knots(:, iq) = y_vec
end do

call construct_batch_splines_1d(x_min, x_max, y_knots, order, periodic, spl)

deallocate (phi, y_vec, y_knots, x_used, f_used)
if (present(weights)) deallocate (w_used)
end subroutine construct_batch_splines_1d_lsq

subroutine allocate_batch_unit(order, num_points, periodic, idx, spl, x_min, x_max)
integer, intent(in) :: order, num_points, idx
logical, intent(in) :: periodic
real(dp), intent(in) :: x_min, x_max
type(BatchSplineData1D), intent(out) :: spl

real(dp), allocatable :: y_basis(:, :)

allocate (y_basis(num_points, 1))
y_basis = 0.0_dp
y_basis(idx, 1) = 1.0_dp
call construct_batch_splines_1d(x_min, x_max, y_basis, order, periodic, spl)
deallocate (y_basis)
end subroutine allocate_batch_unit

subroutine construct_batch_splines_1d_lines(x_min, x_max, y_batch, order, periodic, spl)
real(dp), intent(in) :: x_min, x_max
real(dp), intent(in) :: y_batch(:, :) ! (n_points, n_quantities)
Expand Down
212 changes: 212 additions & 0 deletions src/interpolate/cgls_dense.f90
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
module cgls_dense
!! Conjugate gradient for normal equations using an explicit
!! design matrix stored in column-major form. OpenMP and SIMD
!! are used inside the matrix-vector products.
use, intrinsic :: iso_fortran_env, only : dp => real64
implicit none
private

public :: cgls_dense_solve

contains

subroutine matvec_phi_core(phi, x, y)
!! y = A*x where A == phi (n_rows x n_cols)
real(dp), intent(in) :: phi(:,:)
real(dp), intent(in) :: x(:)
real(dp), intent(out) :: y(:)

integer :: i, j, n_rows, n_cols
real(dp) :: acc

n_rows = size(phi, 1)
n_cols = size(phi, 2)

if (size(x) /= n_cols) then
error stop "matvec_phi_core: x size mismatch"
end if
if (size(y) /= n_rows) then
error stop "matvec_phi_core: y size mismatch"
end if

y = 0.0_dp

!$omp parallel do default(shared) private(i, j, acc)
do i = 1, n_rows
acc = 0.0_dp
!$omp simd reduction(+:acc)
do j = 1, n_cols
acc = acc + phi(i, j)*x(j)
end do
y(i) = acc
end do
!$omp end parallel do
end subroutine matvec_phi_core


subroutine matvec_phi_t_core(phi, r, y)
!! y = A^T*r where A == phi (n_rows x n_cols)
real(dp), intent(in) :: phi(:,:)
real(dp), intent(in) :: r(:)
real(dp), intent(out) :: y(:)

integer :: i, j, n_rows, n_cols
real(dp) :: rloc

n_rows = size(phi, 1)
n_cols = size(phi, 2)

if (size(r) /= n_rows) then
error stop "matvec_phi_t_core: r size mismatch"
end if
if (size(y) /= n_cols) then
error stop "matvec_phi_t_core: y size mismatch"
end if

y = 0.0_dp

!$omp parallel do default(shared) private(j, i, rloc) schedule(static)
do j = 1, n_cols
rloc = 0.0_dp
!$omp simd reduction(+:rloc)
do i = 1, n_rows
rloc = rloc + phi(i, j)*r(i)
end do
y(j) = rloc
end do
!$omp end parallel do
end subroutine matvec_phi_t_core


subroutine cgls_dense_core(phi, rhs, x, max_iter, tol)
!! Conjugate Gradient for normal equations using
!! explicit matrix-vector products (unweighted case).
real(dp), intent(in) :: phi(:,:)
real(dp), intent(in) :: rhs(:)
real(dp), intent(inout) :: x(:)
integer, intent(in) :: max_iter
real(dp), intent(in) :: tol

integer :: iter, n_rows, n_cols
real(dp) :: gamma, gamma_new, alpha, beta, denom
real(dp) :: rhs_norm

real(dp), allocatable :: r(:), s(:), p(:), q(:)

n_rows = size(phi, 1)
n_cols = size(phi, 2)

if (size(rhs) /= n_rows) then
error stop "cgls_dense_core: rhs size mismatch"
end if
if (size(x) /= n_cols) then
error stop "cgls_dense_core: x size mismatch"
end if

allocate(r(n_rows))
allocate(s(n_cols))
allocate(p(n_cols))
allocate(q(n_rows))

r = rhs

call matvec_phi_t_core(phi, r, s)
p = s
gamma = sum(s*s)
rhs_norm = sqrt(sum(r*r))

if (rhs_norm == 0.0_dp) then
x = 0.0_dp
deallocate(r, s, p, q)
return
end if

do iter = 1, max_iter
call matvec_phi_core(phi, p, q)
denom = sum(q*q)
if (denom <= 0.0_dp) exit

alpha = gamma/denom
x = x + alpha*p
r = r - alpha*q

call matvec_phi_t_core(phi, r, s)
gamma_new = sum(s*s)

if (gamma_new <= (tol*rhs_norm)**2) exit

beta = gamma_new/gamma
gamma = gamma_new
p = s + beta*p
end do

deallocate(r, s, p, q)
end subroutine cgls_dense_core


subroutine cgls_dense_solve(phi, rhs, x, w, max_iter, tol)
!! Conjugate Gradient for normal equations using
!! explicit matrix-vector products.
!! If w is present, each row i is scaled by w(i).
real(dp), intent(in) :: phi(:,:)
real(dp), intent(in) :: rhs(:)
real(dp), intent(out) :: x(:)
real(dp), intent(in), optional :: w(:)
integer, intent(in), optional :: max_iter
real(dp), intent(in), optional :: tol

integer :: n_rows, n_cols, kmax, i, j
real(dp) :: atol
real(dp), allocatable :: phi_eff(:,:), rhs_eff(:)

n_rows = size(phi, 1)
n_cols = size(phi, 2)

if (size(rhs) /= n_rows) then
error stop "cgls_dense_solve: rhs size mismatch"
end if
if (size(x) /= n_cols) then
error stop "cgls_dense_solve: x size mismatch"
end if
if (present(w)) then
if (size(w) /= n_rows) then
error stop "cgls_dense_solve: w size mismatch"
end if
end if

if (present(max_iter)) then
kmax = max_iter
else
kmax = 200
end if
if (present(tol)) then
atol = tol
else
atol = 1.0d-12
end if

x = 0.0_dp

if (present(w)) then
allocate(phi_eff(n_rows, n_cols))
allocate(rhs_eff(n_rows))

!$omp parallel do default(shared) private(i, j)
do i = 1, n_rows
rhs_eff(i) = w(i)*rhs(i)
!$omp simd
do j = 1, n_cols
phi_eff(i, j) = w(i)*phi(i, j)
end do
end do
!$omp end parallel do

call cgls_dense_core(phi_eff, rhs_eff, x, kmax, atol)

deallocate(phi_eff, rhs_eff)
else
call cgls_dense_core(phi, rhs, x, kmax, atol)
end if
end subroutine cgls_dense_solve

end module cgls_dense
Loading