Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,38 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
- Compiler: GNU Fortran (gfortran)
- Optional: OpenMP (enabled by default)

### OpenACC GPU Builds
SIMPLE supports GPU acceleration via OpenACC using GCC 16+ with nvptx offload.

**GCC 16 with nvptx offload** (experimental):
- Location: `/temp/AG-plasma/opt/gcc16`
- RTX 4090 GPU available for testing
- CRITICAL: OpenMP must be disabled (configure with `-DENABLE_OPENMP=OFF`) - the nvptx `mkoffload` tool cannot handle `-fopenacc` and `-fopenmp` together

**Manual build with GCC 16 OpenACC**:
```bash
cmake -S . -B build -G Ninja \
-DCMAKE_Fortran_COMPILER=/temp/AG-plasma/opt/gcc16/bin/gfortran \
-DCMAKE_C_COMPILER=/temp/AG-plasma/opt/gcc16/bin/gcc \
-DCMAKE_CXX_COMPILER=/temp/AG-plasma/opt/gcc16/bin/g++ \
-DCMAKE_Fortran_FLAGS="-fopenacc -foffload=nvptx-none -O2 -DSIMPLE_OPENACC" \
-DCMAKE_BUILD_TYPE=Release \
-DENABLE_OPENMP=OFF

cmake --build build -j
```

**Running with OpenACC**:
```bash
LD_LIBRARY_PATH=/temp/AG-plasma/opt/gcc16/lib64:$LD_LIBRARY_PATH ./build/simple.x
```

**OpenACC implementation status**:
- Module variables with `!$acc declare create(...)` in params.f90 and get_canonical_coordinates.F90
- Preprocessor macro `SIMPLE_OPENACC` for conditional compilation
- Batch spline evaluation routines in libneo have `!$acc routine seq` directives
- GPU particle loop stub in simple_main.f90 (needs full integration implementation)

### GVEC Integration
- Minimal GVEC library automatically built from `thirdparty/gvec/`
- Provides B-spline and cubic spline functionality for magnetic field interpolation
Expand Down
31 changes: 25 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,26 @@ option(ENABLE_GVEC "Enable GVEC field support (experimental)" OFF)
option(ENABLE_COVERAGE "Enable code coverage analysis (Debug/Profile builds only)" OFF)
option(SIMPLE_DETERMINISTIC_FP "Disable fast-math for reproducible floating-point" OFF)
option(SIMPLE_ENABLE_PYTHON_TOOLS "Enable Python helpers (tests/data generation)" ON)
option(SIMPLE_ENABLE_OPENACC "Enable OpenACC offload (NVHPC only)" OFF)
option(SIMPLE_ENABLE_OPENACC "Enable OpenACC offload (NVHPC or GCC with nvptx)" OFF)
set(SIMPLE_OPENACC_OFFLOAD_TARGET "none" CACHE STRING "OpenACC offload target for GCC (none|nvptx)")
set_property(CACHE SIMPLE_OPENACC_OFFLOAD_TARGET PROPERTY STRINGS none nvptx)

# GCC OpenACC support (must be before NVHPC block to set flags early)
#
# Active only when the Fortran compiler is GNU and SIMPLE_ENABLE_OPENACC is ON.
# The block
#   * adds -fopenacc to every Fortran compile line and to every link line,
#   * forwards the OpenACC settings to libneo through the ENABLE_OPENACC and
#     OPENACC_OFFLOAD_TARGET cache entries, and
#   * adds -foffload=nvptx-none when SIMPLE_OPENACC_OFFLOAD_TARGET is "nvptx".
if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" AND SIMPLE_ENABLE_OPENACC)
  # Fail early on an unknown target: without this check a typo such as
  # "nvtpx" would silently take the host-fallback branch below.
  if(NOT SIMPLE_OPENACC_OFFLOAD_TARGET MATCHES "^(none|nvptx)$")
    message(FATAL_ERROR
      "SIMPLE_OPENACC_OFFLOAD_TARGET must be 'none' or 'nvptx', "
      "got '${SIMPLE_OPENACC_OFFLOAD_TARGET}'")
  endif()

  message(STATUS "OpenACC enabled for GCC compiler")
  add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-fopenacc>)
  add_link_options(-fopenacc)

  # Pass OpenACC settings to libneo (must be set before find_or_fetch).
  # FORCE is used so these entries always mirror the SIMPLE_* options, even
  # when a previous configure left a different value in the cache.
  set(ENABLE_OPENACC ON CACHE BOOL "Enable OpenACC in libneo" FORCE)
  set(OPENACC_OFFLOAD_TARGET "${SIMPLE_OPENACC_OFFLOAD_TARGET}" CACHE STRING "OpenACC offload target for libneo" FORCE)

  if(SIMPLE_OPENACC_OFFLOAD_TARGET STREQUAL "nvptx")
    message(STATUS "OpenACC offload target: nvptx-none (GPU)")
    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-foffload=nvptx-none>)
    add_link_options(-foffload=nvptx-none)
  else()
    # NOTE(review): no -foffload flag is passed in this branch, so GCC uses
    # whatever offload targets it was configured with. If a strict host-only
    # build is intended, -foffload=disable may be needed - confirm.
    message(STATUS "OpenACC offload target: none (host fallback)")
  endif()
endif()

if(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
# NVHPC/nvfortran: disable all Python integration/tools by default.
Expand Down Expand Up @@ -232,21 +251,21 @@ endif()
if (ENABLE_COVERAGE)
if (CMAKE_BUILD_TYPE MATCHES "Debug|Profile")
message(STATUS "Coverage analysis enabled for ${CMAKE_BUILD_TYPE} build")

# Create interface library for coverage flags to scope them to specific targets
add_library(coverage_flags INTERFACE)
target_compile_options(coverage_flags INTERFACE --coverage -fprofile-arcs -ftest-coverage)
target_link_options(coverage_flags INTERFACE --coverage -lgcov)

# Add custom target for coverage data generation
find_program(LCOV_PATH lcov)
if(LCOV_PATH)
add_custom_target(coverage
COMMAND ${CMAKE_COMMAND} -E echo "Generating coverage data..."
COMMAND ${LCOV_PATH} --capture --directory ${CMAKE_BINARY_DIR} --output-file coverage.info
--rc branch_coverage=1
--ignore-errors inconsistent
--ignore-errors mismatch
--rc branch_coverage=1
--ignore-errors inconsistent
--ignore-errors mismatch
--ignore-errors unused
COMMAND ${LCOV_PATH} --remove coverage.info
"/home/ert/code/libneo/*"
Expand Down
27 changes: 27 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -163,3 +163,30 @@ nvfortran-acc-test: nvfortran-acc

nvfortran-acc-clean:
rm -rf $(NVHPC_ACC_BUILD_DIR)

# GCC OpenACC build targets (requires GCC 16+ with nvptx offload support)
#
# GCC16_ROOT        - installation prefix of the offload-enabled GCC toolchain
# GCC_ACC_BUILD_DIR - CMake build tree used by the gcc-acc* targets
#
# Both use ?= so they can be overridden from the environment as well as from
# the make command line; the defaults are the site-specific values.
GCC16_ROOT ?= /temp/AG-plasma/opt/gcc16
GCC_ACC_BUILD_DIR ?= build_gcc_acc

.PHONY: gcc-acc gcc-acc-test gcc-acc-configure gcc-acc-clean

# Configure the GCC OpenACC build tree with nvptx offload enabled.
# - All three compilers come from GCC16_ROOT so the C/C++/Fortran runtimes match.
# - OpenMP is switched off because the project notes state that nvptx mkoffload
#   cannot handle -fopenacc together with -fopenmp.
# - $(FLAGS) is passed last so the caller can override any of the defaults.
gcc-acc-configure:
	cmake -S . -B$(GCC_ACC_BUILD_DIR) -GNinja \
		-DCMAKE_BUILD_TYPE=Release \
		-DCMAKE_Fortran_COMPILER=$(GCC16_ROOT)/bin/gfortran \
		-DCMAKE_C_COMPILER=$(GCC16_ROOT)/bin/gcc \
		-DCMAKE_CXX_COMPILER=$(GCC16_ROOT)/bin/g++ \
		-DSIMPLE_ENABLE_OPENACC=ON \
		-DSIMPLE_OPENACC_OFFLOAD_TARGET=nvptx \
		-DENABLE_OPENMP=OFF \
		-DENABLE_PYTHON_INTERFACE=OFF \
		-DCMAKE_COLOR_DIAGNOSTICS=ON \
		$(FLAGS)

# Environment prefix that prepends the GCC 16 lib64 directory to the loader
# search path. Recursively expanded so $$ reaches the shell as a single $.
GCC_ACC_RUN_ENV = LD_LIBRARY_PATH=$(GCC16_ROOT)/lib64:$$LD_LIBRARY_PATH

# Build the tree produced by gcc-acc-configure.
gcc-acc: gcc-acc-configure
	$(GCC_ACC_RUN_ENV) cmake --build $(GCC_ACC_BUILD_DIR) --config $(CONFIG)

# Run the test suite from inside the build tree, excluding tests labelled
# "python" or "regression". VERBOSE=1 adds -V; TEST=<regex> adds -R <regex>.
gcc-acc-test: gcc-acc
	cd $(GCC_ACC_BUILD_DIR) && $(GCC_ACC_RUN_ENV) \
		ctest --test-dir test --output-on-failure $(if $(filter 1,$(VERBOSE)),-V) $(if $(TEST),-R $(TEST)) -LE "python|regression"

# Delete the GCC OpenACC build tree.
gcc-acc-clean:
	rm -rf $(GCC_ACC_BUILD_DIR)
7 changes: 7 additions & 0 deletions examples/simple_test_gpu.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
&config
trace_time = 1d-2
sbeg = 0.3d0
ntestpart = 1024
isw_field_type = -1
ntimstep = 100
/
1 change: 1 addition & 0 deletions src/field/field_can_base.f90
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ module field_can_base
integer(8) :: n_field_evaluations = 0

!$omp threadprivate(n_field_evaluations)
!$acc declare create(n_field_evaluations)

type :: field_can_t
real(dp) :: Ath, Aph
Expand Down
3 changes: 3 additions & 0 deletions src/field/field_can_flux.f90
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,14 @@ module field_can_flux
contains

!> Evaluate the canonical field at one point and count the evaluation.
!>
!> Delegates the actual work to eval_field_can with the same arguments, then
!> increments the module-level counter n_field_evaluations by one.
!>
!> f            : field_can_t, updated in place (intent inout)
!> r, th_c, ph_c: evaluation point, forwarded unchanged
!> mode_secders : second-derivative mode, forwarded unchanged
subroutine evaluate_flux(f, r, th_c, ph_c, mode_secders)
! Declares this as a sequential OpenACC routine.
!$acc routine seq
type(field_can_t), intent(inout) :: f
real(dp), intent(in) :: r, th_c, ph_c
integer, intent(in) :: mode_secders

call eval_field_can(f, r, th_c, ph_c, mode_secders)

! The atomic directive applies to the single statement that follows; keep the
! increment in the form "x = x + expr".
!$acc atomic
n_field_evaluations = n_field_evaluations + 1
end subroutine evaluate_flux

Expand Down Expand Up @@ -52,6 +54,7 @@ end subroutine ref_to_integ_flux
!> mode_secders = 1: second derivatives only in d/dr^2
!> mode_secders = 2: all second derivatives, including mixed
subroutine eval_field_can(f, r, th_c, ph_c, mode_secders)
!$acc routine seq
use get_can_sub, only: splint_can_coord

type(field_can_t), intent(inout) :: f
Expand Down
3 changes: 3 additions & 0 deletions src/field_can.f90
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ end subroutine field_can_init
!ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
!
subroutine get_val(f, pphi)
!$acc routine seq
!
! computes values of H, pth and vpar at z=(r, th, ph, pphi)
!
Expand All @@ -221,6 +222,7 @@ end subroutine get_val
!ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
!
subroutine get_derivatives(f, pphi)
!$acc routine seq
!
! computes H, pth and vpar at z=(r, th, ph, pphi) and their derivatives
!
Expand All @@ -245,6 +247,7 @@ end subroutine get_derivatives
!ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
!
subroutine get_derivatives2(f, pphi)
!$acc routine seq
!
! computes H, pth and vpar at z=(r, th, ph, pphi) up to 2nd derivatives
! order of second derivatives:
Expand Down
57 changes: 37 additions & 20 deletions src/get_canonical_coordinates.F90
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,13 @@ module get_can_sub
type(BatchSplineData3D), save :: sqg_Bt_Bp_batch_spline
logical, save :: sqg_Bt_Bp_batch_spline_ready = .false.

#ifdef SIMPLE_OPENACC
! OpenACC declare for batch spline module variables used in !$acc routine seq
!$acc declare create(aphi_batch_spline, aphi_batch_spline_ready)
!$acc declare create(G_batch_spline, G_batch_spline_ready)
!$acc declare create(sqg_Bt_Bp_batch_spline, sqg_Bt_Bp_batch_spline_ready)
#endif

contains


Expand Down Expand Up @@ -262,11 +269,23 @@ subroutine get_canonical_coordinates_impl
call build_canflux_G_batch_spline
call build_canflux_sqg_Bt_Bp_batch_spline

! Update device copies of module variables used in splint_can_coord
call update_canflux_device_data

deallocate(ipoi_t, ipoi_p, sqg_c, B_vartheta_c, B_varphi_c, G_c)

end subroutine get_canonical_coordinates_impl


!> Make the device copies of nper and torflux match the current host values.
!>
!> Both are module variables (new_vmec_stuff_mod, vector_potentail_mod) that
!> are set at runtime during VMEC loading and are read by splint_can_coord.
!> Without OpenACC the directives below are plain comments and the routine
!> does nothing.
subroutine update_canflux_device_data
    use new_vmec_stuff_mod, only: nper
    use vector_potentail_mod, only: torflux

    ! Copy module variables to GPU device (set at runtime during VMEC loading).
    ! enter data copyin creates the device copies on first use, but it does not
    ! transfer anything when the variables are already present on the device
    ! (earlier call, or a declare directive elsewhere).
    !$acc enter data copyin(nper, torflux)
    ! Explicit refresh, so repeated initialisation never leaves stale values.
    !$acc update device(nper, torflux)
end subroutine update_canflux_device_data


subroutine build_canflux_aphi_batch_spline
use vector_potentail_mod, only: ns, hs, sA_phi
use new_vmec_stuff_mod, only: ns_A
Expand Down Expand Up @@ -294,6 +313,10 @@ subroutine build_canflux_aphi_batch_spline
aphi_batch_spline%coeff(1, 0:order, :) = sA_phi(1:order + 1, :)

aphi_batch_spline_ready = .true.

! Copy spline data to GPU device
!$acc enter data copyin(aphi_batch_spline, aphi_batch_spline_ready)
!$acc enter data copyin(aphi_batch_spline%coeff)
end subroutine build_canflux_aphi_batch_spline


Expand Down Expand Up @@ -331,6 +354,10 @@ subroutine build_canflux_G_batch_spline
G_batch_spline)
G_batch_spline_ready = .true.
deallocate(y_batch)

! Copy spline data to GPU device
!$acc enter data copyin(G_batch_spline, G_batch_spline_ready)
!$acc enter data copyin(G_batch_spline%coeff)
end subroutine build_canflux_G_batch_spline


Expand Down Expand Up @@ -371,6 +398,10 @@ subroutine build_canflux_sqg_Bt_Bp_batch_spline
sqg_Bt_Bp_batch_spline)
sqg_Bt_Bp_batch_spline_ready = .true.
deallocate(y_batch)

! Copy spline data to GPU device
!$acc enter data copyin(sqg_Bt_Bp_batch_spline, sqg_Bt_Bp_batch_spline_ready)
!$acc enter data copyin(sqg_Bt_Bp_batch_spline%coeff)
end subroutine build_canflux_sqg_Bt_Bp_batch_spline


Expand Down Expand Up @@ -477,7 +508,7 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, &
d2bth_tt, d2bth_tp, d2bth_pp, &
d2bph_rr, d2bph_rt, d2bph_rp, &
d2bph_tt, d2bph_tp, d2bph_pp, G_c)

!$acc routine seq
use vector_potentail_mod, only: torflux
use new_vmec_stuff_mod, only: nper
use chamb_mod, only: rnegflag
Expand Down Expand Up @@ -510,13 +541,13 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, &
real(dp) :: x_eval(3)
real(dp) :: y_eval(3), dy_eval(3, 3), d2y_eval(6, 3)
real(dp) :: y_G(1), dy_G(3, 1)
real(dp) :: y1d(1), dy1d(1), d2y1d(1)
real(dp) :: y1d(1), dy1d(1), d2y1d(1), d3y1d(1)
real(dp) :: theta_wrapped, phi_wrapped
real(dp) :: qua, dqua_dr, dqua_dt, dqua_dp
real(dp) :: d2qua_dr2, d2qua_drdt, d2qua_drdp
real(dp) :: d2qua_dt2, d2qua_dtdp, d2qua_dp2

!$omp atomic
!$acc atomic
icounter = icounter + 1
r_eval = r
if (r_eval <= 0.0_dp) then
Expand All @@ -528,18 +559,11 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, &
dA_theta_dr = torflux

! Interpolate A_phi using batch spline (1D)
if (.not. aphi_batch_spline_ready) then
error stop "splint_can_coord: Aphi batch spline not initialized"
end if

if (mode_secders > 0) then
! Need third derivative - use der3 which computes all derivatives in one pass
block
real(dp) :: d3y1d(1)
call evaluate_batch_splines_1d_der3(aphi_batch_spline, r_eval, &
y1d, dy1d, d2y1d, d3y1d)
d3A_phi_dr3 = d3y1d(1)
end block
call evaluate_batch_splines_1d_der3(aphi_batch_spline, r_eval, &
y1d, dy1d, d2y1d, d3y1d)
d3A_phi_dr3 = d3y1d(1)
else
call evaluate_batch_splines_1d_der2(aphi_batch_spline, r_eval, &
y1d, dy1d, d2y1d)
Expand All @@ -566,20 +590,13 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, &

! Interpolate G if needed
if (fullset) then
if (.not. G_batch_spline_ready) then
error stop "splint_can_coord: G batch spline not initialized"
end if
call evaluate_batch_splines_3d_der(G_batch_spline, x_eval, y_G, dy_G)
G_c = y_G(1)
else
G_c = 0.0_dp
end if

! Interpolate sqg, B_vartheta, B_varphi (3 quantities)
if (.not. sqg_Bt_Bp_batch_spline_ready) then
error stop "splint_can_coord: sqg_Bt_Bp batch spline not initialized"
end if

if (mode_secders == 2) then
call evaluate_batch_splines_3d_der2(sqg_Bt_Bp_batch_spline, x_eval, &
y_eval, dy_eval, d2y_eval)
Expand Down
Loading