itpplasma · krystophny · Dec 25, 2025 · Dec 25, 2025 · Dec 26, 2025 · Dec 26, 2025
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -39,6 +39,38 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co
 - Compiler: GNU Fortran (gfortran)
 - Optional: OpenMP (enabled by default)
 
+### OpenACC GPU Builds
+SIMPLE supports GPU acceleration via OpenACC using GCC 16+ with nvptx offload.
+
+**GCC 16 with nvptx offload** (experimental):
+- Location: `/temp/AG-plasma/opt/gcc16`
+- RTX 4090 GPU available for testing
+- CRITICAL: OpenMP must be disabled (`-DENABLE_OPENMP=OFF`) because the nvptx `mkoffload` tool cannot handle `-fopenacc` and `-fopenmp` together
+
+**Manual build with GCC 16 OpenACC**:
+```bash
+cmake -S . -B build -G Ninja \
+  -DCMAKE_Fortran_COMPILER=/temp/AG-plasma/opt/gcc16/bin/gfortran \
+  -DCMAKE_C_COMPILER=/temp/AG-plasma/opt/gcc16/bin/gcc \
+  -DCMAKE_CXX_COMPILER=/temp/AG-plasma/opt/gcc16/bin/g++ \
+  -DCMAKE_Fortran_FLAGS="-fopenacc -foffload=nvptx-none -O2 -DSIMPLE_OPENACC" \
+  -DCMAKE_BUILD_TYPE=Release \
+  -DENABLE_OPENMP=OFF
+
+cmake --build build -j
+```
+
+**Running with OpenACC**:
+```bash
+LD_LIBRARY_PATH=/temp/AG-plasma/opt/gcc16/lib64:$LD_LIBRARY_PATH ./build/simple.x
+```
+
+**OpenACC implementation status**:
+- Module variables with `!$acc declare create(...)` in params.f90 and get_canonical_coordinates.F90
+- Preprocessor macro `SIMPLE_OPENACC` for conditional compilation
+- Batch spline evaluation routines in libneo have `!$acc routine seq` directives
+- GPU particle loop stub in simple_main.f90 (needs full integration implementation)
+
 ### GVEC Integration
 - Minimal GVEC library automatically built from `thirdparty/gvec/`
 - Provides B-spline and cubic spline functionality for magnetic field interpolation

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -54,7 +54,37 @@ option(ENABLE_GVEC "Enable GVEC field support (experimental)" OFF)
 option(ENABLE_COVERAGE "Enable code coverage analysis (Debug/Profile builds only)" OFF)
 option(SIMPLE_DETERMINISTIC_FP "Disable fast-math for reproducible floating-point" OFF)
 option(SIMPLE_ENABLE_PYTHON_TOOLS "Enable Python helpers (tests/data generation)" ON)
-option(SIMPLE_ENABLE_OPENACC "Enable OpenACC offload (NVHPC only)" OFF)
+option(SIMPLE_ENABLE_OPENACC "Enable OpenACC offload (NVHPC or GCC with nvptx)" OFF)
+set(SIMPLE_OPENACC_OFFLOAD_TARGET "none" CACHE STRING "OpenACC offload target for GCC (none|nvptx)")
+set_property(CACHE SIMPLE_OPENACC_OFFLOAD_TARGET PROPERTY STRINGS none nvptx)
+
+# GCC OpenACC support (must be before NVHPC block to set flags early).
+# Adds -fopenacc for Fortran compilation and for linking, forwards the settings
+# to libneo through its cache variables, and selects the offload target.
+if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" AND SIMPLE_ENABLE_OPENACC)
+    # Reject typos such as "NVPTX" instead of silently building host-only code.
+    if(NOT SIMPLE_OPENACC_OFFLOAD_TARGET MATCHES "^(none|nvptx)$")
+        message(FATAL_ERROR
+            "SIMPLE_OPENACC_OFFLOAD_TARGET must be 'none' or 'nvptx', got '${SIMPLE_OPENACC_OFFLOAD_TARGET}'")
+    endif()
+    message(STATUS "OpenACC enabled for GCC compiler")
+    add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-fopenacc>)
+    add_link_options(-fopenacc)
+    # Pass OpenACC settings to libneo (must be set before find_or_fetch)
+    set(ENABLE_OPENACC ON CACHE BOOL "Enable OpenACC in libneo" FORCE)
+    set(OPENACC_OFFLOAD_TARGET "${SIMPLE_OPENACC_OFFLOAD_TARGET}" CACHE STRING "OpenACC offload target for libneo" FORCE)
+    if(SIMPLE_OPENACC_OFFLOAD_TARGET STREQUAL "nvptx")
+        message(STATUS "OpenACC offload target: nvptx-none (GPU)")
+        add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-foffload=nvptx-none>)
+        add_link_options(-foffload=nvptx-none)
+    else()
+        message(STATUS "OpenACC offload target: none (host fallback)")
+        # -fopenacc alone generates code for every offload target the compiler
+        # was configured with; -foffload=disable restricts it to host fallback.
+        add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-foffload=disable>)
+        add_link_options(-foffload=disable)
+    endif()
+endif()
 
 if(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC")
     # NVHPC/nvfortran: disable all Python integration/tools by default.
@@ -232,21 +251,21 @@ endif()
 if (ENABLE_COVERAGE)
     if (CMAKE_BUILD_TYPE MATCHES "Debug|Profile")
         message(STATUS "Coverage analysis enabled for ${CMAKE_BUILD_TYPE} build")
-        
+
         # Create interface library for coverage flags to scope them to specific targets
         add_library(coverage_flags INTERFACE)
         target_compile_options(coverage_flags INTERFACE --coverage -fprofile-arcs -ftest-coverage)
         target_link_options(coverage_flags INTERFACE --coverage -lgcov)
-        
+
         # Add custom target for coverage data generation
         find_program(LCOV_PATH lcov)
         if(LCOV_PATH)
             add_custom_target(coverage
                 COMMAND ${CMAKE_COMMAND} -E echo "Generating coverage data..."
                 COMMAND ${LCOV_PATH} --capture --directory ${CMAKE_BINARY_DIR} --output-file coverage.info
-                    --rc branch_coverage=1 
-                    --ignore-errors inconsistent 
-                    --ignore-errors mismatch 
+                    --rc branch_coverage=1
+                    --ignore-errors inconsistent
+                    --ignore-errors mismatch
                     --ignore-errors unused
                 COMMAND ${LCOV_PATH} --remove coverage.info
                     "/home/ert/code/libneo/*"

diff --git a/Makefile b/Makefile
@@ -163,3 +163,39 @@ nvfortran-acc-test: nvfortran-acc
 
 nvfortran-acc-clean:
 	rm -rf $(NVHPC_ACC_BUILD_DIR)
+
+# GCC OpenACC build targets (requires GCC 16+ with nvptx offload support).
+# GCC16_ROOT can be overridden from the environment or the make command line.
+GCC16_ROOT ?= /temp/AG-plasma/opt/gcc16
+GCC_ACC_BUILD_DIR := build_gcc_acc
+
+.PHONY: gcc-acc gcc-acc-test gcc-acc-configure gcc-acc-clean
+
+# Configure the nvptx offload build with the GCC 16 toolchain.
+# OpenMP is disabled because nvptx mkoffload cannot handle -fopenacc and
+# -fopenmp together; $(FLAGS) comes last so callers can override any setting.
+gcc-acc-configure:
+	cmake -S . -B$(GCC_ACC_BUILD_DIR) -GNinja \
+		-DCMAKE_BUILD_TYPE=Release \
+		-DCMAKE_Fortran_COMPILER=$(GCC16_ROOT)/bin/gfortran \
+		-DCMAKE_C_COMPILER=$(GCC16_ROOT)/bin/gcc \
+		-DCMAKE_CXX_COMPILER=$(GCC16_ROOT)/bin/g++ \
+		-DSIMPLE_ENABLE_OPENACC=ON \
+		-DSIMPLE_OPENACC_OFFLOAD_TARGET=nvptx \
+		-DENABLE_OPENMP=OFF \
+		-DENABLE_PYTHON_INTERFACE=OFF \
+		-DCMAKE_COLOR_DIAGNOSTICS=ON \
+		$(FLAGS)
+
+# Build the configured tree with the GCC 16 libraries on LD_LIBRARY_PATH.
+gcc-acc: gcc-acc-configure
+	LD_LIBRARY_PATH=$(GCC16_ROOT)/lib64:$$LD_LIBRARY_PATH cmake --build $(GCC_ACC_BUILD_DIR) --config $(CONFIG)
+
+# Run CTest in the build tree, excluding tests labelled python or regression.
+gcc-acc-test: gcc-acc
+	cd $(GCC_ACC_BUILD_DIR) && LD_LIBRARY_PATH=$(GCC16_ROOT)/lib64:$$LD_LIBRARY_PATH \
+		ctest --test-dir test --output-on-failure $(if $(filter 1,$(VERBOSE)),-V) $(if $(TEST),-R $(TEST)) -LE "python|regression"
+
+# Remove the GCC OpenACC build directory.
+gcc-acc-clean:
+	rm -rf $(GCC_ACC_BUILD_DIR)
diff --git a/examples/simple_test_gpu.in b/examples/simple_test_gpu.in
@@ -0,0 +1,7 @@
+&config
+trace_time = 1d-2
+sbeg = 0.3d0
+ntestpart = 1024
+isw_field_type = -1
+ntimstep = 100
+/
diff --git a/src/field/field_can_base.f90 b/src/field/field_can_base.f90
@@ -8,6 +8,7 @@ module field_can_base
 integer(8) :: n_field_evaluations = 0
 
 !$omp threadprivate(n_field_evaluations)
+!$acc declare create(n_field_evaluations)
 
 type :: field_can_t
     real(dp) :: Ath, Aph

diff --git a/src/field/field_can_flux.f90 b/src/field/field_can_flux.f90
@@ -12,12 +12,14 @@ module field_can_flux
 contains
 
 subroutine evaluate_flux(f, r, th_c, ph_c, mode_secders)
+    !$acc routine seq
     type(field_can_t), intent(inout) :: f
     real(dp), intent(in) :: r, th_c, ph_c
     integer, intent(in) :: mode_secders
 
     call eval_field_can(f, r, th_c, ph_c, mode_secders)
 
+    !$acc atomic
     n_field_evaluations = n_field_evaluations + 1
 end subroutine evaluate_flux
 
@@ -52,6 +54,7 @@ end subroutine ref_to_integ_flux
 !> mode_secders = 1: second derivatives only in d/dr^2
 !> mode_secders = 2: all second derivatives, including mixed
 subroutine eval_field_can(f, r, th_c, ph_c, mode_secders)
+    !$acc routine seq
     use get_can_sub, only: splint_can_coord
 
     type(field_can_t), intent(inout) :: f

diff --git a/src/field_can.f90 b/src/field_can.f90
@@ -204,6 +204,7 @@ end subroutine field_can_init
   !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
   !
 subroutine get_val(f, pphi)
+  !$acc routine seq
   !
   ! computes values of H, pth and vpar at z=(r, th, ph, pphi)
   !
@@ -221,6 +222,7 @@ end subroutine get_val
   !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
   !
 subroutine get_derivatives(f, pphi)
+  !$acc routine seq
   !
   ! computes H, pth and vpar at z=(r, th, ph, pphi) and their derivatives
   !
@@ -245,6 +247,7 @@ end subroutine get_derivatives
   !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc
   !
 subroutine get_derivatives2(f, pphi)
+  !$acc routine seq
   !
   ! computes H, pth and vpar at z=(r, th, ph, pphi) up to 2nd derivatives
   ! order of second derivatives:

diff --git a/src/get_canonical_coordinates.F90 b/src/get_canonical_coordinates.F90
@@ -57,6 +57,13 @@ module get_can_sub
     type(BatchSplineData3D), save :: sqg_Bt_Bp_batch_spline
     logical, save :: sqg_Bt_Bp_batch_spline_ready = .false.
 
+#ifdef SIMPLE_OPENACC
+    ! OpenACC declare for batch spline module variables used in !$acc routine seq
+    !$acc declare create(aphi_batch_spline, aphi_batch_spline_ready)
+    !$acc declare create(G_batch_spline, G_batch_spline_ready)
+    !$acc declare create(sqg_Bt_Bp_batch_spline, sqg_Bt_Bp_batch_spline_ready)
+#endif
+
 contains
 
 
@@ -262,11 +269,31 @@ subroutine get_canonical_coordinates_impl
     call build_canflux_G_batch_spline
     call build_canflux_sqg_Bt_Bp_batch_spline
 
+    ! Update device copies of module variables used in splint_can_coord
+    call update_canflux_device_data
+
     deallocate(ipoi_t, ipoi_p, sqg_c, B_vartheta_c, B_varphi_c, G_c)
 
 end subroutine get_canonical_coordinates_impl
 
 
+!> Make the current host values of nper and torflux available on the OpenACC device.
+!>
+!> `enter data copyin` transfers values only for data that is not yet present.
+!> If these variables are already device-resident (e.g. through a
+!> `declare create` in their modules, which is not visible from this file),
+!> it merely bumps the reference count, so an explicit `update device` follows
+!> to guarantee the device sees the runtime values.
+subroutine update_canflux_device_data
+    use new_vmec_stuff_mod, only: nper
+    use vector_potentail_mod, only: torflux
+
+    ! Copy module variables to GPU device (set at runtime during VMEC loading)
+    !$acc enter data copyin(nper, torflux)
+    !$acc update device(nper, torflux)
+end subroutine update_canflux_device_data
+
+
 subroutine build_canflux_aphi_batch_spline
     use vector_potentail_mod, only: ns, hs, sA_phi
     use new_vmec_stuff_mod, only: ns_A
@@ -294,6 +313,15 @@ subroutine build_canflux_aphi_batch_spline
     aphi_batch_spline%coeff(1, 0:order, :) = sA_phi(1:order + 1, :)
 
     aphi_batch_spline_ready = .true.
+
+    ! Copy spline data to GPU device.
+    ! With SIMPLE_OPENACC both variables are `declare create`d and therefore
+    ! already present, so `copyin` alone transfers no values; `update device`
+    ! pushes the host contents. The update is issued before %coeff is copied
+    ! in so that the struct copy cannot undo the attachment of that array.
+    !$acc enter data copyin(aphi_batch_spline, aphi_batch_spline_ready)
+    !$acc update device(aphi_batch_spline, aphi_batch_spline_ready)
+    !$acc enter data copyin(aphi_batch_spline%coeff)
 end subroutine build_canflux_aphi_batch_spline
 
 
@@ -331,6 +354,15 @@ subroutine build_canflux_G_batch_spline
                                     G_batch_spline)
     G_batch_spline_ready = .true.
     deallocate(y_batch)
+
+    ! Copy spline data to GPU device.
+    ! With SIMPLE_OPENACC both variables are `declare create`d and therefore
+    ! already present, so `copyin` alone transfers no values; `update device`
+    ! pushes the host contents. The update is issued before %coeff is copied
+    ! in so that the struct copy cannot undo the attachment of that array.
+    !$acc enter data copyin(G_batch_spline, G_batch_spline_ready)
+    !$acc update device(G_batch_spline, G_batch_spline_ready)
+    !$acc enter data copyin(G_batch_spline%coeff)
 end subroutine build_canflux_G_batch_spline
 
 
@@ -371,6 +398,15 @@ subroutine build_canflux_sqg_Bt_Bp_batch_spline
                                     sqg_Bt_Bp_batch_spline)
     sqg_Bt_Bp_batch_spline_ready = .true.
     deallocate(y_batch)
+
+    ! Copy spline data to GPU device.
+    ! With SIMPLE_OPENACC both variables are `declare create`d and therefore
+    ! already present, so `copyin` alone transfers no values; `update device`
+    ! pushes the host contents. The update is issued before %coeff is copied
+    ! in so that the struct copy cannot undo the attachment of that array.
+    !$acc enter data copyin(sqg_Bt_Bp_batch_spline, sqg_Bt_Bp_batch_spline_ready)
+    !$acc update device(sqg_Bt_Bp_batch_spline, sqg_Bt_Bp_batch_spline_ready)
+    !$acc enter data copyin(sqg_Bt_Bp_batch_spline%coeff)
 end subroutine build_canflux_sqg_Bt_Bp_batch_spline
 
 
@@ -477,7 +508,7 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, &
                             d2bth_tt, d2bth_tp, d2bth_pp, &
                             d2bph_rr, d2bph_rt, d2bph_rp, &
                             d2bph_tt, d2bph_tp, d2bph_pp, G_c)
-
+    !$acc routine seq
     use vector_potentail_mod, only: torflux
     use new_vmec_stuff_mod, only: nper
     use chamb_mod, only: rnegflag
@@ -510,13 +541,19 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, &
     real(dp) :: x_eval(3)
     real(dp) :: y_eval(3), dy_eval(3, 3), d2y_eval(6, 3)
     real(dp) :: y_G(1), dy_G(3, 1)
-    real(dp) :: y1d(1), dy1d(1), d2y1d(1)
+    real(dp) :: y1d(1), dy1d(1), d2y1d(1), d3y1d(1)
     real(dp) :: theta_wrapped, phi_wrapped
     real(dp) :: qua, dqua_dr, dqua_dt, dqua_dp
     real(dp) :: d2qua_dr2, d2qua_drdt, d2qua_drdp
     real(dp) :: d2qua_dt2, d2qua_dtdp, d2qua_dp2
 
+    ! Keep the counter increment atomic in every build: OpenACC builds use the
+    ! acc directive, all other builds keep the pre-existing OpenMP directive.
+#ifdef _OPENACC
+!$acc atomic
+#else
 !$omp atomic
+#endif
     icounter = icounter + 1
     r_eval = r
     if (r_eval <= 0.0_dp) then
@@ -528,18 +559,11 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, &
     dA_theta_dr = torflux
 
     ! Interpolate A_phi using batch spline (1D)
-    if (.not. aphi_batch_spline_ready) then
-        error stop "splint_can_coord: Aphi batch spline not initialized"
-    end if
-
     if (mode_secders > 0) then
         ! Need third derivative - use der3 which computes all derivatives in one pass
-        block
-            real(dp) :: d3y1d(1)
-            call evaluate_batch_splines_1d_der3(aphi_batch_spline, r_eval, &
-                                                y1d, dy1d, d2y1d, d3y1d)
-            d3A_phi_dr3 = d3y1d(1)
-        end block
+        call evaluate_batch_splines_1d_der3(aphi_batch_spline, r_eval, &
+                                            y1d, dy1d, d2y1d, d3y1d)
+        d3A_phi_dr3 = d3y1d(1)
     else
         call evaluate_batch_splines_1d_der2(aphi_batch_spline, r_eval, &
                                             y1d, dy1d, d2y1d)
@@ -566,20 +590,13 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, &
 
     ! Interpolate G if needed
     if (fullset) then
-        if (.not. G_batch_spline_ready) then
-            error stop "splint_can_coord: G batch spline not initialized"
-        end if
         call evaluate_batch_splines_3d_der(G_batch_spline, x_eval, y_G, dy_G)
         G_c = y_G(1)
     else
         G_c = 0.0_dp
     end if
 
     ! Interpolate sqg, B_vartheta, B_varphi (3 quantities)
-    if (.not. sqg_Bt_Bp_batch_spline_ready) then
-        error stop "splint_can_coord: sqg_Bt_Bp batch spline not initialized"
-    end if
-
     if (mode_secders == 2) then
         call evaluate_batch_splines_3d_der2(sqg_Bt_Bp_batch_spline, x_eval, &
                                             y_eval, dy_eval, d2y_eval)