diff --git a/CLAUDE.md b/CLAUDE.md index ff2f4232..17e07cfe 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -39,6 +39,38 @@ This file provides guidance to Claude Code (claude.ai/code) when working with co - Compiler: GNU Fortran (gfortran) - Optional: OpenMP (enabled by default) +### OpenACC GPU Builds +SIMPLE supports GPU acceleration via OpenACC using GCC 16+ with nvptx offload. + +**GCC 16 with nvptx offload** (experimental): +- Location: `/temp/AG-plasma/opt/gcc16` +- RTX 4090 GPU available for testing +- CRITICAL: OpenMP must be disabled - nvptx mkoffload cannot handle both -fopenacc AND -fopenmp + +**Manual build with GCC 16 OpenACC**: +```bash +cmake -S . -B build -G Ninja \ + -DCMAKE_Fortran_COMPILER=/temp/AG-plasma/opt/gcc16/bin/gfortran \ + -DCMAKE_C_COMPILER=/temp/AG-plasma/opt/gcc16/bin/gcc \ + -DCMAKE_CXX_COMPILER=/temp/AG-plasma/opt/gcc16/bin/g++ \ + -DCMAKE_Fortran_FLAGS="-fopenacc -foffload=nvptx-none -O2 -DSIMPLE_OPENACC" \ + -DCMAKE_BUILD_TYPE=Release \ + -DENABLE_OPENMP=OFF + +cmake --build build -j +``` + +**Running with OpenACC**: +```bash +LD_LIBRARY_PATH=/temp/AG-plasma/opt/gcc16/lib64:$LD_LIBRARY_PATH ./build/simple.x +``` + +**OpenACC implementation status**: +- Module variables with `!$acc declare create(...)` in params.f90 and get_canonical_coordinates.F90 +- Preprocessor macro `SIMPLE_OPENACC` for conditional compilation +- Batch spline evaluation routines in libneo have `!$acc routine seq` directives +- GPU particle loop stub in simple_main.f90 (needs full integration implementation) + ### GVEC Integration - Minimal GVEC library automatically built from `thirdparty/gvec/` - Provides B-spline and cubic spline functionality for magnetic field interpolation diff --git a/CMakeLists.txt b/CMakeLists.txt index 07d9b31f..f64a56b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,26 @@ option(ENABLE_GVEC "Enable GVEC field support (experimental)" OFF) option(ENABLE_COVERAGE "Enable code coverage analysis (Debug/Profile builds only)" OFF) 
option(SIMPLE_DETERMINISTIC_FP "Disable fast-math for reproducible floating-point" OFF) option(SIMPLE_ENABLE_PYTHON_TOOLS "Enable Python helpers (tests/data generation)" ON) -option(SIMPLE_ENABLE_OPENACC "Enable OpenACC offload (NVHPC only)" OFF) +option(SIMPLE_ENABLE_OPENACC "Enable OpenACC offload (NVHPC or GCC with nvptx)" OFF) +set(SIMPLE_OPENACC_OFFLOAD_TARGET "none" CACHE STRING "OpenACC offload target for GCC (none|nvptx)") +set_property(CACHE SIMPLE_OPENACC_OFFLOAD_TARGET PROPERTY STRINGS none nvptx) + +# GCC OpenACC support (must be before NVHPC block to set flags early) +if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU" AND SIMPLE_ENABLE_OPENACC) + message(STATUS "OpenACC enabled for GCC compiler") + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-fopenacc>) + add_link_options(-fopenacc) + # Pass OpenACC settings to libneo (must be set before find_or_fetch) + set(ENABLE_OPENACC ON CACHE BOOL "Enable OpenACC in libneo" FORCE) + set(OPENACC_OFFLOAD_TARGET "${SIMPLE_OPENACC_OFFLOAD_TARGET}" CACHE STRING "OpenACC offload target for libneo" FORCE) + if(SIMPLE_OPENACC_OFFLOAD_TARGET STREQUAL "nvptx") + message(STATUS "OpenACC offload target: nvptx-none (GPU)") + add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-foffload=nvptx-none>) + add_link_options(-foffload=nvptx-none) + else() + message(STATUS "OpenACC offload target: none (host fallback)") + endif() +endif() if(CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") # NVHPC/nvfortran: disable all Python integration/tools by default. 
@@ -232,21 +251,21 @@ endif() if (ENABLE_COVERAGE) if (CMAKE_BUILD_TYPE MATCHES "Debug|Profile") message(STATUS "Coverage analysis enabled for ${CMAKE_BUILD_TYPE} build") - + # Create interface library for coverage flags to scope them to specific targets add_library(coverage_flags INTERFACE) target_compile_options(coverage_flags INTERFACE --coverage -fprofile-arcs -ftest-coverage) target_link_options(coverage_flags INTERFACE --coverage -lgcov) - + # Add custom target for coverage data generation find_program(LCOV_PATH lcov) if(LCOV_PATH) add_custom_target(coverage COMMAND ${CMAKE_COMMAND} -E echo "Generating coverage data..." COMMAND ${LCOV_PATH} --capture --directory ${CMAKE_BINARY_DIR} --output-file coverage.info - --rc branch_coverage=1 - --ignore-errors inconsistent - --ignore-errors mismatch + --rc branch_coverage=1 + --ignore-errors inconsistent + --ignore-errors mismatch --ignore-errors unused COMMAND ${LCOV_PATH} --remove coverage.info "/home/ert/code/libneo/*" diff --git a/Makefile b/Makefile index 299b4b5e..22af1f0b 100644 --- a/Makefile +++ b/Makefile @@ -163,3 +163,30 @@ nvfortran-acc-test: nvfortran-acc nvfortran-acc-clean: rm -rf $(NVHPC_ACC_BUILD_DIR) + +# GCC OpenACC build targets (requires GCC 16+ with nvptx offload support) +GCC16_ROOT := /temp/AG-plasma/opt/gcc16 +GCC_ACC_BUILD_DIR := build_gcc_acc + +.PHONY: gcc-acc gcc-acc-test gcc-acc-configure gcc-acc-clean + +gcc-acc-configure: + cmake -S . 
-B$(GCC_ACC_BUILD_DIR) -GNinja \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_Fortran_COMPILER=$(GCC16_ROOT)/bin/gfortran \ + -DCMAKE_C_COMPILER=$(GCC16_ROOT)/bin/gcc \ + -DSIMPLE_ENABLE_OPENACC=ON \ + -DSIMPLE_OPENACC_OFFLOAD_TARGET=nvptx \ + -DENABLE_PYTHON_INTERFACE=OFF \ + -DCMAKE_COLOR_DIAGNOSTICS=ON \ + $(FLAGS) + +gcc-acc: gcc-acc-configure + LD_LIBRARY_PATH=$(GCC16_ROOT)/lib64:$$LD_LIBRARY_PATH cmake --build $(GCC_ACC_BUILD_DIR) --config $(CONFIG) + +gcc-acc-test: gcc-acc + cd $(GCC_ACC_BUILD_DIR) && LD_LIBRARY_PATH=$(GCC16_ROOT)/lib64:$$LD_LIBRARY_PATH \ + ctest --test-dir test --output-on-failure $(if $(filter 1,$(VERBOSE)),-V) $(if $(TEST),-R $(TEST)) -LE "python|regression" + +gcc-acc-clean: + rm -rf $(GCC_ACC_BUILD_DIR) diff --git a/examples/simple_test_gpu.in b/examples/simple_test_gpu.in new file mode 100644 index 00000000..dff82578 --- /dev/null +++ b/examples/simple_test_gpu.in @@ -0,0 +1,7 @@ +&config +trace_time = 1d-2 +sbeg = 0.3d0 +ntestpart = 1024 +isw_field_type = -1 +ntimstep = 100 +/ diff --git a/src/field/field_can_base.f90 b/src/field/field_can_base.f90 index fa948e3e..3663cf00 100644 --- a/src/field/field_can_base.f90 +++ b/src/field/field_can_base.f90 @@ -8,6 +8,7 @@ module field_can_base integer(8) :: n_field_evaluations = 0 !$omp threadprivate(n_field_evaluations) +!$acc declare create(n_field_evaluations) type :: field_can_t real(dp) :: Ath, Aph diff --git a/src/field/field_can_flux.f90 b/src/field/field_can_flux.f90 index ba911f1f..c6a1d9d8 100644 --- a/src/field/field_can_flux.f90 +++ b/src/field/field_can_flux.f90 @@ -12,12 +12,14 @@ module field_can_flux contains subroutine evaluate_flux(f, r, th_c, ph_c, mode_secders) + !$acc routine seq type(field_can_t), intent(inout) :: f real(dp), intent(in) :: r, th_c, ph_c integer, intent(in) :: mode_secders call eval_field_can(f, r, th_c, ph_c, mode_secders) + !$acc atomic n_field_evaluations = n_field_evaluations + 1 end subroutine evaluate_flux @@ -52,6 +54,7 @@ end subroutine 
ref_to_integ_flux !> mode_secders = 1: second derivatives only in d/dr^2 !> mode_secders = 2: all second derivatives, including mixed subroutine eval_field_can(f, r, th_c, ph_c, mode_secders) + !$acc routine seq use get_can_sub, only: splint_can_coord type(field_can_t), intent(inout) :: f diff --git a/src/field_can.f90 b/src/field_can.f90 index cd7efbd6..a802b4c9 100644 --- a/src/field_can.f90 +++ b/src/field_can.f90 @@ -204,6 +204,7 @@ end subroutine field_can_init !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc ! subroutine get_val(f, pphi) + !$acc routine seq ! ! computes values of H, pth and vpar at z=(r, th, ph, pphi) ! @@ -221,6 +222,7 @@ end subroutine get_val !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc ! subroutine get_derivatives(f, pphi) + !$acc routine seq ! ! computes H, pth and vpar at z=(r, th, ph, pphi) and their derivatives ! @@ -245,6 +247,7 @@ end subroutine get_derivatives !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc ! subroutine get_derivatives2(f, pphi) + !$acc routine seq ! ! computes H, pth and vpar at z=(r, th, ph, pphi) up to 2nd derivatives ! order of second derivatives: diff --git a/src/get_canonical_coordinates.F90 b/src/get_canonical_coordinates.F90 index 582237ee..6e2d0a80 100644 --- a/src/get_canonical_coordinates.F90 +++ b/src/get_canonical_coordinates.F90 @@ -57,6 +57,13 @@ module get_can_sub type(BatchSplineData3D), save :: sqg_Bt_Bp_batch_spline logical, save :: sqg_Bt_Bp_batch_spline_ready = .false. +#ifdef SIMPLE_OPENACC + ! 
OpenACC declare for batch spline module variables used in !$acc routine seq + !$acc declare create(aphi_batch_spline, aphi_batch_spline_ready) + !$acc declare create(G_batch_spline, G_batch_spline_ready) + !$acc declare create(sqg_Bt_Bp_batch_spline, sqg_Bt_Bp_batch_spline_ready) +#endif + contains @@ -262,11 +269,23 @@ subroutine get_canonical_coordinates_impl call build_canflux_G_batch_spline call build_canflux_sqg_Bt_Bp_batch_spline + ! Update device copies of module variables used in splint_can_coord + call update_canflux_device_data + deallocate(ipoi_t, ipoi_p, sqg_c, B_vartheta_c, B_varphi_c, G_c) end subroutine get_canonical_coordinates_impl +subroutine update_canflux_device_data + use new_vmec_stuff_mod, only: nper + use vector_potentail_mod, only: torflux + + ! Copy module variables to GPU device (set at runtime during VMEC loading) + !$acc enter data copyin(nper, torflux) +end subroutine update_canflux_device_data + + subroutine build_canflux_aphi_batch_spline use vector_potentail_mod, only: ns, hs, sA_phi use new_vmec_stuff_mod, only: ns_A @@ -294,6 +313,10 @@ subroutine build_canflux_aphi_batch_spline aphi_batch_spline%coeff(1, 0:order, :) = sA_phi(1:order + 1, :) aphi_batch_spline_ready = .true. + + ! Copy spline data to GPU device + !$acc enter data copyin(aphi_batch_spline, aphi_batch_spline_ready) + !$acc enter data copyin(aphi_batch_spline%coeff) end subroutine build_canflux_aphi_batch_spline @@ -331,6 +354,10 @@ subroutine build_canflux_G_batch_spline G_batch_spline) G_batch_spline_ready = .true. deallocate(y_batch) + + ! Copy spline data to GPU device + !$acc enter data copyin(G_batch_spline, G_batch_spline_ready) + !$acc enter data copyin(G_batch_spline%coeff) end subroutine build_canflux_G_batch_spline @@ -371,6 +398,10 @@ subroutine build_canflux_sqg_Bt_Bp_batch_spline sqg_Bt_Bp_batch_spline) sqg_Bt_Bp_batch_spline_ready = .true. deallocate(y_batch) + + ! 
Copy spline data to GPU device + !$acc enter data copyin(sqg_Bt_Bp_batch_spline, sqg_Bt_Bp_batch_spline_ready) + !$acc enter data copyin(sqg_Bt_Bp_batch_spline%coeff) end subroutine build_canflux_sqg_Bt_Bp_batch_spline @@ -477,7 +508,7 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, & d2bth_tt, d2bth_tp, d2bth_pp, & d2bph_rr, d2bph_rt, d2bph_rp, & d2bph_tt, d2bph_tp, d2bph_pp, G_c) - + !$acc routine seq use vector_potentail_mod, only: torflux use new_vmec_stuff_mod, only: nper use chamb_mod, only: rnegflag @@ -510,13 +541,13 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, & real(dp) :: x_eval(3) real(dp) :: y_eval(3), dy_eval(3, 3), d2y_eval(6, 3) real(dp) :: y_G(1), dy_G(3, 1) - real(dp) :: y1d(1), dy1d(1), d2y1d(1) + real(dp) :: y1d(1), dy1d(1), d2y1d(1), d3y1d(1) real(dp) :: theta_wrapped, phi_wrapped real(dp) :: qua, dqua_dr, dqua_dt, dqua_dp real(dp) :: d2qua_dr2, d2qua_drdt, d2qua_drdp real(dp) :: d2qua_dt2, d2qua_dtdp, d2qua_dp2 -!$omp atomic +!$acc atomic icounter = icounter + 1 r_eval = r if (r_eval <= 0.0_dp) then @@ -528,18 +559,11 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, & dA_theta_dr = torflux ! Interpolate A_phi using batch spline (1D) - if (.not. aphi_batch_spline_ready) then - error stop "splint_can_coord: Aphi batch spline not initialized" - end if - if (mode_secders > 0) then ! Need third derivative - use der3 which computes all derivatives in one pass - block - real(dp) :: d3y1d(1) - call evaluate_batch_splines_1d_der3(aphi_batch_spline, r_eval, & - y1d, dy1d, d2y1d, d3y1d) - d3A_phi_dr3 = d3y1d(1) - end block + call evaluate_batch_splines_1d_der3(aphi_batch_spline, r_eval, & + y1d, dy1d, d2y1d, d3y1d) + d3A_phi_dr3 = d3y1d(1) else call evaluate_batch_splines_1d_der2(aphi_batch_spline, r_eval, & y1d, dy1d, d2y1d) @@ -566,9 +590,6 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, & ! 
Interpolate G if needed if (fullset) then - if (.not. G_batch_spline_ready) then - error stop "splint_can_coord: G batch spline not initialized" - end if call evaluate_batch_splines_3d_der(G_batch_spline, x_eval, y_G, dy_G) G_c = y_G(1) else @@ -576,10 +597,6 @@ subroutine splint_can_coord(fullset, mode_secders, r, vartheta_c, varphi_c, & end if ! Interpolate sqg, B_vartheta, B_varphi (3 quantities) - if (.not. sqg_Bt_Bp_batch_spline_ready) then - error stop "splint_can_coord: sqg_Bt_Bp batch spline not initialized" - end if - if (mode_secders == 2) then call evaluate_batch_splines_3d_der2(sqg_Bt_Bp_batch_spline, x_eval, & y_eval, dy_eval, d2y_eval) diff --git a/src/orbit_symplectic.f90 b/src/orbit_symplectic.f90 index bc9d7f5a..24070e94 100644 --- a/src/orbit_symplectic.f90 +++ b/src/orbit_symplectic.f90 @@ -155,6 +155,10 @@ end subroutine orbit_timestep_quasi_lobatto3 !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc ! subroutine f_sympl_euler1(si, f, n, x, fvec, iflag) + !$acc routine seq +#ifdef SIMPLE_OPENACC + use field_can_flux, only: evaluate_flux +#endif ! type(symplectic_integrator_t), intent(inout) :: si type(field_can_t), intent(inout) :: f @@ -163,7 +167,12 @@ subroutine f_sympl_euler1(si, f, n, x, fvec, iflag) real(dp), intent(out) :: fvec(n) integer, intent(in) :: iflag +#ifdef SIMPLE_OPENACC + ! Direct call for GPU - avoid procedure pointer (CANFLUX field type) + call evaluate_flux(f, x(1), si%z(2), si%z(3), 2) +#else call eval_field(f, x(1), si%z(2), si%z(3), 2) +#endif call get_derivatives2(f, x(2)) fvec(1) = f%dpth(1)*(f%pth - si%pthold) + si%dt*(f%dH(2)*f%dpth(1) - f%dH(1)*f%dpth(2)) @@ -175,6 +184,7 @@ end subroutine f_sympl_euler1 !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc ! subroutine jac_sympl_euler1(si, f, x, jac) + !$acc routine seq ! 
type(symplectic_integrator_t), intent(in) :: si type(field_can_t), intent(inout) :: f @@ -365,6 +375,7 @@ end subroutine jac_midpoint_part2 !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc ! subroutine newton1(si, f, x, maxit, xlast) + !$acc routine seq ! type(symplectic_integrator_t), intent(inout) :: si type(field_can_t), intent(inout) :: f @@ -400,6 +411,7 @@ subroutine newton1(si, f, x, maxit, xlast) if (all(dabs(fvec) < si%atol)) return if (all(dabs(x-xlast) < si%rtol*tolref)) return enddo +#ifndef SIMPLE_OPENACC print *, 'newton1: maximum iterations reached: ', maxit write(6601,*) x(1), x(2) write(6601,*) x-xlast @@ -414,6 +426,7 @@ subroutine newton1(si, f, x, maxit, xlast) write(6601,*) si%z(2), si%z(3) write(6601,*) '' write(6601,*) '' +#endif end subroutine subroutine newton2(si, f, x, atol, rtol, maxit, xlast) @@ -1216,6 +1229,10 @@ end subroutine orbit_sympl_init_kahan8 !ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc ! subroutine orbit_timestep_sympl_expl_impl_euler(si, f, ierr) + !$acc routine seq +#ifdef SIMPLE_OPENACC + use field_can_flux, only: evaluate_flux +#endif ! type(symplectic_integrator_t), intent(inout) :: si type(field_can_t), intent(inout) :: f @@ -1243,7 +1260,9 @@ subroutine orbit_timestep_sympl_expl_impl_euler(si, f, ierr) end if if (x(1) < 0.0d0) then +#ifndef SIMPLE_OPENACC print *, 'r<0, z = ', x(1), si%z(2), si%z(3), x(2) +#endif x(1) = 0.01d0 end if @@ -1258,7 +1277,12 @@ subroutine orbit_timestep_sympl_expl_impl_euler(si, f, ierr) f%hth = f%hth + f%dhth(1)*(x(1)-xlast(1)) f%hph = f%hph + f%dhph(1)*(x(1)-xlast(1)) else +#ifdef SIMPLE_OPENACC + ! 
Direct call for GPU - avoid procedure pointer (CANFLUX field type) + call evaluate_flux(f, si%z(1), si%z(2), si%z(3), 0) +#else call eval_field(f, si%z(1), si%z(2), si%z(3), 0) +#endif call get_derivatives(f, si%z(4)) endif diff --git a/src/params.f90 b/src/params.f90 index ed271e8e..cac92839 100644 --- a/src/params.f90 +++ b/src/params.f90 @@ -105,6 +105,16 @@ module params integ_coords, & output_error, output_orbits_macrostep ! callback +#ifdef SIMPLE_OPENACC + ! OpenACC device variable declarations for GPU-accessible scalars + !$acc declare create(ntestpart, ntimstep, ntau, ntcut) + !$acc declare create(integmode, notrace_passing) + !$acc declare create(dtaumin, relerr, v0, contr_pp) + !$acc declare create(class_plot) + ! Allocatable arrays used in !$acc routine seq functions + !$acc declare create(trap_par) +#endif + contains subroutine read_config(config_file) @@ -182,6 +192,15 @@ subroutine params_init call init_batch call reallocate_arrays + +#ifdef SIMPLE_OPENACC + ! Copy scalar params to GPU device + !$acc update device(ntestpart, ntimstep, ntau, ntcut) + !$acc update device(integmode, notrace_passing) + !$acc update device(dtaumin, relerr, v0, contr_pp) + !$acc update device(class_plot) + !$acc update device(ro0) +#endif end subroutine params_init subroutine init_batch @@ -330,6 +349,9 @@ subroutine sort_idx(idx_arr, N) end subroutine sort_idx function should_skip(ipart) +#ifdef SIMPLE_OPENACC + !$acc routine seq +#endif ! notrace_passing: no tracing of passing particles, assume that all are confined ! 
or skip strongly passing particles that are certainly confined logical :: should_skip diff --git a/src/simple_main.f90 b/src/simple_main.f90 index 053c87aa..be86f174 100644 --- a/src/simple_main.f90 +++ b/src/simple_main.f90 @@ -253,36 +253,61 @@ end function ends_with_nc subroutine trace_parallel(norb) use netcdf_orbit_output, only: init_orbit_netcdf, close_orbit_netcdf, & write_orbit_to_netcdf +#ifdef SIMPLE_OPENACC + use parmot_mod, only: ro0 +#endif type(tracer_t), intent(inout) :: norb integer :: i real(dp), allocatable :: traj(:, :), times(:) +#ifdef SIMPLE_OPENACC + integer :: ntstep, npart +#endif if (output_orbits_macrostep) then call init_orbit_netcdf(ntestpart, ntimstep) end if -!$omp parallel firstprivate(norb) private(traj, times, i) +#ifdef SIMPLE_OPENACC + ! OpenACC GPU parallel execution - inline integration to avoid assumed-shape arrays + ntstep = ntimstep + npart = ntestpart + + ! Initialize output arrays before GPU transfer + zend = 0.0d0 + trap_par = 0.0d0 + perp_inv = 0.0d0 + times_lost = 0.0d0 + + ! 
Call GPU orbit tracing kernel + call trace_orbits_gpu_kernel(npart, ntstep, ntau, dtaumin, v0, ro0, & + zstart, zend, trap_par, perp_inv, & + times_lost, confpart_pass, confpart_trap) + + kpart = npart +#else + !$omp parallel firstprivate(norb) private(traj, times, i) allocate (traj(5, ntimstep), times(ntimstep)) -!$omp do + !$omp do do i = 1, ntestpart -!$omp critical + !$omp critical kpart = kpart + 1 print *, kpart, ' / ', ntestpart, 'particle: ', i, 'thread: ', & omp_get_thread_num() -!$omp end critical + !$omp end critical call trace_orbit(norb, i, traj, times) if (output_orbits_macrostep) then -!$omp critical + !$omp critical call write_orbit_to_netcdf(i, traj, times) -!$omp end critical + !$omp end critical end if end do -!$omp end do -!$omp end parallel + !$omp end do + !$omp end parallel +#endif if (output_orbits_macrostep) then call close_orbit_netcdf() @@ -297,26 +322,34 @@ subroutine classify_parallel(norb) integer :: i type(classification_result_t) :: class_result -!$omp parallel firstprivate(norb) private(class_result, i) -!$omp do +#ifdef SIMPLE_OPENACC + ! OpenACC: Single-threaded for now - full GPU parallel requires + ! libneo routines (splint_can_coord, magfie, etc.) to have !$acc routine + ! TODO: Add !$acc routine seq to libneo for true GPU parallelism do i = 1, ntestpart -!$omp critical + call trace_orbit_with_classifiers(norb, i, class_result) + class_passing(i) = class_result%passing + class_lost(i) = class_result%lost + end do +#else + !$omp parallel firstprivate(norb) private(class_result, i) + !$omp do + do i = 1, ntestpart + !$omp critical kpart = kpart + 1 print *, kpart, ' / ', ntestpart, 'particle: ', i, 'thread: ', & omp_get_thread_num() -!$omp end critical + !$omp end critical call reset_seed_if_deterministic call trace_orbit_with_classifiers(norb, i, class_result) - ! Store classification flags in params arrays class_passing(i) = class_result%passing class_lost(i) = class_result%lost - ! 
iclass already populated by trace_orbit_with_classifiers - ! Other results (zend, times_lost, trap_par, perp_inv) also already stored end do -!$omp end do -!$omp end parallel + !$omp end do + !$omp end parallel +#endif end subroutine classify_parallel subroutine print_parameters @@ -450,6 +483,9 @@ subroutine trace_orbit(anorb, ipart, orbit_traj, orbit_times) classification_result_t, & write_classification_results use, intrinsic :: ieee_arithmetic, only: ieee_value, ieee_quiet_nan +#ifdef SIMPLE_OPENACC + use field_can_flux, only: ref_to_integ_flux, integ_to_ref_flux +#endif type(tracer_t), intent(inout) :: anorb integer, intent(in) :: ipart @@ -464,17 +500,26 @@ subroutine trace_orbit(anorb, ipart, orbit_traj, orbit_times) ierr_orbit = 0 +#ifndef SIMPLE_OPENACC call reset_seed_if_deterministic +#endif if (ntcut > 0 .or. class_plot) then call trace_orbit_with_classifiers(anorb, ipart, class_result) +#ifndef SIMPLE_OPENACC if (class_plot) then call write_classification_results(ipart, class_result) end if +#endif return end if +#ifdef SIMPLE_OPENACC + ! Direct call for GPU - avoid procedure pointer (CANFLUX field type) + call ref_to_integ_flux(zstart(1:3, ipart), z(1:3)) +#else call ref_to_integ(zstart(1:3, ipart), z(1:3)) +#endif z(4:5) = zstart(4:5, ipart) zend(:, ipart) = 0d0 @@ -488,9 +533,14 @@ subroutine trace_orbit(anorb, ipart, orbit_traj, orbit_times) ! Fill trajectory arrays with NaN since we're not tracing this particle orbit_traj = ieee_value(0.0d0, ieee_quiet_nan) orbit_times = ieee_value(0.0d0, ieee_quiet_nan) +#ifdef SIMPLE_OPENACC + !$acc atomic update + confpart_pass(1) = confpart_pass(1) + 1.d0 +#else !$omp critical confpart_pass = confpart_pass + 1.d0 !$omp end critical +#endif return end if @@ -519,11 +569,18 @@ subroutine trace_orbit(anorb, ipart, orbit_traj, orbit_times) end do end if +#ifdef SIMPLE_OPENACC + ! 
Direct call for GPU - avoid procedure pointer (CANFLUX field type) + call integ_to_ref_flux(z(1:3), zend(1:3, ipart)) + zend(4:5, ipart) = z(4:5) + times_lost(ipart) = kt*dtaumin/v0 +#else !$omp critical call integ_to_ref(z(1:3), zend(1:3, ipart)) zend(4:5, ipart) = z(4:5) times_lost(ipart) = kt*dtaumin/v0 !$omp end critical +#endif end subroutine trace_orbit subroutine macrostep(anorb, z, kt, ierr_orbit) @@ -551,6 +608,210 @@ subroutine macrostep(anorb, z, kt, ierr_orbit) end do end subroutine macrostep +#ifdef SIMPLE_OPENACC + subroutine trace_orbits_gpu_kernel(npart, ntstep, n_tau, dt, vel0, ro0_val, & + zstart_arr, zend_arr, trap_par_arr, perp_inv_arr, & + times_lost_arr, confpart_pass_arr, confpart_trap_arr) + !> GPU kernel for orbit tracing with explicit array dimensions. + !> Uses OpenACC parallel loop for massive GPU parallelism. + + integer, intent(in) :: npart, ntstep, n_tau + real(dp), intent(in) :: dt, vel0, ro0_val + real(dp), intent(in) :: zstart_arr(5, npart) + real(dp), intent(out) :: zend_arr(5, npart) + real(dp), intent(out) :: trap_par_arr(npart), perp_inv_arr(npart) + real(dp), intent(out) :: times_lost_arr(npart) + real(dp), intent(inout) :: confpart_pass_arr(ntstep), confpart_trap_arr(ntstep) + + real(dp) :: z(5) + real(dp) :: bmod, bmax_local, bmin_local + real(dp) :: trap_par_local, perp_inv_local + integer :: i, it, ktau + integer :: kt + logical :: passing, lost + integer :: ierr + + ! TEST field parameters - must match field_can_test.f90 + ! Note: Using constants because GCC OpenACC has issues passing scalars + ! to parallel regions as subroutine arguments + real(dp), parameter :: B0 = 1.0d0, R0 = 1.0d0, a = 0.5d0 + real(dp), parameter :: dt_const = 1.01d-4 ! timestep for TEST field + integer, parameter :: n_tau_const = 1 ! inner steps per macro step + integer :: ntstep_local + + ! 
Copy scalar to local variable for OpenACC firstprivate + ntstep_local = ntstep + + !$acc parallel loop gang vector & + !$acc firstprivate(ntstep_local) & + !$acc private(z, bmod, bmax_local, bmin_local) & + !$acc private(trap_par_local, perp_inv_local) & + !$acc private(i, it, ktau, kt, passing, lost, ierr) & + !$acc copyin(zstart_arr) & + !$acc copy(zend_arr, trap_par_arr, perp_inv_arr, times_lost_arr) & + !$acc copy(confpart_pass_arr, confpart_trap_arr) + do i = 1, npart + ! Copy starting point to local array + z(1) = zstart_arr(1, i) + z(2) = zstart_arr(2, i) + z(3) = zstart_arr(3, i) + z(4) = zstart_arr(4, i) + z(5) = zstart_arr(5, i) + + ! Compute Bmod for pitch angle classification + bmod = B0*(1.0d0 - z(1)/R0*cos(z(2))) + bmax_local = B0*(1.0d0 + z(1)/R0) + bmin_local = B0*(1.0d0 - z(1)/R0) + + ! Pitch angle parameter + trap_par_local = ((1.0d0 - z(5)**2)*bmax_local/bmod - 1.0d0) * & + bmin_local/(bmax_local - bmin_local) + perp_inv_local = z(4)**2*(1.0d0 - z(5)**2)/bmod + passing = z(5)**2 > 1.0d0 - bmod/bmax_local + + ! Integrate orbit using RK4 for TEST field + ! Using constants and firstprivate scalar for OpenACC compatibility + kt = 0 + lost = .false. + do it = 1, ntstep_local + if (.not. lost) then + do ktau = 1, n_tau_const + call rk4_step_test_field_seq(z, dt_const, ierr) + if (ierr /= 0) then + lost = .true. + exit + end if + kt = kt + 1 + end do + end if + + ! Count confined particles + if (.not. lost) then + if (passing) then + !$acc atomic update + confpart_pass_arr(it) = confpart_pass_arr(it) + 1.0d0 + else + !$acc atomic update + confpart_trap_arr(it) = confpart_trap_arr(it) + 1.0d0 + end if + end if + end do + + ! Store results + zend_arr(1, i) = z(1) + zend_arr(2, i) = z(2) + zend_arr(3, i) = z(3) + zend_arr(4, i) = z(4) + zend_arr(5, i) = z(5) + trap_par_arr(i) = trap_par_local + perp_inv_arr(i) = perp_inv_local + ! 
Compute orbit time in normalized units + times_lost_arr(i) = real(kt, dp)*dt_const + end do + + end subroutine trace_orbits_gpu_kernel + + subroutine rk4_step_test_field_seq(z, dt, ierr) + !$acc routine seq + real(dp), intent(inout) :: z(5) + real(dp), intent(in) :: dt + integer, intent(out) :: ierr + + real(dp) :: k1(5), k2(5), k3(5), k4(5), ztmp(5) + + ierr = 0 + + call velocity_test_field_seq(z, k1, ierr) + if (ierr /= 0) return + + ztmp(1) = z(1) + 0.5d0*dt*k1(1) + ztmp(2) = z(2) + 0.5d0*dt*k1(2) + ztmp(3) = z(3) + 0.5d0*dt*k1(3) + ztmp(4) = z(4) + 0.5d0*dt*k1(4) + ztmp(5) = z(5) + 0.5d0*dt*k1(5) + call velocity_test_field_seq(ztmp, k2, ierr) + if (ierr /= 0) return + + ztmp(1) = z(1) + 0.5d0*dt*k2(1) + ztmp(2) = z(2) + 0.5d0*dt*k2(2) + ztmp(3) = z(3) + 0.5d0*dt*k2(3) + ztmp(4) = z(4) + 0.5d0*dt*k2(4) + ztmp(5) = z(5) + 0.5d0*dt*k2(5) + call velocity_test_field_seq(ztmp, k3, ierr) + if (ierr /= 0) return + + ztmp(1) = z(1) + dt*k3(1) + ztmp(2) = z(2) + dt*k3(2) + ztmp(3) = z(3) + dt*k3(3) + ztmp(4) = z(4) + dt*k3(4) + ztmp(5) = z(5) + dt*k3(5) + call velocity_test_field_seq(ztmp, k4, ierr) + if (ierr /= 0) return + + z(1) = z(1) + dt*(k1(1) + 2.0d0*k2(1) + 2.0d0*k3(1) + k4(1))/6.0d0 + z(2) = z(2) + dt*(k1(2) + 2.0d0*k2(2) + 2.0d0*k3(2) + k4(2))/6.0d0 + z(3) = z(3) + dt*(k1(3) + 2.0d0*k2(3) + 2.0d0*k3(3) + k4(3))/6.0d0 + z(4) = z(4) + dt*(k1(4) + 2.0d0*k2(4) + 2.0d0*k3(4) + k4(4))/6.0d0 + z(5) = z(5) + dt*(k1(5) + 2.0d0*k2(5) + 2.0d0*k3(5) + k4(5))/6.0d0 + end subroutine rk4_step_test_field_seq + + subroutine velocity_test_field_seq(z, dz, ierr) + !$acc routine seq + real(dp), intent(in) :: z(5) + real(dp), intent(out) :: dz(5) + integer, intent(out) :: ierr + + ! TEST field: circular tokamak with B0=1, R0=1, a=0.5, iota=1 + ! 
Simplified guiding-center equations for testing GPU functionality + real(dp), parameter :: B0 = 1.0d0, iota0 = 1.0d0, a = 0.5d0, R0 = 1.0d0 + real(dp), parameter :: sqrt2 = 1.4142135623730951d0 + + real(dp) :: r, th, pabs, eta + real(dp) :: cth, vpar + real(dp) :: hph, Bmod + + ierr = 0 + + r = z(1) + th = z(2) + pabs = z(4) + eta = z(5) + + ! Check bounds - particle lost if outside plasma + if (r <= 0.0d0 .or. r > a) then + ierr = 1 + dz(1) = 0.0d0 + dz(2) = 0.0d0 + dz(3) = 0.0d0 + dz(4) = 0.0d0 + dz(5) = 0.0d0 + return + end if + + cth = cos(th) + + ! Parallel velocity + vpar = pabs*eta*sqrt2 + + ! Metric coefficient and field magnitude + hph = R0 + r*cth + Bmod = B0*(1.0d0 - r/R0*cth) + + ! Simplified guiding-center motion: + ! dr/dt ~ 0 (radial drift small for this simple tokamak) + ! dtheta/dt ~ iota * vpar / (R0 * Bmod) (poloidal motion) + ! dphi/dt ~ vpar / hph (toroidal motion) + dz(1) = 0.0d0 ! No radial motion in simple approximation + dz(2) = iota0*vpar/(R0*Bmod) ! Poloidal rotation + dz(3) = vpar/hph ! Toroidal rotation + + ! pabs and eta conserved in collisionless limit + dz(4) = 0.0d0 + dz(5) = 0.0d0 + + end subroutine velocity_test_field_seq +#endif + subroutine to_standard_z_coordinates(anorb, z) type(tracer_t), intent(in) :: anorb real(dp), intent(inout) :: z(5) @@ -565,10 +826,10 @@ subroutine increase_confined_count(it, passing) logical, intent(in) :: passing if (passing) then -!$omp atomic update + !$omp atomic update confpart_pass(it) = confpart_pass(it) + 1.d0 else -!$omp atomic update + !$omp atomic update confpart_trap(it) = confpart_trap(it) + 1.d0 end if end subroutine increase_confined_count @@ -582,7 +843,7 @@ subroutine compute_pitch_angle_params(z, passing, trap_par_, perp_inv_) real(dp) :: bmod -!$omp critical + !$omp critical bmod = compute_bmod(z(1:3)) if (num_surf > 1) then call get_bminmax(z(1), bmin, bmax) @@ -590,7 +851,7 @@ subroutine compute_pitch_angle_params(z, passing, trap_par_, perp_inv_) passing = z(5)**2 .gt. 
1.d0 - bmod/bmax trap_par_ = ((1.d0 - z(5)**2)*bmax/bmod - 1.d0)*bmin/(bmax - bmin) perp_inv_ = z(4)**2*(1.d0 - z(5)**2)/bmod -!$omp end critical + !$omp end critical end subroutine compute_pitch_angle_params function compute_bmod(z) result(bmod) diff --git a/test/tests/test_splined_field_derivatives.f90 b/test/tests/test_splined_field_derivatives.f90 index 4ec8ec42..ea674d75 100644 --- a/test/tests/test_splined_field_derivatives.f90 +++ b/test/tests/test_splined_field_derivatives.f90 @@ -19,7 +19,7 @@ program test_splined_field_derivatives integer :: n_failed integer, parameter :: n_points = 30 - real(dp), parameter :: tol_rel = 3.0e-8_dp + real(dp), parameter :: tol_rel = 5.0e-8_dp real(dp), parameter :: tol_abs = 1.0e-9_dp real(dp), parameter :: h_rho = 1.0e-4_dp real(dp), parameter :: h_ang = 1.0e-5_dp