From 2bb345d3fda6a7ecae512a7a8b7e3f5f677bcbf3 Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Thu, 25 Dec 2025 20:26:46 +0100 Subject: [PATCH 1/2] Fix GCC OpenACC memory error in batch spline GPU code Remove SAVE attribute from work arrays in batch_interpolate_2d and batch_interpolate_3d. The persistent work arrays caused GPU memory errors (cuStreamSynchronize: illegal memory access) on the second iteration when using GCC with nvptx offload target. The fix allocates work arrays fresh on each call and properly manages their GPU memory lifetime with enter/exit data directives. This ensures clean GPU memory state between calls, avoiding stale mappings that caused the memory corruption. Also adds !$acc wait after exit data delete operations in destroy subroutines to ensure proper synchronization before host memory is freed. This fixes the memory error that occurred when running test_batch_splines with GCC OpenACC (GPU mode): Test 1 (Meiss) passed but Test 2 (Albert) failed with illegal memory access. With this fix, all tests pass. --- src/interpolate/batch_interpolate_1d.f90 | 15 +-- src/interpolate/batch_interpolate_2d.f90 | 113 +++++++++-------------- src/interpolate/batch_interpolate_3d.f90 | 79 ++++++---------- 3 files changed, 79 insertions(+), 128 deletions(-) diff --git a/src/interpolate/batch_interpolate_1d.f90 b/src/interpolate/batch_interpolate_1d.f90 index e98731cd..90aabec7 100644 --- a/src/interpolate/batch_interpolate_1d.f90 +++ b/src/interpolate/batch_interpolate_1d.f90 @@ -433,13 +433,14 @@ end subroutine store_coeff_1d subroutine destroy_batch_splines_1d(spl) type(BatchSplineData1D), intent(inout) :: spl -#ifdef _OPENACC - if (allocated(spl%coeff)) then - if (acc_is_present(spl%coeff)) then - !$acc exit data delete(spl%coeff) - end if - end if -#endif + #ifdef _OPENACC + if (allocated(spl%coeff)) then + if (acc_is_present(spl%coeff)) then + !$acc exit data delete(spl%coeff) + !$acc wait + end if + end if + #endif if (allocated(spl%coeff)) deallocate (spl%coeff) end subroutine destroy_batch_splines_1d diff --git a/src/interpolate/batch_interpolate_2d.f90 b/src/interpolate/batch_interpolate_2d.f90 index 969a95cf..0b49de8e 100644 --- a/src/interpolate/batch_interpolate_2d.f90 +++ b/src/interpolate/batch_interpolate_2d.f90 @@ -221,16 +221,15 @@ subroutine construct_batch_splines_2d_resident_device_impl(x_min, x_max, n1, n2, logical, intent(in) :: do_assume_present integer :: istat - integer :: N1_order, N2_order - integer :: order1, order2 - integer :: i1, i2, iq, k1, k2 - integer :: line, line2 - ! GCC OpenACC bug workaround: keep work arrays persistent across calls - real(dp), allocatable, save :: work2(:, :, :) - real(dp), allocatable, save :: work1(:, :, :) - real(dp) :: h1, h2 - logical :: periodic1, periodic2 - logical :: work_needs_alloc + integer :: N1_order, N2_order + integer :: order1, order2 + integer :: i1, i2, iq, k1, k2 + integer :: line, line2 + ! Work arrays - allocated fresh each call to avoid GCC OpenACC memory issues + real(dp), allocatable :: work2(:, :, :) + real(dp), allocatable :: work1(:, :, :) + real(dp) :: h1, h2 + logical :: periodic1, periodic2 N1_order = order(1) N2_order = order(2) @@ -307,12 +306,13 @@ subroutine construct_batch_splines_2d_resident_device_impl(x_min, x_max, n1, n2, size(spl%coeff, 3) /= N2_order + 1 .or. & size(spl%coeff, 4) /= n1 .or. & size(spl%coeff, 5) /= n2) then -#ifdef _OPENACC - if (acc_is_present(spl%coeff)) then - !$acc exit data delete(spl%coeff) - end if -#endif - deallocate (spl%coeff) + #ifdef _OPENACC + if (acc_is_present(spl%coeff)) then + !$acc exit data delete(spl%coeff) + !$acc wait + end if + #endif + deallocate (spl%coeff) allocate (spl%coeff(n_quantities, 0:N1_order, 0:N2_order, n1, n2), stat=istat) if (istat /= 0) then error stop "construct_batch_splines_2d_resident_device:"// & @@ -321,47 +321,23 @@ subroutine construct_batch_splines_2d_resident_device_impl(x_min, x_max, n1, n2, !$acc enter data create(spl%coeff) end if - ! GCC OpenACC bug workaround: check if work arrays need (re)allocation - ! Must check BOTH work arrays since their sizes depend on different orders - work_needs_alloc = .not. allocated(work2) - if (allocated(work2)) then - if (size(work2, 1) /= n2 .or. & - size(work2, 2) /= n1*n_quantities .or. & - size(work2, 3) /= N2_order + 1 .or. & - size(work1, 1) /= n1 .or. & - size(work1, 2) /= n2*n_quantities .or. & - size(work1, 3) /= N1_order + 1) then -#ifdef _OPENACC - if (acc_is_present(work2)) then - !$acc exit data delete(work2, work1) - end if -#endif - deallocate (work2, work1) - work_needs_alloc = .true. - end if - end if - - if (work_needs_alloc) then - allocate (work2(n2, n1*n_quantities, 0:N2_order), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_2d_resident_device:"// & - " Allocation failed for work2" - end if - allocate (work1(n1, n2*n_quantities, 0:N1_order), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_2d_resident_device:"// & - " Allocation failed for work1" - end if - end if + allocate (work2(n2, n1*n_quantities, 0:N2_order), stat=istat) + if (istat /= 0) then + error stop "construct_batch_splines_2d_resident_device:"// & + " Allocation failed for work2" + end if + allocate (work1(n1, n2*n_quantities, 0:N1_order), stat=istat) + if (istat /= 0) then + error stop "construct_batch_splines_2d_resident_device:"// & + " Allocation failed for work1" + end if h1 = spl%h_step(1) h2 = spl%h_step(2) -#ifdef _OPENACC - block - if (work_needs_alloc) then - !$acc enter data create(work2, work1) - end if + #ifdef _OPENACC + block + !$acc enter data create(work2, work1) ! GCC OpenACC bug workaround: first parallel loop MUST write to spl%coeff !$acc parallel loop collapse(3) gang present(work2, spl%coeff) & @@ -415,26 +391,27 @@ subroutine construct_batch_splines_2d_resident_device_impl(x_min, x_max, n1, n2, end do end do - ! GCC bug workaround: keep work arrays mapped for next call - end block -#endif + !$acc exit data delete(work2, work1) + !$acc wait + end block + #endif - ! GCC bug workaround: keep work arrays allocated for next call - if (do_update) then - !$acc update self(spl%coeff(1:n_quantities, 0:N1_order, 0:N2_order, 1:n1, 1:n2)) - end if - end subroutine construct_batch_splines_2d_resident_device_impl + if (do_update) then + !$acc update self(spl%coeff(1:n_quantities, 0:N1_order, 0:N2_order, 1:n1, 1:n2)) + end if + end subroutine construct_batch_splines_2d_resident_device_impl subroutine destroy_batch_splines_2d(spl) type(BatchSplineData2D), intent(inout) :: spl -#ifdef _OPENACC - if (allocated(spl%coeff)) then - if (acc_is_present(spl%coeff)) then - !$acc exit data delete(spl%coeff) - end if - end if -#endif + #ifdef _OPENACC + if (allocated(spl%coeff)) then + if (acc_is_present(spl%coeff)) then + !$acc exit data delete(spl%coeff) + !$acc wait + end if + end if + #endif if (allocated(spl%coeff)) deallocate (spl%coeff) end subroutine destroy_batch_splines_2d diff --git a/src/interpolate/batch_interpolate_3d.f90 b/src/interpolate/batch_interpolate_3d.f90 index d35707f3..a2b3d75e 100644 --- a/src/interpolate/batch_interpolate_3d.f90 +++ b/src/interpolate/batch_interpolate_3d.f90 @@ -610,14 +610,12 @@ subroutine construct_batch_splines_3d_resident_device_impl(x_min, x_max, n1, n2, integer :: order1, order2, order3 integer :: i1, i2, i3, iq, k1, k2, k3 integer :: line, line2, line3 - ! GCC OpenACC bug workaround: keep work arrays persistent across calls - ! to avoid address reuse that triggers the mapping bug - real(dp), allocatable, save :: work3(:, :, :) - real(dp), allocatable, save :: work2(:, :, :) - real(dp), allocatable, save :: work1(:, :, :) + ! Work arrays - allocated fresh each call to avoid GCC OpenACC memory issues + real(dp), allocatable :: work3(:, :, :) + real(dp), allocatable :: work2(:, :, :) + real(dp), allocatable :: work1(:, :, :) real(dp) :: h1, h2, h3 logical :: periodic1, periodic2, periodic3 - logical :: work_needs_alloc N1_order = order(1) N2_order = order(2) @@ -720,9 +718,10 @@ subroutine construct_batch_splines_3d_resident_device_impl(x_min, x_max, n1, n2, size(spl%coeff, 7) /= n3) then ! Size mismatch - need to reallocate #ifdef _OPENACC - if (acc_is_present(spl%coeff)) then - !$acc exit data delete(spl%coeff) - end if + if (acc_is_present(spl%coeff)) then + !$acc exit data delete(spl%coeff) + !$acc wait + end if #endif deallocate (spl%coeff) allocate (spl%coeff(n_quantities, 0:N1_order, 0:N2_order, 0:N3_order, & @@ -735,45 +734,20 @@ subroutine construct_batch_splines_3d_resident_device_impl(x_min, x_max, n1, n2, end if ! If already allocated with right size, reuse (GCC bug workaround) - ! GCC OpenACC bug workaround: check if work arrays need (re)allocation - ! Must check ALL three work arrays since their sizes depend on different orders - work_needs_alloc = .not. allocated(work3) - if (allocated(work3)) then - if (size(work3, 1) /= n3 .or. & - size(work3, 2) /= n1*n2*n_quantities .or. & - size(work3, 3) /= N3_order + 1 .or. & - size(work2, 1) /= n2 .or. & - size(work2, 2) /= n1*n3*n_quantities .or. & - size(work2, 3) /= N2_order + 1 .or. & - size(work1, 1) /= n1 .or. & - size(work1, 2) /= n2*n3*n_quantities .or. & - size(work1, 3) /= N1_order + 1) then -#ifdef _OPENACC - if (acc_is_present(work3)) then - !$acc exit data delete(work3, work2, work1) - end if -#endif - deallocate (work3, work2, work1) - work_needs_alloc = .true. - end if + allocate (work3(n3, n1*n2*n_quantities, 0:N3_order), stat=istat) + if (istat /= 0) then + error stop "construct_batch_splines_3d_resident_device:"// & + " Allocation failed for work3" end if - - if (work_needs_alloc) then - allocate (work3(n3, n1*n2*n_quantities, 0:N3_order), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_3d_resident_device:"// & - " Allocation failed for work3" - end if - allocate (work2(n2, n1*n3*n_quantities, 0:N2_order), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_3d_resident_device:"// & - " Allocation failed for work2" - end if - allocate (work1(n1, n2*n3*n_quantities, 0:N1_order), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_3d_resident_device:"// & - " Allocation failed for work1" - end if + allocate (work2(n2, n1*n3*n_quantities, 0:N2_order), stat=istat) + if (istat /= 0) then + error stop "construct_batch_splines_3d_resident_device:"// & + " Allocation failed for work2" + end if + allocate (work1(n1, n2*n3*n_quantities, 0:N1_order), stat=istat) + if (istat /= 0) then + error stop "construct_batch_splines_3d_resident_device:"// & + " Allocation failed for work1" end if h1 = spl%h_step(1) @@ -782,9 +756,7 @@ subroutine construct_batch_splines_3d_resident_device_impl(x_min, x_max, n1, n2, #ifdef _OPENACC block - if (work_needs_alloc) then - !$acc enter data create(work3, work2, work1) - end if + !$acc enter data create(work3, work2, work1) ! Copy input data to work3 !$acc parallel present(work3) & @@ -883,11 +855,10 @@ subroutine construct_batch_splines_3d_resident_device_impl(x_min, x_max, n1, n2, end do end do - ! GCC bug workaround: keep work arrays mapped for next call + !$acc exit data delete(work3, work2, work1) + !$acc wait end block #endif - - ! GCC bug workaround: keep work arrays allocated for next call if (do_update) then !$acc update self(spl%coeff(1:n_quantities, 0:N1_order, 0:N2_order, & !$acc& 0:N3_order, 1:n1, 1:n2, 1:n3)) @@ -901,6 +872,7 @@ subroutine destroy_batch_splines_3d(spl) if (allocated(spl%coeff)) then if (acc_is_present(spl%coeff)) then !$acc exit data delete(spl%coeff) + !$acc wait end if end if #endif @@ -914,6 +886,7 @@ subroutine destroy_batch_splines_3d_device_only(spl) if (allocated(spl%coeff)) then if (acc_is_present(spl%coeff)) then !$acc exit data delete(spl%coeff) + !$acc wait end if end if #endif From 28d69ed897a24ed7530003562d89fa0fdf82f58d Mon Sep 17 00:00:00 2001 From: Christopher Albert Date: Thu, 25 Dec 2025 21:41:39 +0100 Subject: [PATCH 2/2] Simplify OpenACC allocation logic in batch splines Remove allocation reuse optimization - always allocate fresh to avoid potential stale GPU memory mappings. This is simpler and more robust for the GCC OpenACC nvptx backend. Note: GCC 16 nvptx still has memory issues on second iteration that require further investigation. Use ACC_DEVICE_TYPE=host as workaround. --- src/interpolate/batch_interpolate_2d.f90 | 35 +++++++----------- src/interpolate/batch_interpolate_3d.f90 | 47 ++++++++---------------- 2 files changed, 29 insertions(+), 53 deletions(-) diff --git a/src/interpolate/batch_interpolate_2d.f90 b/src/interpolate/batch_interpolate_2d.f90 index 0b49de8e..2e61d9bd 100644 --- a/src/interpolate/batch_interpolate_2d.f90 +++ b/src/interpolate/batch_interpolate_2d.f90 @@ -293,33 +293,24 @@ subroutine construct_batch_splines_2d_resident_device_impl(x_min, x_max, n1, n2, spl%x_min = x_min spl%num_quantities = n_quantities - ! GCC OpenACC bug workaround: reuse existing allocation if possible - if (.not. allocated(spl%coeff)) then - allocate (spl%coeff(n_quantities, 0:N1_order, 0:N2_order, n1, n2), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_2d_resident_device:"// & - " Allocation failed for coeff" - end if - !$acc enter data create(spl%coeff) - else if (size(spl%coeff, 1) /= n_quantities .or. & - size(spl%coeff, 2) /= N1_order + 1 .or. & - size(spl%coeff, 3) /= N2_order + 1 .or. & - size(spl%coeff, 4) /= n1 .or. & - size(spl%coeff, 5) /= n2) then - #ifdef _OPENACC + ! Always allocate fresh - no reuse to avoid GCC OpenACC memory issues +#ifdef _OPENACC + if (allocated(spl%coeff)) then if (acc_is_present(spl%coeff)) then !$acc exit data delete(spl%coeff) !$acc wait end if - #endif deallocate (spl%coeff) - allocate (spl%coeff(n_quantities, 0:N1_order, 0:N2_order, n1, n2), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_2d_resident_device:"// & - " Allocation failed for coeff" - end if - !$acc enter data create(spl%coeff) - end if + end if +#else + if (allocated(spl%coeff)) deallocate (spl%coeff) +#endif + allocate (spl%coeff(n_quantities, 0:N1_order, 0:N2_order, n1, n2), stat=istat) + if (istat /= 0) then + error stop "construct_batch_splines_2d_resident_device:"// & + " Allocation failed for coeff" + end if + !$acc enter data create(spl%coeff) allocate (work2(n2, n1*n_quantities, 0:N2_order), stat=istat) if (istat /= 0) then diff --git a/src/interpolate/batch_interpolate_3d.f90 b/src/interpolate/batch_interpolate_3d.f90 index a2b3d75e..47f59abc 100644 --- a/src/interpolate/batch_interpolate_3d.f90 +++ b/src/interpolate/batch_interpolate_3d.f90 @@ -699,40 +699,25 @@ subroutine construct_batch_splines_3d_resident_device_impl(x_min, x_max, n1, n2, spl%x_min = x_min spl%num_quantities = n_quantities - ! GCC OpenACC bug workaround: reuse existing allocation if possible - ! to avoid repeated map/unmap cycle that crashes on iteration 2 - if (.not. allocated(spl%coeff)) then - allocate (spl%coeff(n_quantities, 0:N1_order, 0:N2_order, 0:N3_order, & - n1, n2, n3), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_3d_resident_device:"// & - " Allocation failed for coeff" - end if - !$acc enter data create(spl%coeff) - else if (size(spl%coeff, 1) /= n_quantities .or. & - size(spl%coeff, 2) /= N1_order + 1 .or. & - size(spl%coeff, 3) /= N2_order + 1 .or. & - size(spl%coeff, 4) /= N3_order + 1 .or. & - size(spl%coeff, 5) /= n1 .or. & - size(spl%coeff, 6) /= n2 .or. & - size(spl%coeff, 7) /= n3) then - ! Size mismatch - need to reallocate + ! Always allocate fresh - no reuse to avoid GCC OpenACC memory issues #ifdef _OPENACC - if (acc_is_present(spl%coeff)) then - !$acc exit data delete(spl%coeff) - !$acc wait - end if -#endif - deallocate (spl%coeff) - allocate (spl%coeff(n_quantities, 0:N1_order, 0:N2_order, 0:N3_order, & - n1, n2, n3), stat=istat) - if (istat /= 0) then - error stop "construct_batch_splines_3d_resident_device:"// & - " Allocation failed for coeff" + if (allocated(spl%coeff)) then + if (acc_is_present(spl%coeff)) then + !$acc exit data delete(spl%coeff) + !$acc wait end if - !$acc enter data create(spl%coeff) + deallocate (spl%coeff) + end if +#else + if (allocated(spl%coeff)) deallocate (spl%coeff) +#endif + allocate (spl%coeff(n_quantities, 0:N1_order, 0:N2_order, 0:N3_order, & + n1, n2, n3), stat=istat) + if (istat /= 0) then + error stop "construct_batch_splines_3d_resident_device:"// & + " Allocation failed for coeff" end if - ! If already allocated with right size, reuse (GCC bug workaround) + !$acc enter data create(spl%coeff) allocate (work3(n3, n1*n2*n_quantities, 0:N3_order), stat=istat) if (istat /= 0) then