From 19fedef281aa1ffe13295580742c9bf66c1c03db Mon Sep 17 00:00:00 2001
From: Edward Yang
Date: Thu, 28 Aug 2025 17:55:26 +1000
Subject: [PATCH 01/14] add gpu2gpu mpi transfer with flag for do_group_update

---
 mpp/include/group_update_pack.inc   | 27 +++++++++++++++------------
 mpp/include/group_update_unpack.inc | 28 ++++++++++++++++------------
 mpp/include/mpp_group_update.fh     | 21 +++++++++++++++------
 mpp/include/mpp_transmit.inc        | 10 ++++++----
 mpp/include/mpp_transmit_mpi.fh     | 29 ++++++++++++++++++++++++-----
 5 files changed, 76 insertions(+), 39 deletions(-)

diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc
index 142106aece..b1a55b68e3 100644
--- a/mpp/include/group_update_pack.inc
+++ b/mpp/include/group_update_pack.inc
@@ -18,9 +18,6 @@
 !***********************************************************************
 
 if( group%k_loop_inside ) then
-!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) &
-!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, &
-!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k)
 do n = 1, npack
   buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos
   pos = buffer_pos
@@ -32,14 +29,17 @@ if( group%k_loop_inside ) then
     case(ZERO)
       do l=1, group%nscalar ! loop over number of fields
         ptr_field = group%addrs_s(l)
+        !$omp target teams distribute parallel do private(idx) &
+        !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr)
         do k = 1, ksize
           do j = js, je
             do i = is, ie
-              pos = pos + 1
-              buffer(pos) = field(i,j,k)
+              idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1
+              buffer(idx) = field(i,j,k)
             end do
           end do
         enddo
+        pos = pos + ksize*(je-js+1)*(ie-is+1)
       enddo
     case( MINUS_NINETY )
       do l=1,group%nscalar ! loop over number of fields
@@ -83,14 +83,17 @@ if( group%k_loop_inside ) then
     case(ZERO)
       do l=1, nvector ! loop over number of fields
        ptr_fieldx = group%addrs_x(l)
+       !$omp target teams distribute parallel do private(idx) &
+       !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr)
        do k = 1, ksize
          do j = js, je
            do i = is, ie
-             pos = pos + 1
-             buffer(pos) = fieldx(i,j,k)
+             idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1
+             buffer(idx) = fieldx(i,j,k)
            end do
          end do
        end do
+       pos = pos + ksize*(je-js+1)*(ie-is+1)
      end do
     case( MINUS_NINETY )
       if( BTEST(group%flags_v,SCALAR_BIT) ) then
@@ -162,14 +165,17 @@ if( group%k_loop_inside ) then
     case(ZERO)
       do l=1, nvector ! loop over number of fields
        ptr_fieldy = group%addrs_y(l)
+       !$omp target teams distribute parallel do private(idx) &
+       !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr)
        do k = 1, ksize
          do j = js, je
            do i = is, ie
-             pos = pos + 1
-             buffer(pos) = fieldy(i,j,k)
+             idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1
+             buffer(idx) = fieldy(i,j,k)
            end do
          end do
        end do
+       pos = pos + ksize*(je-js+1)*(ie-is+1)
      end do
     case( MINUS_NINETY )
       do l=1,nvector ! 
loop over number of fields @@ -239,9 +245,6 @@ if( group%k_loop_inside ) then endif enddo else -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) do nk = 1, npack*ksize n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 7f60ed93df..c18275e9bb 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,9 +18,6 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k ) do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos @@ -29,45 +26,52 @@ if( group%k_loop_inside ) then if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) + !$omp target teams distribute parallel do if(use_device_ptr) private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(from: field(is:ie,js:je,1:ksize)) do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - field(i,j,k) = buffer(pos) + idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + field(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*(je-js+1)*(ie-is+1) end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) + !$omp target teams distribute parallel do private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - fieldx(i,j,k) = buffer(pos) + idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + fieldx(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*(je-js+1)*(ie-is+1) end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! 
loop over number of fields
        ptr_fieldy = group%addrs_y(l)
+       !$omp target teams distribute parallel do private(idx) &
+       !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr)
        do k = 1, ksize
          do j = js, je
            do i = is, ie
-             pos = pos + 1
-             fieldy(i,j,k) = buffer(pos)
+             idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1
+             fieldy(i,j,k) = buffer(idx)
            end do
          end do
        end do
+       pos = pos + ksize*(je-js+1)*(ie-is+1)
      end do
   endif
 enddo
 else
-!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
-!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, &
-!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k)
 do nk = nunpack*ksize, 1, -1
   n = (nk-1)/ksize + 1
   k = mod((nk-1), ksize) + 1
diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh
index 45be7ea531..748e633f59 100644
--- a/mpp/include/mpp_group_update.fh
+++ b/mpp/include/mpp_group_update.fh
@@ -419,20 +419,22 @@ subroutine MPP_CREATE_GROUP_UPDATE_4D_V_( group, fieldx, fieldy, domain, flags,
 
 end subroutine MPP_CREATE_GROUP_UPDATE_4D_V_
 
-subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
+subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload)
   type(mpp_group_update_type), intent(inout) :: group
   type(domain2D), intent(inout) :: domain
   MPP_TYPE_, intent(in) :: d_type
+  logical, optional, intent(in) :: omp_offload
 
   integer :: nscalar, nvector, nlist
   logical :: recv_y(8)
   integer :: nsend, nrecv, flags_v
   integer :: msgsize
-  integer :: from_pe, to_pe, buffer_pos, pos
+  integer :: from_pe, to_pe, buffer_pos, pos, idx
   integer :: ksize, is, ie, js, je
   integer :: n, l, m, i, j, k, buffer_start_pos, nk
   integer :: shift, gridtype, midpoint
   integer :: npack, nunpack, rotation, isd
+  logical :: use_device_ptr
 
   MPP_TYPE_ :: buffer(mpp_domains_stack_size)
   MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s)
@@ -448,6 +450,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   nlist = size(domain%list(:))
   gridtype = group%gridtype
 
+  use_device_ptr = .false.
+  if (present(omp_offload)) use_device_ptr = omp_offload
+
   !--- ksize_s must equal ksize_v
   if(nvector > 0 .AND. nscalar > 0) then
      if(group%ksize_s .NE. group%ksize_v) then
@@ -476,13 +481,14 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
 
   !---pre-post receive.
   call mpp_clock_begin(group_recv_clock)
+  !$omp target enter data map(alloc: buffer) if(use_device_ptr)
   do m = 1, nrecv
      msgsize = group%recv_size(m)
      from_pe = group%from_pe(m)
      if( msgsize .GT. 0 )then
         buffer_pos = group%buffer_pos_recv(m)
         call mpp_recv( buffer(buffer_pos+1), glen=msgsize, from_pe=from_pe, block=.false., &
-             tag=COMM_TAG_1)
+             tag=COMM_TAG_1, omp_offload=omp_offload)
      end if
   end do
 
@@ -504,7 +510,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
      if( msgsize .GT. 0 )then
         buffer_pos = group%buffer_pos_send(n)
         to_pe = group%to_pe(n)
-        call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1)
+        call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1, omp_offload=omp_offload)
      endif
   enddo
   call mpp_clock_end(group_send_clock)
@@ -519,6 +525,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   nunpack = group%nunpack
   call mpp_clock_begin(group_unpk_clock)
 #include <group_update_unpack.inc>
+  !$omp target exit data map(release: buffer) if(use_device_ptr)
   call mpp_clock_end(group_unpk_clock)
 
 ! 
---northern boundary fold @@ -644,10 +651,11 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) integer :: nscalar, nvector integer :: nsend, nrecv, flags_v integer :: msgsize, npack, rotation - integer :: from_pe, to_pe, buffer_pos, pos + integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je integer :: n, l, m, i, j, k, buffer_start_pos, nk logical :: reuse_buf_pos + logical, parameter :: use_device_ptr = .false. ! placeholder character(len=8) :: text MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) @@ -749,11 +757,12 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) MPP_TYPE_, intent(in) :: d_type integer :: nsend, nrecv, nscalar, nvector - integer :: k, buffer_pos, pos, m, n, l + integer :: k, buffer_pos, pos, m, n, l, idx integer :: is, ie, js, je, ksize, i, j integer :: shift, gridtype, midpoint, flags_v integer :: nunpack, rotation, buffer_start_pos, nk, isd logical :: recv_y(8) + logical, parameter :: use_device_ptr = .false. ! placeholder MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s) MPP_TYPE_ :: fieldx(group%is_x:group%ie_x,group%js_x:group%je_x, group%ksize_v) diff --git a/mpp/include/mpp_transmit.inc b/mpp/include/mpp_transmit.inc index aaa770cc06..b3eac0c3ff 100644 --- a/mpp/include/mpp_transmit.inc +++ b/mpp/include/mpp_transmit.inc @@ -171,7 +171,7 @@ call mpp_transmit( put_data, put_len, to_pe, dummy, 1, NULL_PE, tag=tag, send_request=request ) end subroutine MPP_SEND_ - subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request ) + subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request, omp_offload ) !a mpp_transmit with null arguments on the put side integer, intent(in) :: from_pe MPP_TYPE_, intent(out) :: get_data @@ -180,6 +180,7 @@ integer, intent(out), optional :: request integer, optional, intent(in) :: glen + logical, optional, intent(in) :: omp_offload integer :: get_len MPP_TYPE_ :: get_data1D(1) MPP_TYPE_ :: dummy(1) @@ -189,17 +190,18 @@ ptr = LOC(get_data) get_len=1; if(PRESENT(glen))get_len=glen - call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request ) + call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request, omp_offload=omp_offload ) end subroutine MPP_RECV_SCALAR_ - subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request) + subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request, omp_offload) !a mpp_transmit with null arguments on the get side integer, intent(in) :: to_pe MPP_TYPE_, intent(in) :: put_data integer, optional, intent(in) :: plen integer, intent(in), optional :: tag integer, intent(out), optional :: request + logical, optional, intent(in) :: omp_offload integer :: put_len MPP_TYPE_ :: put_data1D(1) MPP_TYPE_ :: dummy(1) @@ -207,7 +209,7 @@ pointer( ptr, put_data1D ) ptr = LOC(put_data) put_len=1; if(PRESENT(plen))put_len=plen - call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request ) + call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request, omp_offload=omp_offload ) end subroutine MPP_SEND_SCALAR_ diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh index 1fdfc2f6cb..729cc165b0 100644 --- a/mpp/include/mpp_transmit_mpi.fh +++ b/mpp/include/mpp_transmit_mpi.fh @@ -37,7 +37,7 @@ !!(avoiding f90 rank conformance check) !!caller is 
responsible for completion checks (mpp_sync_self) before and after subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, & - & send_request ) + & send_request, omp_offload ) integer, intent(in) :: put_len, to_pe, get_len, from_pe MPP_TYPE_, intent(in) :: put_data(*) @@ -45,10 +45,15 @@ logical, intent(in), optional :: block integer, intent(in), optional :: tag integer, intent(out), optional :: recv_request, send_request + logical, intent(in), optional :: omp_offload logical :: block_comm integer :: i integer :: comm_tag integer :: rsize + logical :: use_device_ptr + + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return @@ -82,8 +87,15 @@ if( cur_send_request > max_request ) & call mpp_error(FATAL, & & "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply") - call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & - request_send(cur_send_request), error) + if (use_device_ptr) then + !$omp target data use_device_ptr(put_data) + call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & + request_send(cur_send_request), error) + !$omp end target data + else + call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & + request_send(cur_send_request), error) + endif endif if (debug .and. (current_clock.NE.0)) call increment_current_clock(EVENT_SEND, put_len*MPP_TYPE_BYTELEN_) else if (to_pe.EQ.ALL_PES) then !this is a broadcast from from_pe @@ -130,8 +142,15 @@ if( cur_recv_request > max_request ) & call mpp_error(FATAL, & "MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply") - call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & - request_recv(cur_recv_request), error ) + if (use_device_ptr) then + !$omp target data use_device_ptr(get_data) + call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & + request_recv(cur_recv_request), error ) + !$omp end target data + else + call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & + request_recv(cur_recv_request), error ) + endif size_recv(cur_recv_request) = get_len type_recv(cur_recv_request) = MPI_TYPE_ endif From f81247bd5e315608d2f5725a21852f0c6d9e620a Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 2 Sep 2025 10:12:37 +1000 Subject: [PATCH 02/14] add missing collapse(3) clauses --- mpp/include/group_update_pack.inc | 6 +++--- mpp/include/group_update_unpack.inc | 12 +++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index b1a55b68e3..d08335d1e6 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -29,7 +29,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) - !$omp target teams distribute parallel do private(idx) & + !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je @@ -83,7 +83,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! 
loop over number of fields ptr_fieldx = group%addrs_x(l) - !$omp target teams distribute parallel do private(idx) & + !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je @@ -165,7 +165,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) - !$omp target teams distribute parallel do private(idx) & + !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index c18275e9bb..f242cd3452 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -26,7 +26,7 @@ if( group%k_loop_inside ) then if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) - !$omp target teams distribute parallel do if(use_device_ptr) private(idx) & + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) private(idx) & !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & !$omp map(from: field(is:ie,js:je,1:ksize)) do k = 1, ksize @@ -42,8 +42,9 @@ if( group%k_loop_inside ) then else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) - !$omp target teams distribute parallel do private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie @@ -57,8 +58,9 @@ if( group%k_loop_inside ) then else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! 
loop over number of fields ptr_fieldy = group%addrs_y(l) - !$omp target teams distribute parallel do private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie From 9f068e3d3b1b20fb13e74f41d0cdfeb2fd7a7086 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 10 Oct 2025 13:14:15 +1100 Subject: [PATCH 03/14] Use __NVCOMPILER macro for target regions --- mpp/include/group_update_pack.inc | 28 +++++++++++++++++----------- mpp/include/group_update_unpack.inc | 28 +++++++++++++++++----------- mpp/include/mpp_group_update.fh | 10 +++++++--- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index d08335d1e6..43e2af5bf5 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -21,25 +21,27 @@ if( group%k_loop_inside ) then do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos - is = group%pack_is(n); ie = group%pack_ie(n) - js = group%pack_js(n); je = group%pack_je(n) + is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1 + js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1 rotation = group%pack_rotation(n) if( group%pack_type(n) == FIELD_S ) then select case( rotation ) case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 buffer(idx) = field(i,j,k) end do end do enddo - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields @@ -83,17 +85,19 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 buffer(idx) = fieldx(i,j,k) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then @@ -165,17 +169,19 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! 
loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 buffer(idx) = fieldy(i,j,k) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index f242cd3452..3f0b08faeb 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -21,55 +21,61 @@ if( group%k_loop_inside ) then do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1 + js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1 if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: field(is:ie,js:je,1:ksize)) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 field(i,j,k) = buffer(idx) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 fieldx(i,j,k) = buffer(idx) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! 
loop over number of fields
        ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER
        !$omp target teams distribute parallel do collapse(3) private(idx) &
-       !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) &
+       !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
        !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr)
+#endif
        do k = 1, ksize
          do j = js, je
            do i = is, ie
-             idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1
+             idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
              fieldy(i,j,k) = buffer(idx)
            end do
          end do
        end do
-       pos = pos + ksize*(je-js+1)*(ie-is+1)
+       pos = pos + ksize*nj*ni
      end do
   endif
 enddo
diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh
index 748e633f59..179856c624 100644
--- a/mpp/include/mpp_group_update.fh
+++ b/mpp/include/mpp_group_update.fh
@@ -431,7 +431,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload)
   integer :: msgsize
   integer :: from_pe, to_pe, buffer_pos, pos, idx
   integer :: ksize, is, ie, js, je
-  integer :: n, l, m, i, j, k, buffer_start_pos, nk
+  integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
   integer :: shift, gridtype, midpoint
   integer :: npack, nunpack, rotation, isd
   logical :: use_device_ptr
@@ -481,7 +481,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload)
 
   !---pre-post receive.
   call mpp_clock_begin(group_recv_clock)
+#ifdef __NVCOMPILER
   !$omp target enter data map(alloc: buffer) if(use_device_ptr)
+#endif
   do m = 1, nrecv
      msgsize = group%recv_size(m)
      from_pe = group%from_pe(m)
@@ -525,7 +527,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload)
   nunpack = group%nunpack
   call mpp_clock_begin(group_unpk_clock)
 #include <group_update_unpack.inc>
+#ifdef __NVCOMPILER
   !$omp target exit data map(release: buffer) if(use_device_ptr)
+#endif
   call mpp_clock_end(group_unpk_clock)
 
 ! ---northern boundary fold
@@ -653,7 +657,7 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
   integer :: msgsize, npack, rotation
   integer :: from_pe, to_pe, buffer_pos, pos, idx
   integer :: ksize, is, ie, js, je
-  integer :: n, l, m, i, j, k, buffer_start_pos, nk
+  integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
   logical :: reuse_buf_pos
   logical, parameter :: use_device_ptr = .false. ! placeholder
   character(len=8) :: text
@@ -760,7 +764,7 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
   integer :: k, buffer_pos, pos, m, n, l, idx
   integer :: is, ie, js, je, ksize, i, j
   integer :: shift, gridtype, midpoint, flags_v
-  integer :: nunpack, rotation, buffer_start_pos, nk, isd
+  integer :: nunpack, rotation, buffer_start_pos, ni, nj, nk, isd
   logical :: recv_y(8)
   logical, parameter :: use_device_ptr = .false. ! 
placeholder MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) From 93d148e690579df9c4a9bcc09f6e9eb808a2ff3a Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 10 Oct 2025 13:19:55 +1100 Subject: [PATCH 04/14] add back old omp directive wrapped in #ifndef __NVCOMPILER --- mpp/include/group_update_pack.inc | 10 ++++++++++ mpp/include/group_update_unpack.inc | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 43e2af5bf5..6fa8866e91 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,6 +18,11 @@ !*********************************************************************** if( group%k_loop_inside ) then +#ifndef __NVCOMPILER +!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos @@ -251,6 +256,11 @@ if( group%k_loop_inside ) then endif enddo else +#ifndef __NVCOMPILER +!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do nk = 1, npack*ksize n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 3f0b08faeb..894d6b110c 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,6 +18,11 @@ !*********************************************************************** if( group%k_loop_inside ) then +#ifndef __NVCOMPILER +!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) +#endif do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos @@ -80,13 +85,18 @@ if( group%k_loop_inside ) then endif enddo else +#ifndef __NVCOMPILER +!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do nk = nunpack*ksize, 1, -1 n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos + (k-1)*group%unpack_size(n) - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1 + js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1 if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! 
loop over number of fields ptr_field = group%addrs_s(l) From 3e3da6e86e195f1dcb83a7af0ddad9d18ea2ee0f Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 10 Oct 2025 15:46:53 +1100 Subject: [PATCH 05/14] port remaining un/pack loops --- mpp/include/group_update_pack.inc | 263 ++++++++++++++++++++++------ mpp/include/group_update_unpack.inc | 27 ++- 2 files changed, 226 insertions(+), 64 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 6fa8866e91..27ed8b9438 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -51,38 +51,53 @@ if( group%k_loop_inside ) then case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end select else if( group%pack_type(n) == FIELD_X ) then @@ -108,64 +123,91 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if case( NINETY ) do l=1, nvector ! 
loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + ! pos = pos + 1 + ! buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if end select ! select case( rotation(n) ) @@ -191,65 +233,90 @@ if( group%k_loop_inside ) then case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! 
loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if end select ! select case( rotation(n) ) @@ -274,42 +341,62 @@ else case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do end select else if( group%pack_type(n) == FIELD_X ) then @@ -317,65 +404,95 @@ else case(ZERO) do l=1, nvector ! 
loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do + pos = pos + nj*ni end do end if case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do + pos = pos + nj*ni end do end if end select ! select case( rotation(n) ) @@ -384,65 +501,95 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do case( MINUS_NINETY ) do l=1,nvector ! 
loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do + pos = pos + nj*ni end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do + pos = pos + nj*ni end do end if end select ! select case( rotation(n) ) diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 894d6b110c..3fa214c7bc 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -100,32 +100,47 @@ else if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - field(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + field(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! 
loop over number of fields
        ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER__
+       !$omp target teams distribute parallel do collapse(2) private(idx) &
+       !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr)
+#endif
        do j = js, je
          do i = is, ie
-           pos = pos + 1
-           fieldx(i,j,k) = buffer(pos)
+           idx = pos + (j-js)*ni + (i-is) + 1
+           fieldx(i,j,k) = buffer(idx)
          end do
        end do
+       pos = pos + ni*nj
      end do
   else if( group%unpack_type(n) == FIELD_Y ) then
      do l=1,nvector ! loop over number of fields
        ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER
+       !$omp target teams distribute parallel do collapse(2) private(idx) &
+       !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr)
+#endif
        do j = js, je
          do i = is, ie
-           pos = pos + 1
-           fieldy(i,j,k) = buffer(pos)
+           idx = pos + (j-js)*ni + (i-is) + 1
+           fieldy(i,j,k) = buffer(idx)
          end do
        end do
+       pos = pos + ni*nj
      end do
   endif
 enddo

From d5739ef2e052b7d36008e994e1dee42b8b6b971d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jorge=20Luis=20G=C3=A1lvez=20Vallejo?=
Date: Thu, 30 Oct 2025 16:51:55 +1100
Subject: [PATCH 06/14] add multi gpu support (#2)

* add multi gpu support

* address review comments, add helpful comment for the acc/omp runtime call

---
 mpp/include/mpp_comm_mpi.inc | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc
index 82df7e3164..008e5fcbe5 100644
--- a/mpp/include/mpp_comm_mpi.inc
+++ b/mpp/include/mpp_comm_mpi.inc
@@ -31,6 +31,7 @@
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 !> @brief Initialize the @ref mpp_mod module. Must be called before any usage.
 subroutine mpp_init( flags, localcomm, test_level, alt_input_nml_path )
+  !$ use omp_lib
   integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG
   integer, optional, intent(in) :: localcomm !< Id of MPI communicator used to initialize
   integer, optional, intent(in) :: test_level !< Used to exit initialization at certain stages
@@ -54,6 +55,14 @@
   call MPI_COMM_RANK( mpp_comm_private, pe, error )
   call MPI_COMM_SIZE( mpp_comm_private, npes, error )
 
+  ! set default device to enable multi GPU parallelism
+  ! calls to both OpenACC and OpenMP runtimes are needed
+  ! because we use both do-concurrent and openmp
+  ! if you remove either, the code will run multiple
+  ! ranks on a _single_ GPU. Be careful out there!
+  !$ call omp_set_default_device(pe)
+  !$acc set device_num(pe)
+
   module_is_initialized = .TRUE.
 
if (present(test_level)) then t_level = test_level From b28747180d4b18bb949f7c6f1dfddf75c4360ffa Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 14 Oct 2025 13:20:54 +1100 Subject: [PATCH 07/14] sub __NVCOMPILER with __NVCOMPILER_OPENMP_GPU --- mpp/include/group_update_pack.inc | 68 ++++++++++++++--------------- mpp/include/group_update_unpack.inc | 16 +++---- mpp/include/mpp_group_update.fh | 4 +- 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 27ed8b9438..4c5b7be7e3 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,7 +18,7 @@ !*********************************************************************** if( group%k_loop_inside ) then -#ifndef __NVCOMPILER +#ifndef __NVCOMPILER_OPENMP_GPU !$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) @@ -34,7 +34,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -51,7 +51,7 @@ if( group%k_loop_inside ) then case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -68,7 +68,7 @@ if( group%k_loop_inside ) then case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -85,7 +85,7 @@ if( group%k_loop_inside ) then case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -105,7 +105,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -123,7 +123,7 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -140,7 +140,7 @@ if( group%k_loop_inside ) then else do l=1,nvector ! 
loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -158,7 +158,7 @@ if( group%k_loop_inside ) then case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -178,7 +178,7 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -195,7 +195,7 @@ if( group%k_loop_inside ) then else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -216,7 +216,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -233,7 +233,7 @@ if( group%k_loop_inside ) then case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -251,7 +251,7 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -268,7 +268,7 @@ if( group%k_loop_inside ) then else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -287,7 +287,7 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -304,7 +304,7 @@ if( group%k_loop_inside ) then else do l=1,nvector ! 
loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -323,7 +323,7 @@ if( group%k_loop_inside ) then endif enddo else -#ifndef __NVCOMPILER +#ifndef __NVCOMPILER_OPENMP_GPU !$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) @@ -341,7 +341,7 @@ else case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -356,7 +356,7 @@ else case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -371,7 +371,7 @@ else case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -386,7 +386,7 @@ else case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -404,7 +404,7 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -420,7 +420,7 @@ else if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -435,7 +435,7 @@ else else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -451,7 +451,7 @@ else case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -467,7 +467,7 @@ else if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! 
loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -482,7 +482,7 @@ else else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -501,7 +501,7 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -516,7 +516,7 @@ else case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -532,7 +532,7 @@ else if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -547,7 +547,7 @@ else else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -564,7 +564,7 @@ else if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -579,7 +579,7 @@ else else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 3fa214c7bc..286f07bc83 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,7 +18,7 @@ !*********************************************************************** if( group%k_loop_inside ) then -#ifndef __NVCOMPILER +#ifndef __NVCOMPILER_OPENMP_GPU !$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) @@ -31,7 +31,7 @@ if( group%k_loop_inside ) then if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! 
loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) private(idx) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: field(is:ie,js:je,1:ksize)) @@ -49,7 +49,7 @@ if( group%k_loop_inside ) then else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) @@ -67,7 +67,7 @@ if( group%k_loop_inside ) then else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) @@ -85,7 +85,7 @@ if( group%k_loop_inside ) then endif enddo else -#ifndef __NVCOMPILER +#ifndef __NVCOMPILER_OPENMP_GPU !$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) @@ -100,7 +100,7 @@ else if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr) #endif @@ -115,7 +115,7 @@ else else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER__ +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr) #endif @@ -130,7 +130,7 @@ else else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr) #endif diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 179856c624..6a4cb2465d 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -481,7 +481,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) !---pre-post receive. 
call mpp_clock_begin(group_recv_clock) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target enter data map(alloc: buffer) if(use_device_ptr) #endif do m = 1, nrecv @@ -527,7 +527,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) #include -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target exit data map(release: buffer) if(use_device_ptr) #endif call mpp_clock_end(group_unpk_clock) From 0cc2a77ee70851ad08458a52a1e4f6307e163463 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 16 Dec 2025 16:42:46 +1100 Subject: [PATCH 08/14] allow choice of gpu or cpu parallel To enable this, default(none) had to be removed - otherwise segfaults happen on the GPU. --- mpp/include/group_update_pack.inc | 9 ++++----- mpp/include/group_update_unpack.inc | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 4c5b7be7e3..8c421ba906 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,11 +18,10 @@ !*********************************************************************** if( group%k_loop_inside ) then -#ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) -#endif +!$OMP parallel do shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) & +!$OMP if (.not.use_device_ptr) do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 286f07bc83..a745982cf0 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,11 +18,10 @@ !*********************************************************************** if( group%k_loop_inside ) then -#ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) -#endif +!$OMP parallel do shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) & +!$OMP if (.not.use_device_ptr) do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos From d26eb4dc56ab5b95c7e25669daf753262e0413b7 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 15 Jan 2026 02:31:34 +0000 Subject: [PATCH 09/14] fix omp set device call --- mpp/include/mpp_comm_mpi.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc index 008e5fcbe5..ee66b0cd4e 100644 --- a/mpp/include/mpp_comm_mpi.inc +++ b/mpp/include/mpp_comm_mpi.inc @@ -60,7 +60,7 @@ ! because we use both do-concurrent and openmp ! if you remove either, the code will run multiple ! ranks on a _single_ GPU. Be careful out there! - !$call omp_set_default_device(pe) + !$ call omp_set_default_device(pe) !$acc set device_num(pe) module_is_initialized = .TRUE.
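A note on the one-character fix in PATCH 09/14 above: in free-form Fortran, OpenMP conditional compilation requires the "!$" sentinel to be followed by white space, so "!$call omp_set_default_device(pe)" is parsed as an ordinary comment and the device binding silently never happens. A minimal standalone sketch of the binding follows (not part of the patch; like the FMS code, it assumes rank numbers map one-to-one onto visible GPUs):

program bind_device_sketch
  use mpi
  !$ use omp_lib
  implicit none
  integer :: ierr, pe

  call MPI_Init(ierr)
  call MPI_Comm_rank(MPI_COMM_WORLD, pe, ierr)

  ! Compiled only when OpenMP is enabled; the space after "!$" is what
  ! makes this a conditional-compilation line rather than a comment.
  !$ call omp_set_default_device(pe)
  ! Matching OpenACC binding for the do-concurrent code paths.
  !$acc set device_num(pe)

  call MPI_Finalize(ierr)
end program bind_device_sketch

On a multi-node run one would typically bind with a node-local rank (e.g. from MPI_Comm_split_type with MPI_COMM_TYPE_SHARED) taken modulo omp_get_num_devices(), since the global rank exceeds the per-node device count as soon as a second node is involved.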
From ca753a4a4d51ed44ab4d77d017955b91b7b177ca Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 17 Feb 2026 09:54:24 +1100 Subject: [PATCH 10/14] Revert "allow choice of gpu or cpu parallel" This reverts commit 0cc2a77ee70851ad08458a52a1e4f6307e163463. Having both the CPU and GPU OpenMP directives compiled caused a significant slowdown in GPU packing/unpacking performance - even if parallelism is controlled using the OpenMP "if" clause. --- mpp/include/group_update_pack.inc | 9 +++++---- mpp/include/group_update_unpack.inc | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 8c421ba906..4c5b7be7e3 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,10 +18,11 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) & -!$OMP if (.not.use_device_ptr) +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index a745982cf0..286f07bc83 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,10 +18,11 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) & -!$OMP if (.not.use_device_ptr) +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) +#endif do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos From 2d2cc1a56b87a559dc8afb7e18f32fb96e43d378 Mon Sep 17 00:00:00 2001 From: Marshall Ward Date: Mon, 16 Mar 2026 19:09:09 +0000 Subject: [PATCH 11/14] OMP MPI: Minor cleanups Some very minor changes to the OpenMP target MPI PR: * use_device_ptr -> use_device_addr. This appears to be the updated form (or at least nvfortran says it is). * Whitespace added to `!$ use omp_lib`. This does not seem crucial, but from our previous discussion it appears more correct. * Removal of some trailing whitespace. --- mpp/include/mpp_comm_mpi.inc | 10 +++++----- mpp/include/mpp_transmit_mpi.fh | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc index ee66b0cd4e..93eb7a9c83 100644 --- a/mpp/include/mpp_comm_mpi.inc +++ b/mpp/include/mpp_comm_mpi.inc @@ -31,7 +31,7 @@ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !> @brief Initialize the @ref mpp_mod module. Must be called before any usage.
subroutine mpp_init( flags, localcomm, test_level, alt_input_nml_path ) - !$use omp_lib + !$ use omp_lib integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG integer, optional, intent(in) :: localcomm !< Id of MPI communicator used to initialize integer, optional, intent(in) :: test_level !< Used to exit initialization at certain stages @@ -55,10 +55,10 @@ call MPI_COMM_RANK( mpp_comm_private, pe, error ) call MPI_COMM_SIZE( mpp_comm_private, npes, error ) - ! set default device to enable multi GPU parallelism - ! calls to both OpenACC and OpenMP runtimes are needed - ! because we use both do-concurrent and openmp - ! if you remove either, the code will run multiple + ! set default device to enable multi GPU parallelism + ! calls to both OpenACC and OpenMP runtimes are needed + ! because we use both do-concurrent and openmp + ! if you remove either, the code will run multiple ! ranks on a _single_ GPU. Be careful out there! !$ call omp_set_default_device(pe) !$acc set device_num(pe) diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh index 729cc165b0..543f6ae0cc 100644 --- a/mpp/include/mpp_transmit_mpi.fh +++ b/mpp/include/mpp_transmit_mpi.fh @@ -88,7 +88,7 @@ call mpp_error(FATAL, & & "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply") if (use_device_ptr) then - !$omp target data use_device_ptr(put_data) + !$omp target data use_device_addr(put_data) call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & request_send(cur_send_request), error) !$omp end target data @@ -143,7 +143,7 @@ call mpp_error(FATAL, & "MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply") if (use_device_ptr) then - !$omp target data use_device_ptr(get_data) + !$omp target data use_device_addr(get_data) call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & request_recv(cur_recv_request), error ) !$omp end target data From fe5ddda44914acc52671923a829baf937d6eb79e Mon Sep 17 00:00:00 2001 From: Marshall Ward Date: Mon, 16 Mar 2026 19:47:25 +0000 Subject: [PATCH 12/14] OMP target MPI: line length compliance This patch refactors several lines to keep within the 121-character line length limit prescribed by the FMS style guidelines. 
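For reference, the continuation style adopted throughout this patch: a directive line ending in "&" is continued on the next line, which restarts with the "!$omp" sentinel. Taking one of the pack-loop directives below as the example, the over-long form

!$omp target teams distribute parallel do collapse(3) private(idx) map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)

becomes the equivalent continued form

!$omp target teams distribute parallel do collapse(3) private(idx) &
!$omp map(to: field(is:ie,js:je,1:ksize)) &
!$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp if (use_device_ptr)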
--- mpp/include/group_update_pack.inc | 128 ++++++++++++++++++++++-------- mpp/include/mpp_transmit.inc | 9 ++- 2 files changed, 102 insertions(+), 35 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 4c5b7be7e3..2422c1d8fa 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -36,7 +36,9 @@ if( group%k_loop_inside ) then ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = js, je @@ -53,7 +55,9 @@ if( group%k_loop_inside ) then ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = is, ie @@ -70,7 +74,9 @@ if( group%k_loop_inside ) then ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = ie, is, -1 @@ -87,7 +93,9 @@ if( group%k_loop_inside ) then ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -107,7 +115,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = js, je @@ -125,7 +135,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = is, ie @@ -142,7 +154,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = is, ie @@ -160,7 +174,9 @@ if( group%k_loop_inside ) then 
ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = ie, is, -1 @@ -180,7 +196,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if(use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -197,7 +215,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -218,7 +238,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = js, je @@ -235,7 +257,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = is, ie @@ -253,7 +277,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = ie, is, -1 @@ -270,7 +296,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = ie, is, -1 @@ -289,7 +317,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = 
je, js, -1 @@ -306,7 +336,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if(use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -343,7 +375,9 @@ else ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = js, je do i = is, ie @@ -358,7 +392,9 @@ else ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = is, ie do j = je, js, -1 @@ -373,7 +409,9 @@ else ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = ie, is, -1 do j = js, je @@ -388,7 +426,9 @@ else ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 @@ -406,7 +446,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = js, je do i = is, ie @@ -422,7 +464,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = is, ie do j = je, js, -1 @@ -437,7 +481,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = is, ie do j = je, js, -1 @@ -453,7 +499,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: 
fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = ie, is, -1 do j = js, je @@ -469,7 +517,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 @@ -484,7 +534,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 @@ -503,7 +555,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = js, je do i = is, ie @@ -518,7 +572,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if(use_device_ptr) #endif do i = is, ie do j = je, js, -1 @@ -534,7 +590,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = ie, is, -1 do j = js, je @@ -549,7 +607,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = ie, is, -1 do j = js, je @@ -566,7 +626,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 @@ -581,7 +643,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 diff --git a/mpp/include/mpp_transmit.inc b/mpp/include/mpp_transmit.inc index b3eac0c3ff..224366f384 100644 --- a/mpp/include/mpp_transmit.inc +++ 
b/mpp/include/mpp_transmit.inc @@ -190,7 +190,8 @@ ptr = LOC(get_data) get_len=1; if(PRESENT(glen))get_len=glen - call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request, omp_offload=omp_offload ) + call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, & + block, tag, recv_request=request, omp_offload=omp_offload ) end subroutine MPP_RECV_SCALAR_ @@ -209,7 +210,8 @@ pointer( ptr, put_data1D ) ptr = LOC(put_data) put_len=1; if(PRESENT(plen))put_len=plen - call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request, omp_offload=omp_offload ) + call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, & + tag=tag, send_request=request, omp_offload=omp_offload ) end subroutine MPP_SEND_SCALAR_ @@ -222,7 +224,8 @@ integer, intent(out), optional :: request MPP_TYPE_ :: dummy(1,1) - call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, block, tag, recv_request=request ) + call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, & + block, tag, recv_request=request ) end subroutine MPP_RECV_2D_ subroutine MPP_SEND_2D_( put_data, put_len, to_pe, tag, request ) From c91024a279f7c48044a15f74aa819024b9ac32ea Mon Sep 17 00:00:00 2001 From: Marshall Ward Date: Tue, 17 Mar 2026 14:14:53 +0000 Subject: [PATCH 13/14] OMP MPI: Update nocomm interface The no-comm (no MPI) interface has been updated to support the new omp_offload argument. --- mpp/include/mpp_transmit_mpi.fh | 6 +++--- mpp/include/mpp_transmit_nocomm.fh | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh index 543f6ae0cc..cbfa070de1 100644 --- a/mpp/include/mpp_transmit_mpi.fh +++ b/mpp/include/mpp_transmit_mpi.fh @@ -52,12 +52,12 @@ integer :: rsize logical :: use_device_ptr - use_device_ptr = .false. - if (present(omp_offload)) use_device_ptr = omp_offload - if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload + block_comm = .true. if(PRESENT(block)) block_comm = block diff --git a/mpp/include/mpp_transmit_nocomm.fh b/mpp/include/mpp_transmit_nocomm.fh index 8baaee60f5..a9bb5da2a1 100644 --- a/mpp/include/mpp_transmit_nocomm.fh +++ b/mpp/include/mpp_transmit_nocomm.fh @@ -36,7 +36,7 @@ !!words from an array of any rank to be passed (avoiding f90 rank conformance check) !!caller is responsible for completion checks (mpp_sync_self) before and after subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, & - & send_request ) + & send_request, omp_offload ) integer, intent(in) :: put_len, to_pe, get_len, from_pe MPP_TYPE_, intent(in) :: put_data(*) @@ -44,6 +44,8 @@ logical, intent(in), optional :: block integer, intent(in), optional :: tag integer, intent(out), optional :: recv_request, send_request + logical, intent(in), optional :: omp_offload + ! NOTE: omp_offload is unused in this function integer :: i, outunit MPP_TYPE_, allocatable, save :: local_data(:) !local copy used by non-parallel code (no SHMEM or MPI) @@ -55,6 +57,7 @@ if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. 
from_pe.EQ.NULL_PE )return + outunit = stdout() if( debug )then call SYSTEM_CLOCK(tick) From b4fae8dcc4f938fe000283bbb2a90c3cf9120a4b Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 17 Apr 2026 13:48:32 +1000 Subject: [PATCH 14/14] use openmp cpu if ompoffload=.false. This ensures that the (un)packing steps in do_group_update are performed with OpenMP CPU parallelism if ompoffload=.false.. Previously they would only run serially. This is implemented by undefining the GPU macro (currently __NVCOMPILER_OPENMP_GPU) and re-including the (un)packing files. To make this work, the default(shared) clause was used in all the relevant OpenMP directives. If default(none) is used, the loops hang or segfault. --- mpp/include/group_update_pack.inc | 105 +++++++++++++++++++--------- mpp/include/group_update_unpack.inc | 27 ++++--- mpp/include/mpp_group_update.fh | 47 +++++++++++++ 3 files changed, 137 insertions(+), 42 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 2422c1d8fa..a6ee72d7b8 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,8 +18,13 @@ !*********************************************************************** if( group%k_loop_inside ) then +! nvfortran + cray pointers imposes some restrictions on the loops below: +! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops +! must be ported rather than the whole outer loop. +! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop". +! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)". #ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) #endif @@ -35,7 +40,8 @@ if( group%k_loop_inside ) then do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: field(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -54,7 +60,8 @@ if( group%k_loop_inside ) then do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: field(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -73,7 +80,8 @@ if( group%k_loop_inside ) then do l=1,group%nscalar !
loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: field(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -92,7 +100,8 @@ if( group%k_loop_inside ) then do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: field(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -114,7 +123,8 @@ if( group%k_loop_inside ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -134,7 +144,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -153,7 +164,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -173,7 +185,8 @@ if( group%k_loop_inside ) then do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -195,7 +208,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if(use_device_ptr) @@ -214,7 +228,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! 
loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -237,7 +252,8 @@ if( group%k_loop_inside ) then do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -256,7 +272,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -276,7 +293,8 @@ if( group%k_loop_inside ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -295,7 +313,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -316,7 +335,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -335,7 +355,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! 
loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if(use_device_ptr) @@ -356,7 +377,7 @@ if( group%k_loop_inside ) then enddo else #ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) #endif @@ -374,7 +395,8 @@ else do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: field(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -391,7 +413,8 @@ else do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: field(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -408,7 +431,8 @@ else do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: field(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -425,7 +449,8 @@ else do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: field(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -445,7 +470,8 @@ else do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -463,7 +489,8 @@ else do l=1,nvector ! 
loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -480,7 +507,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -498,7 +526,8 @@ else do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -516,7 +545,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -533,7 +563,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -554,7 +585,8 @@ else do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -571,7 +603,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if(use_device_ptr) @@ -589,7 +622,8 @@ else do l=1, nvector ! 
loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -606,7 +640,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -625,7 +660,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -642,7 +678,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 286f07bc83..25a31ee8fc 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,8 +18,13 @@ !*********************************************************************** if( group%k_loop_inside ) then +! nvfortran + cray pointers imposes some restrictions on the loops below: +! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops +! must be ported rather than the whole outer loop. +! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop". +! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)". #ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) #endif @@ -32,7 +37,8 @@ if( group%k_loop_inside ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) private(idx) & + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: field(is:ie,js:je,1:ksize)) #endif @@ -50,7 +56,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! 
loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) #endif @@ -68,7 +75,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) #endif @@ -86,7 +94,7 @@ if( group%k_loop_inside ) then enddo else #ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) #endif @@ -101,7 +109,8 @@ else do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr) #endif do j = js, je @@ -116,7 +125,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr) #endif do j = js, je @@ -131,7 +141,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr) #endif do j = js, je diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 6a4cb2465d..75af692500 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -503,7 +503,19 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) call mpp_clock_begin(group_pack_clock) !pack the data buffer_start_pos = 0 + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! OpenMP offload support. 
Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU +#else +#include +#endif + endif call mpp_clock_end(group_pack_clock) call mpp_clock_begin(group_send_clock) @@ -526,10 +538,20 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) !---unpack the buffer nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else #ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU !$omp target exit data map(release: buffer) if(use_device_ptr) +#else +#include #endif + endif call mpp_clock_end(group_unpk_clock) ! ---northern boundary fold @@ -738,7 +760,19 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) call mpp_clock_begin(nonblock_group_pack_clock) npack = group%npack buffer_start_pos = group%buffer_start_pos + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU +#else +#include +#endif + endif call mpp_clock_end(nonblock_group_pack_clock) call mpp_clock_begin(nonblock_group_send_clock) @@ -807,7 +841,20 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) call mpp_clock_begin(nonblock_group_unpk_clock) buffer_start_pos = group%buffer_start_pos + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU + !$omp target exit data map(release: buffer) if(use_device_ptr) +#else +#include +#endif + endif call mpp_clock_end(nonblock_group_unpk_clock) ! ---northern boundary fold
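To summarise the mechanism introduced in PATCH 14/14, a minimal standalone sketch follows (names are hypothetical: GPU_OFFLOAD stands in for __NVCOMPILER_OPENMP_GPU, pack_body.inc for the (un)packing include files, and the array shapes are invented). One include file carries the loop body with both directive flavours; the caller instantiates it twice, the second time with the macro temporarily undefined, and branches between the instantiations on the runtime flag:

! ---- pack_body.inc (hypothetical) -------------------------------------
! One copy of the packing loop; the directive flavour is chosen by
! whether GPU_OFFLOAD is defined at the point of inclusion.
#ifdef GPU_OFFLOAD
!$omp target teams distribute parallel do collapse(3) default(shared) &
!$omp map(to: src) map(from: buf) if(use_device)
#else
!$omp parallel do default(shared) private(i, j)
#endif
do k = 1, nk
   do j = 1, nj
      do i = 1, ni
         buf(i + (j-1)*ni + (k-1)*ni*nj) = src(i, j, k)
      end do
   end do
end do

! ---- caller (sketch) ---------------------------------------------------
! Instantiates the body twice and selects at run time, mirroring the
! switches added to mpp_group_update.fh above.
subroutine pack_sketch(src, buf, ni, nj, nk, use_device)
  integer, intent(in)  :: ni, nj, nk
  real,    intent(in)  :: src(ni, nj, nk)
  real,    intent(out) :: buf(ni*nj*nk)
  logical, intent(in)  :: use_device
  integer :: i, j, k
  if (use_device) then
#include "pack_body.inc"
  else
#ifdef GPU_OFFLOAD
#undef GPU_OFFLOAD
! the second instantiation picks up the CPU directives
#include "pack_body.inc"
#define GPU_OFFLOAD
#else
#include "pack_body.inc"
#endif
  endif
end subroutine pack_sketch

Each instantiation carries only one directive flavour, so the GPU build never wraps a host parallel region around the offloaded loops - the mixing that the PATCH 10/14 revert identified as the source of the packing/unpacking slowdown.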