Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
407 changes: 337 additions & 70 deletions mpp/include/group_update_pack.inc

Large diffs are not rendered by default.

88 changes: 68 additions & 20 deletions mpp/include/group_update_unpack.inc
Original file line number Diff line number Diff line change
Expand Up @@ -18,92 +18,140 @@
!***********************************************************************

if( group%k_loop_inside ) then
!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
! nvfortran + cray pointers imposes some restrictions on the loops below:
! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops
! must be ported rather than the whole outer loop.
! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop".
! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)".
#ifndef __NVCOMPILER_OPENMP_GPU
!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, &
!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k )
!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx)
#endif
do n = nunpack, 1, -1
buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
pos = buffer_pos
is = group%unpack_is(n); ie = group%unpack_ie(n)
js = group%unpack_js(n); je = group%unpack_je(n)
is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
if( group%unpack_type(n) == FIELD_S ) then
do l=1,nscalar ! loop over number of fields
ptr_field = group%addrs_s(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) if(use_device_ptr) default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: field(is:ie,js:je,1:ksize))
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
field(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How are these two implementations equivalent? Is new idx = old pos always?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, they're equivalent. For any iteration, idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 produces the same value that pos would have had at that point. The formula accounts for all the iterations that would have occurred in the nested loops up to that (i,j,k) position.

The reason for the change is that each nested iteration is now independent and can be performed in parallel.

field(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
else if( group%unpack_type(n) == FIELD_X ) then
do l=1,nvector ! loop over number of fields
ptr_fieldx = group%addrs_x(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr)
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
fieldx(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
fieldx(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
else if( group%unpack_type(n) == FIELD_Y ) then
do l=1,nvector ! loop over number of fields
ptr_fieldy = group%addrs_y(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldy,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr)
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
fieldy(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
fieldy(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
endif
enddo
else
!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
#ifndef __NVCOMPILER_OPENMP_GPU
!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, &
!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k)
!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx)
#endif
do nk = nunpack*ksize, 1, -1
n = (nk-1)/ksize + 1
k = mod((nk-1), ksize) + 1
buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
pos = buffer_pos + (k-1)*group%unpack_size(n)
is = group%unpack_is(n); ie = group%unpack_ie(n)
js = group%unpack_js(n); je = group%unpack_je(n)
is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
if( group%unpack_type(n) == FIELD_S ) then
do l=1,nscalar ! loop over number of fields
ptr_field = group%addrs_s(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr)
#endif
do j = js, je
do i = is, ie
pos = pos + 1
field(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
field(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
else if( group%unpack_type(n) == FIELD_X ) then
do l=1,nvector ! loop over number of fields
ptr_fieldx = group%addrs_x(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr)
#endif
do j = js, je
do i = is, ie
pos = pos + 1
fieldx(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
fieldx(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
else if( group%unpack_type(n) == FIELD_Y ) then
do l=1,nvector ! loop over number of fields
ptr_fieldy = group%addrs_y(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr)
#endif
do j = js, je
do i = is, ie
pos = pos + 1
fieldy(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
fieldy(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
endif
enddo
Expand Down
9 changes: 9 additions & 0 deletions mpp/include/mpp_comm_mpi.inc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!> @brief Initialize the @ref mpp_mod module. Must be called before any usage.
subroutine mpp_init( flags, localcomm, test_level, alt_input_nml_path )
!$ use omp_lib
integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG
integer, optional, intent(in) :: localcomm !< Id of MPI communicator used to initialize
integer, optional, intent(in) :: test_level !< Used to exit initialization at certain stages
Expand All @@ -54,6 +55,14 @@
call MPI_COMM_RANK( mpp_comm_private, pe, error )
call MPI_COMM_SIZE( mpp_comm_private, npes, error )

! set default device to enable multi GPU parallelism
! calls to both OpenACC and OpenMP runtimes are needed
! because we use both do-concurrent and openmp
! if you remove either, the code will run multiple
! ranks on a _single_ GPU. Be careful out there!
!$ call omp_set_default_device(pe)
!$acc set device_num(pe)

module_is_initialized = .TRUE.
if (present(test_level)) then
t_level = test_level
Expand Down
78 changes: 69 additions & 9 deletions mpp/include/mpp_group_update.fh
Original file line number Diff line number Diff line change
Expand Up @@ -419,20 +419,22 @@ subroutine MPP_CREATE_GROUP_UPDATE_4D_V_( group, fieldx, fieldy, domain, flags,
end subroutine MPP_CREATE_GROUP_UPDATE_4D_V_


subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload)
type(mpp_group_update_type), intent(inout) :: group
type(domain2D), intent(inout) :: domain
MPP_TYPE_, intent(in) :: d_type
logical, optional, intent(in) :: omp_offload

integer :: nscalar, nvector, nlist
logical :: recv_y(8)
integer :: nsend, nrecv, flags_v
integer :: msgsize
integer :: from_pe, to_pe, buffer_pos, pos
integer :: from_pe, to_pe, buffer_pos, pos, idx
integer :: ksize, is, ie, js, je
integer :: n, l, m, i, j, k, buffer_start_pos, nk
integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
integer :: shift, gridtype, midpoint
integer :: npack, nunpack, rotation, isd
logical :: use_device_ptr

MPP_TYPE_ :: buffer(mpp_domains_stack_size)
MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s)
Expand All @@ -448,6 +450,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
nlist = size(domain%list(:))
gridtype = group%gridtype

use_device_ptr = .false.
if (present(omp_offload)) use_device_ptr = omp_offload

!--- ksize_s must equal ksize_v
if(nvector > 0 .AND. nscalar > 0) then
if(group%ksize_s .NE. group%ksize_v) then
Expand Down Expand Up @@ -476,13 +481,16 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)

!---pre-post receive.
call mpp_clock_begin(group_recv_clock)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target enter data map(alloc: buffer) if(use_device_ptr)
#endif
do m = 1, nrecv
msgsize = group%recv_size(m)
from_pe = group%from_pe(m)
if( msgsize .GT. 0 )then
buffer_pos = group%buffer_pos_recv(m)
call mpp_recv( buffer(buffer_pos+1), glen=msgsize, from_pe=from_pe, block=.false., &
tag=COMM_TAG_1)
tag=COMM_TAG_1, omp_offload=omp_offload)
end if
end do

Expand All @@ -495,7 +503,19 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
call mpp_clock_begin(group_pack_clock)
!pack the data
buffer_start_pos = 0
! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with
! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
if (use_device_ptr) then
#include <group_update_pack.inc>
else
#ifdef __NVCOMPILER_OPENMP_GPU
#undef __NVCOMPILER_OPENMP_GPU
#include <group_update_pack.inc>
#define __NVCOMPILER_OPENMP_GPU
#else
#include <group_update_pack.inc>
#endif
endif
call mpp_clock_end(group_pack_clock)

call mpp_clock_begin(group_send_clock)
Expand All @@ -504,7 +524,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
if( msgsize .GT. 0 )then
buffer_pos = group%buffer_pos_send(n)
to_pe = group%to_pe(n)
call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1)
call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1, omp_offload=omp_offload)
endif
enddo
call mpp_clock_end(group_send_clock)
Expand All @@ -518,7 +538,20 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
!---unpack the buffer
nunpack = group%nunpack
call mpp_clock_begin(group_unpk_clock)
! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with
! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
if (use_device_ptr) then
#include <group_update_unpack.inc>
else
#ifdef __NVCOMPILER_OPENMP_GPU
#undef __NVCOMPILER_OPENMP_GPU
#include <group_update_unpack.inc>
#define __NVCOMPILER_OPENMP_GPU
!$omp target exit data map(release: buffer) if(use_device_ptr)
#else
#include <group_update_unpack.inc>
#endif
endif
call mpp_clock_end(group_unpk_clock)

! ---northern boundary fold
Expand Down Expand Up @@ -644,10 +677,11 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
integer :: nscalar, nvector
integer :: nsend, nrecv, flags_v
integer :: msgsize, npack, rotation
integer :: from_pe, to_pe, buffer_pos, pos
integer :: from_pe, to_pe, buffer_pos, pos, idx
integer :: ksize, is, ie, js, je
integer :: n, l, m, i, j, k, buffer_start_pos, nk
integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
logical :: reuse_buf_pos
logical, parameter :: use_device_ptr = .false. ! placeholder
character(len=8) :: text

MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:)))
Expand Down Expand Up @@ -726,7 +760,19 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
call mpp_clock_begin(nonblock_group_pack_clock)
npack = group%npack
buffer_start_pos = group%buffer_start_pos
! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with
! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
if (use_device_ptr) then
#include <group_update_pack.inc>
else
#ifdef __NVCOMPILER_OPENMP_GPU
#undef __NVCOMPILER_OPENMP_GPU
#include <group_update_pack.inc>
#define __NVCOMPILER_OPENMP_GPU
#else
#include <group_update_pack.inc>
#endif
endif
call mpp_clock_end(nonblock_group_pack_clock)

call mpp_clock_begin(nonblock_group_send_clock)
Expand All @@ -749,11 +795,12 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
MPP_TYPE_, intent(in) :: d_type

integer :: nsend, nrecv, nscalar, nvector
integer :: k, buffer_pos, pos, m, n, l
integer :: k, buffer_pos, pos, m, n, l, idx
integer :: is, ie, js, je, ksize, i, j
integer :: shift, gridtype, midpoint, flags_v
integer :: nunpack, rotation, buffer_start_pos, nk, isd
integer :: nunpack, rotation, buffer_start_pos, ni, nj, nk, isd
logical :: recv_y(8)
logical, parameter :: use_device_ptr = .false. ! placeholder
MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:)))
MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s)
MPP_TYPE_ :: fieldx(group%is_x:group%ie_x,group%js_x:group%je_x, group%ksize_v)
Expand Down Expand Up @@ -794,7 +841,20 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)

call mpp_clock_begin(nonblock_group_unpk_clock)
buffer_start_pos = group%buffer_start_pos
! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with
! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
if (use_device_ptr) then
#include <group_update_unpack.inc>
else
#ifdef __NVCOMPILER_OPENMP_GPU
#undef __NVCOMPILER_OPENMP_GPU
#include <group_update_unpack.inc>
#define __NVCOMPILER_OPENMP_GPU
!$omp target exit data map(release: buffer) if(use_device_ptr)
#else
#include <group_update_unpack.inc>
#endif
endif
call mpp_clock_end(nonblock_group_unpk_clock)

! ---northern boundary fold
Expand Down
Loading