diff --git a/doc/src/fix_qeq_reaxff.rst b/doc/src/fix_qeq_reaxff.rst index 74411383968..a536fc31181 100644 --- a/doc/src/fix_qeq_reaxff.rst +++ b/doc/src/fix_qeq_reaxff.rst @@ -24,10 +24,11 @@ Syntax .. parsed-literal:: - keyword = *dual* or *maxiter* or *nowarn* + keyword = *dual* or *maxiter* or *nowarn* or *matfree* *dual* = process S and T matrix in parallel (only for qeq/reaxff/omp) *maxiter* N = limit the number of iterations to *N* *nowarn* = do not print a warning message if the maximum number of iterations was reached + *matfree* = use a matrix-free approach for applying the H matrix (only for qeq/reaxff/kk) Examples """""""" @@ -90,6 +91,14 @@ same fixed number of QEq iterations is desired, which can be achieved by using a very small tolerance and setting *maxiter* to the desired number of iterations. +The optional *matfree* keyword replaces the sequence of +explicitly constructing the H matrix, then (repeatedly) applying it +with a matrix-free approach where the H matrix is effectively +regenerated each time it is applied. This trades performance for +reduced memory requirements because it avoids the overheads of +storing the matrix. This is only supported for the *qeq/reaxff/kk* +style, with both full and half qeq neighbor lists supported. + .. note:: In order to solve the self-consistent equations for electronegativity diff --git a/doc/src/package.rst b/doc/src/package.rst index eefecac4fd4..f8f7b86ddd1 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -79,7 +79,7 @@ Syntax *no_affinity* values = none *kokkos* args = keyword value ... zero or more keyword/value pairs may be appended - keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *sort* or *atom/map* or *gpu/aware* or *pair/only* + keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *comm/fix/reverse* or *sort* or *atom/map* or *gpu/aware* or *pair/only* *neigh* value = *full* or *half* full = full neighbor list half = half neighbor list built in thread-safe manner @@ -98,7 +98,7 @@ Syntax *binsize* value = size size = bin size for neighbor list construction (distance units) *comm* value = *no* or *host* or *device* - use value for comm/exchange and comm/forward and comm/pair/forward and comm/fix/forward and comm/reverse + use value for comm/exchange and comm/forward and comm/pair/forward and comm/fix/forward and comm/reverse and comm/fix/reverse *comm/exchange* value = *no* or *host* or *device* *comm/forward* value = *no* or *host* or *device* *comm/pair/forward* value = *no* or *device* @@ -110,6 +110,10 @@ Syntax *comm/pair/reverse* value = *no* or *device* *no* = perform communication pack/unpack in non-KOKKOS mode *device* = perform pack/unpack on device (e.g. on GPU) + *comm/fix/reverse* value = *no* or *host* or *device* + *no* = perform communication pack/unpack in non-KOKKOS mode + *host* = perform pack/unpack on host (e.g. with OpenMP threading) + *device* = perform pack/unpack on device (e.g. on GPU) *sort* value = *no* or *device* *no* = perform atom sorting in non-KOKKOS mode *device* = perform atom sorting on device (e.g. on GPU) @@ -545,7 +549,8 @@ rule of thumb may give too large a binsize and the default should be overridden with a smaller value. The *comm* and *comm/exchange* and *comm/forward* and *comm/pair/forward* -and *comm/fix/forward* and *comm/reverse* and *comm/pair/reverse* +and *comm/fix/forward* and *comm/reverse* and *comm/pair/reverse* and +*comm/fix/reverse* keywords determine whether the host or device performs the packing and unpacking of data when communicating per-atom data between processors. "Exchange" communication happens only on timesteps that neighbor lists @@ -573,7 +578,8 @@ keywords, if a value of *host* is used it will be automatically be changed to *no* since these keywords don't support *host* mode. The value of *no* will also always be used when running on the CPU, i.e. setting the value to *device* will have no effect if the pair/fix style is -running on the CPU. For the *comm/fix/forward* or *comm/pair/reverse* +running on the CPU. For the *comm/fix/forward* or *comm/pair/reverse* or +*comm/fix/reverse* keywords, not all styles support *device* mode and in that case will run in *no* mode instead. diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 00cdc31f5b6..e65986f9cc4 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -12,6 +12,11 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + Contributing authors: Christian Trott (SNL), Stan Moore (SNL), + Lewis Russell (U. Strathclyde), Balint Joo (NVIDIA) +------------------------------------------------------------------------- */ + #include "comm_kokkos.h" #include "atom.h" @@ -113,6 +118,7 @@ void CommKokkos::init() reverse_pair_comm_legacy = lmp->kokkos->reverse_pair_comm_legacy; forward_fix_comm_legacy = lmp->kokkos->forward_fix_comm_legacy; reverse_comm_legacy = lmp->kokkos->reverse_comm_legacy; + reverse_fix_comm_legacy = lmp->kokkos->reverse_fix_comm_legacy; exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host; forward_comm_on_host = lmp->kokkos->forward_comm_on_host; reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host; @@ -352,7 +358,8 @@ void CommKokkos::reverse_comm_device() void CommKokkos::forward_comm(Fix *fix, int size) { - if (fix->execution_space == Host || fix->execution_space == HostKK || !fix->forward_comm_device || forward_fix_comm_legacy) { + if (fix->execution_space == Host || fix->execution_space == HostKK || + !fix->forward_comm_device || forward_fix_comm_legacy) { k_sendlist.sync_host(); CommBrick::forward_comm(fix, size); } else { @@ -444,10 +451,86 @@ void CommKokkos::forward_comm_device(Fix *fix, int size) void CommKokkos::reverse_comm(Fix *fix, int size) { - k_sendlist.sync_host(); - CommBrick::reverse_comm(fix, size); + if (fix->execution_space == Host || fix->execution_space == HostKK || + !fix->reverse_comm_device || reverse_fix_comm_legacy) { + k_sendlist.sync_host(); + CommBrick::reverse_comm(fix, size); + } else { + k_sendlist.sync_device(); + reverse_comm_device(fix,size); + } } +/* ---------------------------------------------------------------------- */ + +template +void CommKokkos::reverse_comm_device(Fix *fix, int size) +{ + int iswap, n, nsize; + MPI_Request request; + DAT::tdual_double_1d k_buf_tmp; + + if (size) nsize = size; + else nsize = fix->comm_reverse; + KokkosBase* fixKKBase = dynamic_cast(fix); + + for (iswap = 0; iswap < nswap; iswap++) { + int n = MAX(max_buf_fix,nsize*sendnum[iswap]); + n = MAX(n,nsize*recvnum[iswap]); + if (n > max_buf_fix) + grow_buf_fix(n); + } + + // exchange data with another proc + // if other proc is self, just copy + + for (iswap = nswap-1; iswap >= 0; iswap--) { + + n = fixKKBase->pack_reverse_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send_fix); + + // exchange with another proc + // if self, set recv buffer to send buffer + + if (sendproc[iswap] != me) { + double* buf_send_fix; + double* buf_recv_fix; + if (lmp->kokkos->gpu_aware_flag) { + buf_send_fix = k_buf_send_fix.view().data(); + buf_recv_fix = k_buf_recv_fix.view().data(); + } else { + k_buf_send_fix.modify(); + k_buf_send_fix.sync(); + buf_send_fix = k_buf_send_fix.view_host().data(); + buf_recv_fix = k_buf_recv_fix.view_host().data(); + } + + if (sendnum[iswap]) { + DeviceType().fence(); + MPI_Irecv(buf_recv_fix,nsize*sendnum[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + } + if (recvnum[iswap]) { + DeviceType().fence(); + MPI_Send(buf_send_fix,n,MPI_DOUBLE,recvproc[iswap],0,world); + } + if (sendnum[iswap]) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + DeviceType().fence(); + } + + if (!lmp->kokkos->gpu_aware_flag) { + k_buf_recv_fix.modify(); + k_buf_recv_fix.sync(); + } + k_buf_tmp = k_buf_recv_fix; + } else k_buf_tmp = k_buf_send_fix; + + // unpack buffer + auto k_sendlist_iswap = Kokkos::subview(k_sendlist,iswap,Kokkos::ALL); + fixKKBase->unpack_reverse_comm_kokkos(sendnum[iswap], k_sendlist_iswap,k_buf_tmp); + } + +} /* ---------------------------------------------------------------------- reverse communication invoked by a Fix with variable size data diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h index b91740304d2..875ed290d8a 100644 --- a/src/KOKKOS/comm_kokkos.h +++ b/src/KOKKOS/comm_kokkos.h @@ -30,6 +30,7 @@ class CommKokkos : public CommBrick { bool reverse_pair_comm_legacy; bool forward_fix_comm_legacy; bool reverse_comm_legacy; + bool reverse_fix_comm_legacy; bool exchange_comm_on_host; bool forward_comm_on_host; bool reverse_comm_on_host; @@ -64,6 +65,7 @@ class CommKokkos : public CommBrick { template void forward_comm_device(Pair *pair, int size=0); template void reverse_comm_device(Pair *pair, int size=0); template void forward_comm_device(Fix *fix, int size=0); + template void reverse_comm_device(Fix *fix, int size=0); template void exchange_device(); template void borders_device(); diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index e5dc705aa5b..73202ba4279 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -23,6 +23,8 @@ - Fused the CG solve for "S" and "T" matrices - Improved the SpMV algorithm by using vector instead of team level parallelism on GPUs + Evan Weinberg (NVIDIA): Matrix-free representation + Balint Joo (NVIDIA): Device-resident reverse comms ------------------------------------------------------------------------- */ #include "fix_qeq_reaxff_kokkos.h" @@ -55,7 +57,7 @@ FixQEqReaxFFKokkos(LAMMPS *lmp, int narg, char **arg) : { kokkosable = 1; comm_forward = comm_reverse = 2; // fused - forward_comm_device = exchange_comm_device = sort_device = 1; + forward_comm_device = reverse_comm_device = exchange_comm_device = sort_device = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; @@ -75,6 +77,9 @@ FixQEqReaxFFKokkos(LAMMPS *lmp, int narg, char **arg) : d_mfill_offset = typename AT::t_bigint_scalar("qeq/kk:mfill_offset"); converged = 0; + + if (comm->me == 0 && matrix_free) + utils::logmesg(lmp, "Using matrix-free form for fix qeq/reax/kk\n"); } /* ---------------------------------------------------------------------- */ @@ -213,43 +218,47 @@ void FixQEqReaxFFKokkos::pre_force(int /*vflag*/) allocate_array(); - // get max number of neighbor + if (!matrix_free) { + // create the functor that handles the H matrix + FixQEqReaxFFKokkosNeighborFunctor neighbor_functor(*this); - if (!allocated_flag || last_allocate < neighbor->lastcall - || nlocal_last_allocate != nlocal) { - allocate_matrix(); - last_allocate = update->ntimestep; - nlocal_last_allocate = nlocal; - } + // get max number of neighbor - // compute_H + if (!allocated_flag || last_allocate < neighbor->lastcall + || nlocal_last_allocate != nlocal) { + // determine the total space for the H matrix + m_cap_big = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0,nn), neighbor_functor, m_cap_big); - if (execution_space == HostKK) { // CPU - if (neighflag == FULL) { - FixQEqReaxFFKokkosComputeHFunctor computeH_functor(this); - Kokkos::parallel_scan(nn,computeH_functor); - } else { // HALF and HALFTHREAD are the same - FixQEqReaxFFKokkosComputeHFunctor computeH_functor(this); - Kokkos::parallel_scan(nn,computeH_functor); + allocate_matrix(); + neighbor_functor.update_after_allocation(*this); + last_allocate = update->ntimestep; + nlocal_last_allocate = nlocal; } - } else { // GPU, use teams - Kokkos::deep_copy(d_mfill_offset,0); - - int atoms_per_team = FixQEqReaxFFKokkos::compute_h_teamsize; - int vector_length = FixQEqReaxFFKokkos::compute_h_vectorsize; - - int num_teams = nn / atoms_per_team + (nn % atoms_per_team ? 1 : 0); - - Kokkos::TeamPolicy policy(num_teams, atoms_per_team, - vector_length); - if (neighflag == FULL) { - FixQEqReaxFFKokkosComputeHFunctor computeH_functor( - this, atoms_per_team, vector_length); - Kokkos::parallel_for(policy, computeH_functor); - } else { // HALF and HALFTHREAD are the same - FixQEqReaxFFKokkosComputeHFunctor computeH_functor( - this, atoms_per_team, vector_length); - Kokkos::parallel_for(policy, computeH_functor); + + // compute_H + + if constexpr (is_host) { // CPU + if (neighflag == FULL) { + Kokkos::parallel_scan(Kokkos::RangePolicy>(0,nn), neighbor_functor); + } else { + Kokkos::parallel_scan(Kokkos::RangePolicy>(0,nn), neighbor_functor); + } + } else { // GPU, use teams + Kokkos::deep_copy(d_mfill_offset,0); + + int atoms_per_team = FixQEqReaxFFKokkos::compute_h_teamsize; + int vector_length = FixQEqReaxFFKokkos::compute_h_vectorsize; + + int num_teams = nn / atoms_per_team + (nn % atoms_per_team ? 1 : 0); + + if (neighflag == FULL) { + Kokkos::parallel_for(Kokkos::TeamPolicy>(num_teams, atoms_per_team, + vector_length), neighbor_functor); + } else { // HALF and HALFTHREAD are the same + Kokkos::parallel_for(Kokkos::TeamPolicy>(num_teams, atoms_per_team, + vector_length), neighbor_functor); + } } } @@ -302,30 +311,9 @@ void FixQEqReaxFFKokkos::pre_force(int /*vflag*/) /* ---------------------------------------------------------------------- */ -template -// NOLINTNEXTLINE -KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::num_neigh_item(int ii, bigint &totneigh) const -{ - const int i = d_ilist[ii]; - totneigh += d_numneigh[i]; -} - -/* ---------------------------------------------------------------------- */ - template void FixQEqReaxFFKokkos::allocate_matrix() { - // determine the total space for the H matrix - - m_cap_big = 0; - - // limit scope of functor to allow deallocation of views - { - FixQEqReaxFFKokkosNumNeighFunctor neigh_functor(this); - Kokkos::parallel_reduce(nn,neigh_functor,m_cap_big); - } - // deallocate first to reduce memory overhead d_firstnbr = typename AT::t_bigint_1d(); @@ -374,8 +362,8 @@ void FixQEqReaxFFKokkos::allocate_array() if (efield) get_chi_field(); Kokkos::parallel_for(Kokkos::RangePolicy(0,nn),*this); - } + /* ---------------------------------------------------------------------- */ template @@ -405,305 +393,6 @@ void FixQEqReaxFFKokkos::operator()(TagQEqZero, const int &ii) const /* ---------------------------------------------------------------------- */ -template -template -// NOLINTNEXTLINE -KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::compute_h_item(int ii, bigint &m_fill, const bool &final) const -{ - const int i = d_ilist[ii]; - int j,jj,jtype; - - if (mask[i] & groupbit) { - - const KK_FLOAT xtmp = x(i,0); - const KK_FLOAT ytmp = x(i,1); - const KK_FLOAT ztmp = x(i,2); - const int itype = type(i); - const tagint itag = tag(i); - const int jnum = d_numneigh[i]; - if (final) - d_firstnbr[i] = m_fill; - - for (jj = 0; jj < jnum; jj++) { - j = d_neighbors(i,jj); - j &= NEIGHMASK; - jtype = type(j); - - const KK_FLOAT delx = x(j,0) - xtmp; - const KK_FLOAT dely = x(j,1) - ytmp; - const KK_FLOAT delz = x(j,2) - ztmp; - - if (NEIGHFLAG != FULL) { - // skip half of the interactions - const tagint jtag = tag(j); - if (j >= nlocal) { - if (itag > jtag) { - if ((itag+jtag) % 2 == 0) continue; - } else if (itag < jtag) { - if ((itag+jtag) % 2 == 1) continue; - } else { - if (x(j,2) < ztmp) continue; - if (x(j,2) == ztmp && x(j,1) < ytmp) continue; - if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue; - } - } - } - - const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz; - if (rsq > cutsq) continue; - - if (final) { - const KK_FLOAT r = sqrt(rsq); - d_jlist(m_fill) = j; - const KK_FLOAT shldij = d_shield(itype,jtype); - d_val(m_fill) = calculate_H_k(r,shldij); - } - m_fill++; - } - if (final) - d_numnbrs[i] = int(m_fill - d_firstnbr[i]); - } -} - -/* ---------------------------------------------------------------------- */ - -// Calculate Qeq matrix H where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficients on atom-i with atom-j -// d_val - contains the non-zero entries of sparse matrix H -// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i) -// d_firstnbr- d_firstnbr[i] contains the beginning index from where the H matrix entries corresponding to row-i is stored in d_val -// d_jlist - contains the column index corresponding to each entry in d_val - -template -template -// NOLINTNEXTLINE -KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::compute_h_team( - const typename Kokkos::TeamPolicy::member_type &team, - int atoms_per_team, int vector_length) const { - - // scratch space setup - Kokkos::View, - Kokkos::MemoryTraits> - s_ilist(team.team_shmem(), atoms_per_team); - Kokkos::View, - Kokkos::MemoryTraits> - s_numnbrs(team.team_shmem(), atoms_per_team); - Kokkos::View, - Kokkos::MemoryTraits> - s_firstnbr(team.team_shmem(), atoms_per_team); - - Kokkos::View, - Kokkos::MemoryTraits> - s_jtype(team.team_shmem(), atoms_per_team, vector_length); - Kokkos::View, - Kokkos::MemoryTraits> - s_jlist(team.team_shmem(), atoms_per_team, vector_length); - Kokkos::View, - Kokkos::MemoryTraits> - s_r(team.team_shmem(), atoms_per_team, vector_length); - - // team of threads work on atoms with index in [firstatom, lastatom) - int firstatom = team.league_rank() * atoms_per_team; - int lastatom = - (firstatom + atoms_per_team < nn) ? (firstatom + atoms_per_team) : nn; - - // kokkos-thread-0 is used to load info from global memory into scratch space - if (team.team_rank() == 0) { - - // copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team] - // copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team] - // calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the - // upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom]) - - Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team), - [&](const int &idx, int &totalnbrs, bool final) { - int ii = firstatom + idx; - - if (ii < nn) { - const int i = d_ilist[ii]; - int jnum = d_numneigh[i]; - - if (final) { - s_ilist[idx] = i; - s_numnbrs[idx] = jnum; - s_firstnbr[idx] = totalnbrs; - } - totalnbrs += jnum; - } else { - s_numnbrs[idx] = 0; - } - }); - } - - // barrier ensures that the data moved to scratch space is visible to all the - // threads of the corresponding team - team.team_barrier(); - - // calculate the global memory offset from where the H matrix values to be - // calculated by the current team will be stored in d_val - bigint team_firstnbr_idx = 0; - Kokkos::single(Kokkos::PerTeam(team), - [&](bigint &val) { - int totalnbrs = s_firstnbr[lastatom - firstatom - 1] + - s_numnbrs[lastatom - firstatom - 1]; - val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs); - }, - team_firstnbr_idx); - - // map the H matrix computation of each atom to kokkos-thread (one atom per - // kokkos-thread) neighbor computation for each atom is assigned to vector - // lanes of the corresponding thread - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int &idx) { - int ii = firstatom + idx; - - if (ii < nn) { - const int i = s_ilist[idx]; - - if (mask[i] & groupbit) { - const KK_FLOAT xtmp = x(i, 0); - const KK_FLOAT ytmp = x(i, 1); - const KK_FLOAT ztmp = x(i, 2); - const int itype = type(i); - tagint itag = tag(i); // removed "const" to work around GCC 7 bug - int jnum = s_numnbrs[idx]; // removed "const" to work around GCC 7 bug - - // calculate the write-offset for atom-i's first neighbor - bigint atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx]; - Kokkos::single(Kokkos::PerThread(team), - [&]() { d_firstnbr[i] = atomi_firstnbr_idx; }); - - // current # of neighbor atoms with non-zero electrostatic - // interaction coefficients with atom-i which represents the # of - // non-zero elements in row-i of H matrix - int atomi_nbrs_inH = 0; - - // calculate H matrix values corresponding to atom-i where neighbors - // are processed in batches and the batch size is vector_length - for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) { - - bigint atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH; - - // count the # of neighbor atoms with non-zero electrostatic - // interaction coefficients with atom-i in the current batch - int atomi_nbrs_curbatch = 0; - - // compute rsq, jtype, j and store in scratch space which is - // reused later - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, vector_length), - [&](const int &idx, int &m_fill) { - const int jj = jj_start + idx; - - // initialize: -1 represents no interaction with atom-j - // where j = d_neighbors(i,jj) - s_jlist(team.team_rank(), idx) = -1; - - if (jj < jnum) { - int j = d_neighbors(i, jj); - j &= NEIGHMASK; - const int jtype = type(j); - - const KK_FLOAT delx = x(j, 0) - xtmp; - const KK_FLOAT dely = x(j, 1) - ytmp; - const KK_FLOAT delz = x(j, 2) - ztmp; - - // valid nbr interaction - bool valid = true; - if (NEIGHFLAG != FULL) { - // skip half of the interactions - const tagint jtag = tag(j); - if (j >= nlocal) { - if (itag > jtag) { - if ((itag + jtag) % 2 == 0) - valid = false; - } else if (itag < jtag) { - if ((itag + jtag) % 2 == 1) - valid = false; - } else { - if (x(j, 2) < ztmp) - valid = false; - if (x(j, 2) == ztmp && x(j, 1) < ytmp) - valid = false; - if (x(j, 2) == ztmp && x(j, 1) == ytmp && - x(j, 0) < xtmp) - valid = false; - } - } - } - - const KK_FLOAT rsq = - delx * delx + dely * dely + delz * delz; - if (rsq > cutsq) - valid = false; - - if (valid) { - s_jlist(team.team_rank(), idx) = j; - s_jtype(team.team_rank(), idx) = jtype; - s_r(team.team_rank(), idx) = sqrt(rsq); - m_fill++; - } - } - }, - atomi_nbrs_curbatch); - - // write non-zero entries of H to global memory - Kokkos::parallel_scan( - Kokkos::ThreadVectorRange(team, vector_length), - [&](const int &idx, int &m_fill, bool final) { - int j = s_jlist(team.team_rank(), idx); - if (final) { - if (j != -1) { - const int jtype = s_jtype(team.team_rank(), idx); - const KK_FLOAT r = s_r(team.team_rank(), idx); - const KK_FLOAT shldij = d_shield(itype, jtype); - - d_jlist[atomi_nbr_writeIdx + m_fill] = j; - d_val[atomi_nbr_writeIdx + m_fill] = - calculate_H_k(r, shldij); - } - } - - if (j != -1) { - m_fill++; - } - }); - atomi_nbrs_inH += atomi_nbrs_curbatch; - } - - Kokkos::single(Kokkos::PerThread(team), - [&]() { d_numnbrs[i] = atomi_nbrs_inH; }); - } - } - }); -} - -/* ---------------------------------------------------------------------- */ - -template -// NOLINTNEXTLINE -KOKKOS_INLINE_FUNCTION -KK_FLOAT FixQEqReaxFFKokkos::calculate_H_k(const KK_FLOAT &r, const KK_FLOAT &shld) const -{ - KK_FLOAT taper, denom; - - taper = d_tap[7] * r + d_tap[6]; - taper = taper * r + d_tap[5]; - taper = taper * r + d_tap[4]; - taper = taper * r + d_tap[3]; - taper = taper * r + d_tap[2]; - taper = taper * r + d_tap[1]; - taper = taper * r + d_tap[0]; - - denom = r * r * r + shld; - denom = cbrt(denom); - - return taper * static_cast(EV_TO_KCAL_PER_MOL) / denom; -} - -/* ---------------------------------------------------------------------- */ - template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION @@ -854,6 +543,29 @@ void FixQEqReaxFFKokkos::calculate_q() /* ---------------------------------------------------------------------- */ +template +// NOLINTNEXTLINE +KOKKOS_INLINE_FUNCTION +KK_FLOAT FixQEqReaxFFKokkos::calculate_H_k(const KK_FLOAT &r, const KK_FLOAT &shld) const +{ + KK_FLOAT taper, denom; + + taper = d_tap[7] * r + d_tap[6]; + taper = taper * r + d_tap[5]; + taper = taper * r + d_tap[4]; + taper = taper * r + d_tap[3]; + taper = taper * r + d_tap[2]; + taper = taper * r + d_tap[1]; + taper = taper * r + d_tap[0]; + + denom = r * r * r + shld; + denom = cbrt(denom); + + return taper * static_cast(EV_TO_KCAL_PER_MOL) / denom; +} + +/* ---------------------------------------------------------------------- */ + template void FixQEqReaxFFKokkos::sparse_matvec_kokkos(typename AT::t_kkfloat_1d_2 &d_xx_in) { @@ -881,15 +593,28 @@ void FixQEqReaxFFKokkos::sparse_matvec_kokkos(typename AT::t_kkfloat if (need_dup) dup_o.reset_except(d_o); - if (neighflag == HALF) - Kokkos::parallel_for(Kokkos::TeamPolicy>(leaguesize, teamsize, vectorsize), *this); - else if (neighflag == HALFTHREAD) - Kokkos::parallel_for(Kokkos::TeamPolicy>(leaguesize, teamsize, vectorsize), *this); + if (matrix_free) { + if (neighflag == HALF) + Kokkos::parallel_for(Kokkos::RangePolicy>(0,nn),*this); + else if (neighflag == HALFTHREAD) + Kokkos::parallel_for(Kokkos::RangePolicy>(0,nn),*this); + } else { + if (neighflag == HALF) + Kokkos::parallel_for(Kokkos::TeamPolicy>(leaguesize, teamsize, vectorsize), *this); + else if (neighflag == HALFTHREAD) + Kokkos::parallel_for(Kokkos::TeamPolicy>(leaguesize, teamsize, vectorsize), *this); + } if (need_dup) Kokkos::Experimental::contribute(d_o, dup_o); - } else // FULL - Kokkos::parallel_for(Kokkos::TeamPolicy (leaguesize, teamsize, vectorsize), *this); + } else { + // FULL + if (matrix_free) { + Kokkos::parallel_for(Kokkos::RangePolicy(0,nn),*this); + } else { + Kokkos::parallel_for(Kokkos::TeamPolicy (leaguesize, teamsize, vectorsize), *this); + } + } } /* ---------------------------------------------------------------------- */ @@ -972,7 +697,7 @@ void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Half // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full, const membertype_vec &team) const +void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full, const typename Kokkos::TeamPolicy::member_type &team) const { int k = team.league_rank() * team.team_size() + team.team_rank(); if (k < nn) { @@ -999,6 +724,152 @@ void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full, const /* ---------------------------------------------------------------------- */ +// half matrix-free form of the sparse mat-vec +template +template +// NOLINTNEXTLINE +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Half_MatrixFree, const int &ii) const +{ + // The o array is duplicated for OpenMP, atomic for GPU, and neither for Serial + auto v_o = ScatterViewHelper,decltype(dup_o),decltype(ndup_o)>::get(dup_o,ndup_o); + auto a_o = v_o.template access>(); + + const int i = d_ilist[ii]; + int j, jj, jtype; + + if (mask[i] & groupbit) { + const KK_FLOAT xtmp = x(i,0); + const KK_FLOAT ytmp = x(i,1); + const KK_FLOAT ztmp = x(i,2); + const int itype = type(i); + const tagint itag = tag(i); + const int jnum = d_numneigh[i]; + + KK_FLOAT2 doi = KK_FLOAT2(); + const KK_FLOAT2 xx_i = { d_xx(i,0), d_xx(i,1) }; + + for (jj = 0; jj < jnum; jj++) { + j = d_neighbors(i,jj); + j &= NEIGHMASK; + jtype = type(j); + + const KK_FLOAT delx = x(j,0) - xtmp; + const KK_FLOAT dely = x(j,1) - ytmp; + const KK_FLOAT delz = x(j,2) - ztmp; + + // always true + //if (neighflag != FULL) { + // skip half of the interactions + const tagint jtag = tag(j); + bool skip = [&]() -> bool { + if (j >= nlocal) { + if (itag > jtag) { + if ((itag+jtag) % 2 == 0) return true; + } else if (itag < jtag) { + if ((itag+jtag) % 2 == 1) return true; + } else { + if (x(j,2) < ztmp) return true; + if (x(j,2) == ztmp && x(j,1) < ytmp) return true; + if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) return true; + } + } + return false; + }(); + + if (!skip) { + const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz; + if (rsq <= cutsq) { + const KK_FLOAT r = sqrt(rsq); + const KK_FLOAT shldij = d_shield(itype, jtype); + + // we split this out here for mixed precision calculations + double h_k_val = calculate_H_k(r, shldij); + + const KK_FLOAT2 xx_j = { d_xx(j,0), d_xx(j,1) }; + if (!(converged & 1)) { + doi.v[0] += h_k_val * xx_j.v[0]; + a_o(j,0) += h_k_val * xx_i.v[0]; + } + + if (!(converged & 2)) { + doi.v[1] += h_k_val * xx_j.v[1]; + a_o(j,1) += h_k_val * xx_i.v[1]; + } + } + } + } + + if (!(converged & 1)) { + a_o(i,0) += doi.v[0]; + } + if (!(converged & 2)) { + a_o(i,1) += doi.v[1]; + } + } +} + +/* ---------------------------------------------------------------------- */ + +// matrix-free form of the sparse mat-vec +template +// NOLINTNEXTLINE +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full_MatrixFree, const int &ii) const +{ + const int i = d_ilist[ii]; + int j, jj, jtype; + + if (mask[i] & groupbit) { + const KK_FLOAT xtmp = x(i,0); + const KK_FLOAT ytmp = x(i,1); + const KK_FLOAT ztmp = x(i,2); + const int itype = type(i); + const int jnum = d_numneigh[i]; + + KK_FLOAT2 doi = KK_FLOAT2(); + + for (jj = 0; jj < jnum; jj++) { + j = d_neighbors(i,jj); + j &= NEIGHMASK; + jtype = type(j); + + const KK_FLOAT delx = x(j,0) - xtmp; + const KK_FLOAT dely = x(j,1) - ytmp; + const KK_FLOAT delz = x(j,2) - ztmp; + + const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz; + if (rsq <= cutsq) { + const KK_FLOAT r = sqrt(rsq); + const KK_FLOAT shldij = d_shield(itype, jtype); + + // we split this out here for mixed precision calculations + double h_k_val = calculate_H_k(r, shldij); + + const KK_FLOAT2 xx = { d_xx(j,0), d_xx(j,1) }; + if (!(converged & 1)) + doi.v[0] += h_k_val * xx.v[0]; + + if (!(converged & 2)) + doi.v[1] += h_k_val * xx.v[1]; + } + + } + + KK_FLOAT2 o = { d_o(i,0), d_o(i,1) }; + if (!(converged & 1)) + o.v[0] += doi.v[0]; + + if (!(converged & 2)) + o.v[1] += doi.v[1]; + + d_o(i,0) = o.v[0]; + d_o(i,1) = o.v[1]; + } +} + +/* ---------------------------------------------------------------------- */ + template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION @@ -1264,6 +1135,44 @@ void FixQEqReaxFFKokkos::unpack_forward_comm(int n, int first, doubl } } +/* ---------------------------------------------------------------------- */ +template +int FixQEqReaxFFKokkos::pack_reverse_comm_kokkos(int n, int first_in, DAT::tdual_double_1d &buf) +{ + first = first_in; + d_buf = buf.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); + return n*2; +} + +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkos::operator()(TagQEqPackReverseComm, const int &i) const +{ + if (!(converged & 1)) d_buf[2*i] = d_o(i+first,0); + if (!(converged & 2)) d_buf[2*i+1] = d_o(i+first,1); +} + +/* ---------------------------------------------------------------------- */ + +template +void FixQEqReaxFFKokkos::unpack_reverse_comm_kokkos(int n, DAT::tdual_int_1d k_sendlist, DAT::tdual_double_1d& buf) +{ + d_buf = buf.view(); + d_sendlist = k_sendlist.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); +} + +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkos::operator()(TagQEqUnpackReverseComm, const int &i) const +{ + int j = d_sendlist(i); + + if (!(converged & 1)) d_o(j,0) += d_buf[2*i]; + if (!(converged & 2)) d_o(j,1) += d_buf[2*i+1]; +} + /* ---------------------------------------------------------------------- */ template @@ -1528,6 +1437,314 @@ void FixQEqReaxFFKokkos::get_chi_field() /* ---------------------------------------------------------------------- */ +template +void FixQEqReaxFFKokkosNeighborFunctor::update_after_allocation(const FixQEqReaxFFKokkos &qeqreax) +{ + d_firstnbr = qeqreax.d_firstnbr; + d_numnbrs = qeqreax.d_numnbrs; + d_jlist = qeqreax.d_jlist; + d_val = qeqreax.d_val; +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkosNeighborFunctor::operator()(TagQEqNeighborNumNeigh, const int& ii, bigint& totneigh) const +{ + totneigh += static_cast(d_numneigh[ii]); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +KK_FLOAT FixQEqReaxFFKokkosNeighborFunctor::calculate_H_k(const KK_FLOAT &r, const KK_FLOAT &shld) const +{ + KK_FLOAT taper, denom; + + taper = d_tap[7] * r + d_tap[6]; + taper = taper * r + d_tap[5]; + taper = taper * r + d_tap[4]; + taper = taper * r + d_tap[3]; + taper = taper * r + d_tap[2]; + taper = taper * r + d_tap[1]; + taper = taper * r + d_tap[0]; + + denom = r * r * r + shld; + denom = cbrt(denom); + + return taper * static_cast(EV_TO_KCAL_PER_MOL) / denom; +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkosNeighborFunctor::operator()(TagQEqNeighborComputeH, const int& ii, bigint &m_fill, const bool &final) const +{ + const int i = d_ilist[ii]; + int j,jj,jtype; + + if (mask[i] & groupbit) { + + const KK_FLOAT xtmp = x(i,0); + const KK_FLOAT ytmp = x(i,1); + const KK_FLOAT ztmp = x(i,2); + const int itype = type(i); + const tagint itag = tag(i); + const int jnum = d_numneigh[i]; + if (final) + d_firstnbr[i] = m_fill; + + for (jj = 0; jj < jnum; jj++) { + j = d_neighbors(i,jj); + j &= NEIGHMASK; + jtype = type(j); + + const KK_FLOAT delx = x(j,0) - xtmp; + const KK_FLOAT dely = x(j,1) - ytmp; + const KK_FLOAT delz = x(j,2) - ztmp; + + if (NEIGHFLAG != FULL) { + // skip half of the interactions + const tagint jtag = tag(j); + if (j >= nlocal) { + if (itag > jtag) { + if ((itag+jtag) % 2 == 0) continue; + } else if (itag < jtag) { + if ((itag+jtag) % 2 == 1) continue; + } else { + if (x(j,2) < ztmp) continue; + if (x(j,2) == ztmp && x(j,1) < ytmp) continue; + if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue; + } + } + } + + const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz; + if (rsq > cutsq) continue; + + if (final) { + const KK_FLOAT r = sqrt(rsq); + d_jlist(m_fill) = j; + const KK_FLOAT shldij = d_shield(itype,jtype); + d_val(m_fill) = calculate_H_k(r,shldij); + } + m_fill++; + } + if (final) + d_numnbrs[i] = int(m_fill - d_firstnbr[i]); + } +} + +// Calculate Qeq matrix H where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficients on atom-i with atom-j +// d_val - contains the non-zero entries of sparse matrix H +// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i) +// d_firstnbr- d_firstnbr[i] contains the beginning index from where the H matrix entries corresponding to row-i is stored in d_val +// d_jlist - contains the column index corresponding to each entry in d_val +template +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkosNeighborFunctor::operator()(TagQEqNeighborComputeH, const typename Kokkos::TeamPolicy::member_type &team) const +{ + + // scratch space setup + Kokkos::View, + Kokkos::MemoryTraits> + s_ilist(team.team_shmem(), atoms_per_team); + Kokkos::View, + Kokkos::MemoryTraits> + s_numnbrs(team.team_shmem(), atoms_per_team); + Kokkos::View, + Kokkos::MemoryTraits> + s_firstnbr(team.team_shmem(), atoms_per_team); + + Kokkos::View, + Kokkos::MemoryTraits> + s_jtype(team.team_shmem(), atoms_per_team, vector_length); + Kokkos::View, + Kokkos::MemoryTraits> + s_jlist(team.team_shmem(), atoms_per_team, vector_length); + Kokkos::View, + Kokkos::MemoryTraits> + s_r(team.team_shmem(), atoms_per_team, vector_length); + + // team of threads work on atoms with index in [firstatom, lastatom) + int firstatom = team.league_rank() * atoms_per_team; + int lastatom = + (firstatom + atoms_per_team < nn) ? (firstatom + atoms_per_team) : nn; + + // kokkos-thread-0 is used to load info from global memory into scratch space + if (team.team_rank() == 0) { + + // copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team] + // copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team] + // calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the + // upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom]) + + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team), + [&](const int &idx, int &totalnbrs, bool final) { + int ii = firstatom + idx; + + if (ii < nn) { + const int i = d_ilist[ii]; + int jnum = d_numneigh[i]; + + if (final) { + s_ilist[idx] = i; + s_numnbrs[idx] = jnum; + s_firstnbr[idx] = totalnbrs; + } + totalnbrs += jnum; + } else { + s_numnbrs[idx] = 0; + } + }); + } + + // barrier ensures that the data moved to scratch space is visible to all the + // threads of the corresponding team + team.team_barrier(); + + // calculate the global memory offset from where the H matrix values to be + // calculated by the current team will be stored in d_val + bigint team_firstnbr_idx = 0; + Kokkos::single(Kokkos::PerTeam(team), + [&](bigint &val) { + int totalnbrs = s_firstnbr[lastatom - firstatom - 1] + + s_numnbrs[lastatom - firstatom - 1]; + val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs); + }, + team_firstnbr_idx); + + // map the H matrix computation of each atom to kokkos-thread (one atom per + // kokkos-thread) neighbor computation for each atom is assigned to vector + // lanes of the corresponding thread + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, atoms_per_team), + [&](const int &idx) { + int ii = firstatom + idx; + + if (ii < nn) { + const int i = s_ilist[idx]; + + if (mask[i] & groupbit) { + const KK_FLOAT xtmp = x(i, 0); + const KK_FLOAT ytmp = x(i, 1); + const KK_FLOAT ztmp = x(i, 2); + const int itype = type(i); + tagint itag = tag(i); // removed "const" to work around GCC 7 bug + int jnum = s_numnbrs[idx]; // removed "const" to work around GCC 7 bug + + // calculate the write-offset for atom-i's first neighbor + bigint atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx]; + Kokkos::single(Kokkos::PerThread(team), + [&]() { d_firstnbr[i] = atomi_firstnbr_idx; }); + + // current # of neighbor atoms with non-zero electrostatic + // interaction coefficients with atom-i which represents the # of + // non-zero elements in row-i of H matrix + int atomi_nbrs_inH = 0; + + // calculate H matrix values corresponding to atom-i where neighbors + // are processed in batches and the batch size is vector_length + for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) { + + bigint atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH; + + // count the # of neighbor atoms with non-zero electrostatic + // interaction coefficients with atom-i in the current batch + int atomi_nbrs_curbatch = 0; + + // compute rsq, jtype, j and store in scratch space which is + // reused later + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_length), + [&](const int &idx, int &m_fill) { + const int jj = jj_start + idx; + + // initialize: -1 represents no interaction with atom-j + // where j = d_neighbors(i,jj) + s_jlist(team.team_rank(), idx) = -1; + + if (jj < jnum) { + int j = d_neighbors(i, jj); + j &= NEIGHMASK; + const int jtype = type(j); + + const KK_FLOAT delx = x(j, 0) - xtmp; + const KK_FLOAT dely = x(j, 1) - ytmp; + const KK_FLOAT delz = x(j, 2) - ztmp; + + // valid nbr interaction + bool valid = true; + if (NEIGHFLAG != FULL) { + // skip half of the interactions + const tagint jtag = tag(j); + if (j >= nlocal) { + if (itag > jtag) { + if ((itag + jtag) % 2 == 0) + valid = false; + } else if (itag < jtag) { + if ((itag + jtag) % 2 == 1) + valid = false; + } else { + if (x(j, 2) < ztmp) + valid = false; + if (x(j, 2) == ztmp && x(j, 1) < ytmp) + valid = false; + if (x(j, 2) == ztmp && x(j, 1) == ytmp && + x(j, 0) < xtmp) + valid = false; + } + } + } + + const KK_FLOAT rsq = + delx * delx + dely * dely + delz * delz; + if (rsq > cutsq) + valid = false; + + if (valid) { + s_jlist(team.team_rank(), idx) = j; + s_jtype(team.team_rank(), idx) = jtype; + s_r(team.team_rank(), idx) = sqrt(rsq); + m_fill++; + } + } + }, + atomi_nbrs_curbatch); + + // write non-zero entries of H to global memory + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, vector_length), + [&](const int &idx, int &m_fill, bool final) { + int j = s_jlist(team.team_rank(), idx); + if (final) { + if (j != -1) { + const int jtype = s_jtype(team.team_rank(), idx); + const KK_FLOAT r = s_r(team.team_rank(), idx); + const KK_FLOAT shldij = d_shield(itype, jtype); + + d_jlist[atomi_nbr_writeIdx + m_fill] = j; + d_val[atomi_nbr_writeIdx + m_fill] = + calculate_H_k(r, shldij); + } + } + + if (j != -1) { + m_fill++; + } + }); + atomi_nbrs_inH += atomi_nbrs_curbatch; + } + + Kokkos::single(Kokkos::PerThread(team), + [&]() { d_numnbrs[i] = atomi_nbrs_inH; }); + } + } + }); +} + namespace LAMMPS_NS { template class FixQEqReaxFFKokkos; #ifdef LMP_KOKKOS_GPU diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.h b/src/KOKKOS/fix_qeq_reaxff_kokkos.h index 822cfa1cedb..34c00036e6d 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.h +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.h @@ -34,6 +34,10 @@ FixStyle(qeq/reax/kk/host,FixQEqReaxFFKokkos); namespace LAMMPS_NS { +// Forward declare a neighbor helper +template +struct FixQEqReaxFFKokkosNeighborFunctor; + struct TagQEqZero{}; struct TagQEqInitMatvec{}; struct TagQEqSparseMatvec1{}; @@ -41,8 +45,13 @@ struct TagQEqZeroQGhosts{}; template struct TagQEqSparseMatvec2_Half{}; - struct TagQEqSparseMatvec2_Full{}; + +// Matrix-free version of the sparse mat-vec operations +template +struct TagQEqSparseMatvec2_Half_MatrixFree{}; +struct TagQEqSparseMatvec2_Full_MatrixFree{}; + struct TagQEqNorm1{}; struct TagQEqDot1{}; struct TagQEqDot2{}; @@ -52,6 +61,8 @@ struct TagQEqSum2{}; struct TagQEqCalculateQ{}; struct TagQEqPackForwardComm{}; struct TagQEqUnpackForwardComm{}; +struct TagQEqPackReverseComm{}; +struct TagQEqUnpackReverseComm{}; struct TagQEqPackExchange{}; struct TagQEqUnpackExchange{}; @@ -60,6 +71,7 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { public: typedef DeviceType device_type; typedef ArrayTypes AT; + static constexpr bool is_host = (ExecutionSpaceFromDevice::space == HostKK); typedef KK_double2 value_type; FixQEqReaxFFKokkos(class LAMMPS *, int, char **); ~FixQEqReaxFFKokkos() override; @@ -81,33 +93,31 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { KOKKOS_INLINE_FUNCTION void operator()(TagQEqInitMatvec, const int&) const; - template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void compute_h_item(int, bigint &, const bool &) const; + void operator()(TagQEqSparseMatvec1, const int&) const; - template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void compute_h_team(const typename Kokkos::TeamPolicy::member_type &team, int, int) const; + void operator()(TagQEqZeroQGhosts, const int&) const; + template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(TagQEqSparseMatvec1, const int&) const; + void operator()(TagQEqSparseMatvec2_Half, const typename Kokkos::TeamPolicy>::member_type &team) const; // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(TagQEqZeroQGhosts, const int&) const; + void operator()(TagQEqSparseMatvec2_Full, const typename Kokkos::TeamPolicy::member_type &team) const; template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(TagQEqSparseMatvec2_Half, const typename Kokkos::TeamPolicy>::member_type &team) const; + void operator()(TagQEqSparseMatvec2_Half_MatrixFree, const int&) const; - typedef typename Kokkos::TeamPolicy::member_type membertype_vec; // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(TagQEqSparseMatvec2_Full, const membertype_vec &team) const; + void operator()(TagQEqSparseMatvec2_Full_MatrixFree, const int&) const; // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION @@ -149,6 +159,14 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { KOKKOS_INLINE_FUNCTION void operator()(TagQEqUnpackForwardComm, const int&) const; +// NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + void operator()(TagQEqPackReverseComm, const int&) const; + +// NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + void operator()(TagQEqUnpackReverseComm, const int&) const; + // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION void operator()(TagQEqPackExchange, const int&) const; @@ -182,7 +200,11 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { void unpack_forward_comm_kokkos(int, int, DAT::tdual_double_1d&) override; int pack_forward_comm(int, int *, double *, int, int *) override; void unpack_forward_comm(int, int, double *) override; + + int pack_reverse_comm_kokkos(int, int, DAT::tdual_double_1d&) override; int pack_reverse_comm(int, int, double *) override; + + void unpack_reverse_comm_kokkos(int, DAT::tdual_int_1d, DAT::tdual_double_1d&) override; void unpack_reverse_comm(int, int *, double *) override; double memory_usage() override; void sparse_matvec_kokkos(typename AT::t_kkfloat_1d_2 &); @@ -312,53 +334,82 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { int pack_exchange(int, double *) override; int unpack_exchange(int, double *) override; void get_chi_field() override; + + // Let the neighbor count functor access some of my members + friend class FixQEqReaxFFKokkosNeighborFunctor; }; +// Custom functor to count various things about the number of neighbors +struct TagQEqNeighborNumNeigh{}; + +template +struct TagQEqNeighborComputeH{}; + template -struct FixQEqReaxFFKokkosNumNeighFunctor { - typedef DeviceType device_type; - typedef ArrayTypes AT; - typedef bigint value_type; - FixQEqReaxFFKokkos c; - FixQEqReaxFFKokkosNumNeighFunctor(FixQEqReaxFFKokkos* c_ptr):c(*c_ptr) { - c.cleanup_copy(); - }; -// NOLINTNEXTLINE - KOKKOS_INLINE_FUNCTION - void operator()(const int ii, bigint &totneigh) const { - c.num_neigh_item(ii, totneigh); - } -}; +struct FixQEqReaxFFKokkosNeighborFunctor { + using device_type = DeviceType; + using value_type = bigint; + using scratch_space = Kokkos::ScratchMemorySpace; + using AT = ArrayTypes; -template -struct FixQEqReaxFFKokkosComputeHFunctor { - int atoms_per_team, vector_length; - typedef bigint value_type; - typedef Kokkos::ScratchMemorySpace scratch_space; - FixQEqReaxFFKokkos c; + typename AT::t_kkfloat_1d_3_lr_const x; + typename AT::t_int_1d_const type, mask; + typename AT::t_tagint_1d_const tag; - FixQEqReaxFFKokkosComputeHFunctor(FixQEqReaxFFKokkos* c_ptr):c(*c_ptr) { - c.cleanup_copy(); - }; + typename AT::t_neighbors_2d_const d_neighbors; + typename AT::t_int_1d_randomread d_ilist, d_numneigh; - FixQEqReaxFFKokkosComputeHFunctor(FixQEqReaxFFKokkos *c_ptr, - int _atoms_per_team, int _vector_length) - : atoms_per_team(_atoms_per_team), vector_length(_vector_length), c(*c_ptr) { - c.cleanup_copy(); - }; + typename AT::t_bigint_1d d_firstnbr; + typename AT::t_int_1d d_numnbrs; + typename AT::t_int_1d d_jlist; + typename AT::t_kkfloat_1d d_val; + typename AT::t_kkfloat_2d_const d_shield; + typename AT::t_kkfloat_1d_const d_tap; -// NOLINTNEXTLINE + typename AT::t_bigint_scalar d_mfill_offset; + + int nlocal, nn, nmax; + int groupbit, neighflag; + KK_FLOAT cutsq; + + static constexpr int atoms_per_team = FixQEqReaxFFKokkos::compute_h_teamsize; + static constexpr int vector_length = FixQEqReaxFFKokkos::compute_h_vectorsize; + + // no default constructor + FixQEqReaxFFKokkosNeighborFunctor() = delete; + + FixQEqReaxFFKokkosNeighborFunctor(const FixQEqReaxFFKokkos &qeqreax) + : x(qeqreax.x), type(qeqreax.type), mask(qeqreax.mask), tag(qeqreax.tag), + d_neighbors(qeqreax.d_neighbors), d_ilist(qeqreax.d_ilist), d_numneigh(qeqreax.d_numneigh), + d_firstnbr(qeqreax.d_firstnbr), d_numnbrs(qeqreax.d_numnbrs), d_jlist(qeqreax.d_jlist), + d_val(qeqreax.d_val), d_shield(qeqreax.d_shield), d_tap(qeqreax.d_tap), d_mfill_offset(qeqreax.d_mfill_offset), + nlocal(qeqreax.nlocal), nn(qeqreax.nn), nmax(qeqreax.nmax), groupbit(qeqreax.groupbit), + neighflag(qeqreax.neighflag == FULL ? FULL : HALF), cutsq(qeqreax.cutsq) + {} + + void update_after_allocation(const FixQEqReaxFFKokkos &qeqreax); + + // Function to calculate H based on r and shld + // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(const int ii, bigint &m_fill, const bool &final) const { - c.template compute_h_item(ii,m_fill,final); - } + KK_FLOAT calculate_H_k(const KK_FLOAT &r, const KK_FLOAT &shld) const; -// NOLINTNEXTLINE + // Count the number of neighbors w/in the cutoff radius + // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type &team) const { - c.template compute_h_team(team, atoms_per_team, vector_length); - } + void operator()(TagQEqNeighborNumNeigh, const int& ii, bigint& neigh_count) const; + + // Calculate H deterministically w/a scan, CPU version + template + // NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + void operator()(TagQEqNeighborComputeH, const int& ii, bigint &m_fill, const bool &final) const; + + // Calculate H, GPU Team parallelism version + template + // NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + void operator()(TagQEqNeighborComputeH, const typename Kokkos::TeamPolicy::member_type &team) const; [[nodiscard]] size_t team_shmem_size(int /*team_size*/) const { size_t shmem_size = @@ -377,6 +428,7 @@ struct FixQEqReaxFFKokkosComputeHFunctor { vector_length); // s_r return shmem_size; } + }; } diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 8827a73f6e8..87db605acc9 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -68,6 +68,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) forward_pair_comm_changed = 0; reverse_pair_comm_changed = 0; forward_fix_comm_changed = 0; + reverse_fix_comm_changed = 0; reverse_comm_changed = 0; sort_changed = atom_map_changed = 0; @@ -287,7 +288,8 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) newtonflag = 0; exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 0; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 0; sort_legacy = 0; atom_map_legacy = 0; @@ -303,7 +305,8 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) newtonflag = 1; exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 1; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; sort_legacy = 1; atom_map_legacy = 1; @@ -488,17 +491,20 @@ void KokkosLMP::accelerator(int narg, char **arg) if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 1; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else if (strcmp(arg[iarg+1],"host") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1; } else if (strcmp(arg[iarg+1],"device") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 0; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); @@ -551,6 +557,14 @@ void KokkosLMP::accelerator(int narg, char **arg) else error->all(FLERR,"Illegal package kokkos command"); forward_fix_comm_changed = 0; iarg += 2; + } else if (strcmp(arg[iarg],"comm/fix/reverse") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + if (strcmp(arg[iarg+1],"no") == 0) reverse_fix_comm_legacy = 1; + else if (strcmp(arg[iarg+1],"host") == 0) reverse_fix_comm_legacy = 1; + else if (strcmp(arg[iarg+1],"device") == 0) reverse_fix_comm_legacy = 0; + else error->all(FLERR,"Illegal package kokkos command"); + reverse_fix_comm_changed = 0; + iarg += 2; } else if (strcmp(arg[iarg],"comm/reverse") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); else if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_legacy = 1; @@ -653,6 +667,10 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_fix_comm_legacy = 1; forward_fix_comm_changed = 1; } + if (reverse_fix_comm_legacy == 0) { + reverse_fix_comm_legacy = 1; + reverse_fix_comm_changed = 1; + } if (reverse_comm_legacy == 0 && reverse_comm_on_host == 0) { reverse_comm_legacy = 1; reverse_comm_changed = 1; @@ -693,6 +711,10 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_fix_comm_legacy = 0; forward_fix_comm_changed = 0; } + if (reverse_fix_comm_changed) { + reverse_fix_comm_legacy = 0; + reverse_fix_comm_changed = 0; + } if (reverse_comm_changed) { reverse_comm_legacy = 0; reverse_comm_changed = 0; diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index 0dee4dd650f..4e3af0bdbcd 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -32,6 +32,7 @@ class KokkosLMP : protected Pointers { int forward_pair_comm_legacy; int reverse_pair_comm_legacy; int forward_fix_comm_legacy; + int reverse_fix_comm_legacy; int reverse_comm_legacy; int sort_legacy; int atom_map_legacy; @@ -43,6 +44,7 @@ class KokkosLMP : protected Pointers { int forward_pair_comm_changed; int reverse_pair_comm_changed; int forward_fix_comm_changed; + int reverse_fix_comm_changed; int reverse_comm_changed; int sort_changed; int atom_map_changed; diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h index b931f2d2623..6b07a7fd2bc 100644 --- a/src/KOKKOS/kokkos_base.h +++ b/src/KOKKOS/kokkos_base.h @@ -41,6 +41,12 @@ class KokkosBase { int, int *) {return 0;}; virtual void unpack_forward_comm_fix_kokkos(int, int, DAT::tdual_double_1d &) {} + + virtual int pack_reverse_comm_fix_kokkos(int, int, DAT::tdual_double_1d &) {return 0;}; + virtual void unpack_reverse_comm_fix_kokkos(int, DAT::tdual_int_1d, + int, DAT::tdual_double_1d &) {} + + virtual int pack_exchange_kokkos(const int & /*nsend*/, DAT::tdual_double_2d_lr & /*k_buf*/, DAT::tdual_int_1d /*k_sendlist*/, DAT::tdual_int_1d /*k_copylist*/, diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 239dab92868..e778cb15d5e 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -495,6 +495,12 @@ struct alignas(2*sizeof(double)) s_KK_double2 { s_KK_double2() { v[0] = v[1] = 0.0; } +// NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + s_KK_double2(const double &x, const double &y) { + v[0] = x; + v[1] = y; + } // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION @@ -514,6 +520,13 @@ struct alignas(2*sizeof(KK_FLOAT)) s_KK_FLOAT2 { v[0] = v[1] = 0; } +// NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + s_KK_FLOAT2(const KK_FLOAT &x, const KK_FLOAT &y) { + v[0] = x; + v[1] = y; + } + // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION void operator+=(const s_KK_FLOAT2 &rhs) { diff --git a/src/REAXFF/fix_qeq_reaxff.cpp b/src/REAXFF/fix_qeq_reaxff.cpp index 4ce4d0fa6ea..6dd861837ed 100644 --- a/src/REAXFF/fix_qeq_reaxff.cpp +++ b/src/REAXFF/fix_qeq_reaxff.cpp @@ -89,10 +89,14 @@ FixQEqReaxFF::FixQEqReaxFF(LAMMPS *lmp, int narg, char **arg) : dual_enabled = 0; + // matrix-free support only available for Kokkos backend + matrix_free = 0; // default to false + int iarg = 8; while (iarg < narg) { if (strcmp(arg[iarg],"dual") == 0) dual_enabled = 1; else if (strcmp(arg[iarg],"nowarn") == 0) maxwarn = 0; + else if (strcmp(arg[iarg],"matfree") == 0) matrix_free = 1; else if (strcmp(arg[iarg],"maxiter") == 0) { if (iarg+1 > narg-1) error->all(FLERR, iarg, "Illegal fix {} command", style); diff --git a/src/REAXFF/fix_qeq_reaxff.h b/src/REAXFF/fix_qeq_reaxff.h index 591dee6fc02..89ff441c821 100644 --- a/src/REAXFF/fix_qeq_reaxff.h +++ b/src/REAXFF/fix_qeq_reaxff.h @@ -63,6 +63,7 @@ class FixQEqReaxFF : public Fix { int n_cap, nmax, m_cap; int pack_flag; int nlevels_respa; + int matrix_free; // kokkos backend; whether or not to use the matrix-free form class NeighList *list; class PairReaxFF *reaxff; class FixEfield *efield; diff --git a/src/fix.cpp b/src/fix.cpp index 88d467e8d8e..dd77430a769 100644 --- a/src/fix.cpp +++ b/src/fix.cpp @@ -111,7 +111,7 @@ Fix::Fix(LAMMPS *lmp, int /*narg*/, char **arg) : datamask_modify = ALL_MASK; kokkosable = copymode = 0; - forward_comm_device = exchange_comm_device = sort_device = 0; + forward_comm_device = reverse_comm_device = exchange_comm_device = sort_device = 0; fuse_integrate_flag = 0; } diff --git a/src/fix.h b/src/fix.h index da65e469397..685399109a3 100644 --- a/src/fix.h +++ b/src/fix.h @@ -132,6 +132,7 @@ class Fix : protected Pointers { int kokkosable; // 1 if Kokkos fix int forward_comm_device; // 1 if forward comm on Device + int reverse_comm_device; // 1 if reverse comm on Device int exchange_comm_device; // 1 if exchange comm on Device int fuse_integrate_flag; // 1 if can fuse initial integrate with final integrate int sort_device; // 1 if sort on Device