From 1dd287a5d6857adc044ac535ee092d765fb5f1b3 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 17 Feb 2026 07:05:28 -0800 Subject: [PATCH 01/14] Add a matrix-free method to the Kokkos implementation of fix/qeq --- src/KOKKOS/fix_qeq_reaxff_kokkos.cpp | 899 ++++++++++++++++----------- src/KOKKOS/fix_qeq_reaxff_kokkos.h | 132 ++-- src/KOKKOS/kokkos_type.h | 13 + src/REAXFF/fix_qeq_reaxff.cpp | 4 + src/REAXFF/fix_qeq_reaxff.h | 1 + 5 files changed, 642 insertions(+), 407 deletions(-) diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index e5dc705aa5b..2ebd8663347 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -75,6 +75,9 @@ FixQEqReaxFFKokkos(LAMMPS *lmp, int narg, char **arg) : d_mfill_offset = typename AT::t_bigint_scalar("qeq/kk:mfill_offset"); converged = 0; + + if (comm->me == 0 && matrix_free) + utils::logmesg(lmp, "Using matrix-free form for fix/qeq/kk\n"); } /* ---------------------------------------------------------------------- */ @@ -213,43 +216,47 @@ void FixQEqReaxFFKokkos::pre_force(int /*vflag*/) allocate_array(); - // get max number of neighbor + if (!matrix_free) { + // create the functor that handles the H matrix + FixQEqReaxFFKokkosNeighborFunctor neighbor_functor(*this); - if (!allocated_flag || last_allocate < neighbor->lastcall - || nlocal_last_allocate != nlocal) { - allocate_matrix(); - last_allocate = update->ntimestep; - nlocal_last_allocate = nlocal; - } + // get max number of neighbor - // compute_H + if (!allocated_flag || last_allocate < neighbor->lastcall + || nlocal_last_allocate != nlocal) { + // determine the total space for the H matrix + m_cap_big = 0; + Kokkos::parallel_reduce(Kokkos::RangePolicy(0,nn), neighbor_functor, m_cap_big); - if (execution_space == HostKK) { // CPU - if (neighflag == FULL) { - FixQEqReaxFFKokkosComputeHFunctor computeH_functor(this); - Kokkos::parallel_scan(nn,computeH_functor); - } else { // HALF and HALFTHREAD are the same - FixQEqReaxFFKokkosComputeHFunctor computeH_functor(this); - Kokkos::parallel_scan(nn,computeH_functor); + allocate_matrix(); + neighbor_functor.update_after_allocation(*this); + last_allocate = update->ntimestep; + nlocal_last_allocate = nlocal; } - } else { // GPU, use teams - Kokkos::deep_copy(d_mfill_offset,0); - - int atoms_per_team = FixQEqReaxFFKokkos::compute_h_teamsize; - int vector_length = FixQEqReaxFFKokkos::compute_h_vectorsize; - - int num_teams = nn / atoms_per_team + (nn % atoms_per_team ? 1 : 0); - - Kokkos::TeamPolicy policy(num_teams, atoms_per_team, - vector_length); - if (neighflag == FULL) { - FixQEqReaxFFKokkosComputeHFunctor computeH_functor( - this, atoms_per_team, vector_length); - Kokkos::parallel_for(policy, computeH_functor); - } else { // HALF and HALFTHREAD are the same - FixQEqReaxFFKokkosComputeHFunctor computeH_functor( - this, atoms_per_team, vector_length); - Kokkos::parallel_for(policy, computeH_functor); + + // compute_H + + if constexpr (is_host) { // CPU + if (neighflag == FULL) { + Kokkos::parallel_scan(Kokkos::RangePolicy>(0,nn), neighbor_functor); + } else { + Kokkos::parallel_scan(Kokkos::RangePolicy>(0,nn), neighbor_functor); + } + } else { // GPU, use teams + Kokkos::deep_copy(d_mfill_offset,0); + + int atoms_per_team = FixQEqReaxFFKokkos::compute_h_teamsize; + int vector_length = FixQEqReaxFFKokkos::compute_h_vectorsize; + + int num_teams = nn / atoms_per_team + (nn % atoms_per_team ? 1 : 0); + + if (neighflag == FULL) { + Kokkos::parallel_for(Kokkos::TeamPolicy>(num_teams, atoms_per_team, + vector_length), neighbor_functor); + } else { // HALF and HALFTHREAD are the same + Kokkos::parallel_for(Kokkos::TeamPolicy>(num_teams, atoms_per_team, + vector_length), neighbor_functor); + } } } @@ -302,30 +309,9 @@ void FixQEqReaxFFKokkos::pre_force(int /*vflag*/) /* ---------------------------------------------------------------------- */ -template -// NOLINTNEXTLINE -KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::num_neigh_item(int ii, bigint &totneigh) const -{ - const int i = d_ilist[ii]; - totneigh += d_numneigh[i]; -} - -/* ---------------------------------------------------------------------- */ - template void FixQEqReaxFFKokkos::allocate_matrix() { - // determine the total space for the H matrix - - m_cap_big = 0; - - // limit scope of functor to allow deallocation of views - { - FixQEqReaxFFKokkosNumNeighFunctor neigh_functor(this); - Kokkos::parallel_reduce(nn,neigh_functor,m_cap_big); - } - // deallocate first to reduce memory overhead d_firstnbr = typename AT::t_bigint_1d(); @@ -405,305 +391,6 @@ void FixQEqReaxFFKokkos::operator()(TagQEqZero, const int &ii) const /* ---------------------------------------------------------------------- */ -template -template -// NOLINTNEXTLINE -KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::compute_h_item(int ii, bigint &m_fill, const bool &final) const -{ - const int i = d_ilist[ii]; - int j,jj,jtype; - - if (mask[i] & groupbit) { - - const KK_FLOAT xtmp = x(i,0); - const KK_FLOAT ytmp = x(i,1); - const KK_FLOAT ztmp = x(i,2); - const int itype = type(i); - const tagint itag = tag(i); - const int jnum = d_numneigh[i]; - if (final) - d_firstnbr[i] = m_fill; - - for (jj = 0; jj < jnum; jj++) { - j = d_neighbors(i,jj); - j &= NEIGHMASK; - jtype = type(j); - - const KK_FLOAT delx = x(j,0) - xtmp; - const KK_FLOAT dely = x(j,1) - ytmp; - const KK_FLOAT delz = x(j,2) - ztmp; - - if (NEIGHFLAG != FULL) { - // skip half of the interactions - const tagint jtag = tag(j); - if (j >= nlocal) { - if (itag > jtag) { - if ((itag+jtag) % 2 == 0) continue; - } else if (itag < jtag) { - if ((itag+jtag) % 2 == 1) continue; - } else { - if (x(j,2) < ztmp) continue; - if (x(j,2) == ztmp && x(j,1) < ytmp) continue; - if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue; - } - } - } - - const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz; - if (rsq > cutsq) continue; - - if (final) { - const KK_FLOAT r = sqrt(rsq); - d_jlist(m_fill) = j; - const KK_FLOAT shldij = d_shield(itype,jtype); - d_val(m_fill) = calculate_H_k(r,shldij); - } - m_fill++; - } - if (final) - d_numnbrs[i] = int(m_fill - d_firstnbr[i]); - } -} - -/* ---------------------------------------------------------------------- */ - -// Calculate Qeq matrix H where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficients on atom-i with atom-j -// d_val - contains the non-zero entries of sparse matrix H -// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i) -// d_firstnbr- d_firstnbr[i] contains the beginning index from where the H matrix entries corresponding to row-i is stored in d_val -// d_jlist - contains the column index corresponding to each entry in d_val - -template -template -// NOLINTNEXTLINE -KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::compute_h_team( - const typename Kokkos::TeamPolicy::member_type &team, - int atoms_per_team, int vector_length) const { - - // scratch space setup - Kokkos::View, - Kokkos::MemoryTraits> - s_ilist(team.team_shmem(), atoms_per_team); - Kokkos::View, - Kokkos::MemoryTraits> - s_numnbrs(team.team_shmem(), atoms_per_team); - Kokkos::View, - Kokkos::MemoryTraits> - s_firstnbr(team.team_shmem(), atoms_per_team); - - Kokkos::View, - Kokkos::MemoryTraits> - s_jtype(team.team_shmem(), atoms_per_team, vector_length); - Kokkos::View, - Kokkos::MemoryTraits> - s_jlist(team.team_shmem(), atoms_per_team, vector_length); - Kokkos::View, - Kokkos::MemoryTraits> - s_r(team.team_shmem(), atoms_per_team, vector_length); - - // team of threads work on atoms with index in [firstatom, lastatom) - int firstatom = team.league_rank() * atoms_per_team; - int lastatom = - (firstatom + atoms_per_team < nn) ? (firstatom + atoms_per_team) : nn; - - // kokkos-thread-0 is used to load info from global memory into scratch space - if (team.team_rank() == 0) { - - // copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team] - // copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team] - // calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the - // upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom]) - - Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team), - [&](const int &idx, int &totalnbrs, bool final) { - int ii = firstatom + idx; - - if (ii < nn) { - const int i = d_ilist[ii]; - int jnum = d_numneigh[i]; - - if (final) { - s_ilist[idx] = i; - s_numnbrs[idx] = jnum; - s_firstnbr[idx] = totalnbrs; - } - totalnbrs += jnum; - } else { - s_numnbrs[idx] = 0; - } - }); - } - - // barrier ensures that the data moved to scratch space is visible to all the - // threads of the corresponding team - team.team_barrier(); - - // calculate the global memory offset from where the H matrix values to be - // calculated by the current team will be stored in d_val - bigint team_firstnbr_idx = 0; - Kokkos::single(Kokkos::PerTeam(team), - [&](bigint &val) { - int totalnbrs = s_firstnbr[lastatom - firstatom - 1] + - s_numnbrs[lastatom - firstatom - 1]; - val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs); - }, - team_firstnbr_idx); - - // map the H matrix computation of each atom to kokkos-thread (one atom per - // kokkos-thread) neighbor computation for each atom is assigned to vector - // lanes of the corresponding thread - Kokkos::parallel_for( - Kokkos::TeamThreadRange(team, atoms_per_team), [&](const int &idx) { - int ii = firstatom + idx; - - if (ii < nn) { - const int i = s_ilist[idx]; - - if (mask[i] & groupbit) { - const KK_FLOAT xtmp = x(i, 0); - const KK_FLOAT ytmp = x(i, 1); - const KK_FLOAT ztmp = x(i, 2); - const int itype = type(i); - tagint itag = tag(i); // removed "const" to work around GCC 7 bug - int jnum = s_numnbrs[idx]; // removed "const" to work around GCC 7 bug - - // calculate the write-offset for atom-i's first neighbor - bigint atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx]; - Kokkos::single(Kokkos::PerThread(team), - [&]() { d_firstnbr[i] = atomi_firstnbr_idx; }); - - // current # of neighbor atoms with non-zero electrostatic - // interaction coefficients with atom-i which represents the # of - // non-zero elements in row-i of H matrix - int atomi_nbrs_inH = 0; - - // calculate H matrix values corresponding to atom-i where neighbors - // are processed in batches and the batch size is vector_length - for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) { - - bigint atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH; - - // count the # of neighbor atoms with non-zero electrostatic - // interaction coefficients with atom-i in the current batch - int atomi_nbrs_curbatch = 0; - - // compute rsq, jtype, j and store in scratch space which is - // reused later - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, vector_length), - [&](const int &idx, int &m_fill) { - const int jj = jj_start + idx; - - // initialize: -1 represents no interaction with atom-j - // where j = d_neighbors(i,jj) - s_jlist(team.team_rank(), idx) = -1; - - if (jj < jnum) { - int j = d_neighbors(i, jj); - j &= NEIGHMASK; - const int jtype = type(j); - - const KK_FLOAT delx = x(j, 0) - xtmp; - const KK_FLOAT dely = x(j, 1) - ytmp; - const KK_FLOAT delz = x(j, 2) - ztmp; - - // valid nbr interaction - bool valid = true; - if (NEIGHFLAG != FULL) { - // skip half of the interactions - const tagint jtag = tag(j); - if (j >= nlocal) { - if (itag > jtag) { - if ((itag + jtag) % 2 == 0) - valid = false; - } else if (itag < jtag) { - if ((itag + jtag) % 2 == 1) - valid = false; - } else { - if (x(j, 2) < ztmp) - valid = false; - if (x(j, 2) == ztmp && x(j, 1) < ytmp) - valid = false; - if (x(j, 2) == ztmp && x(j, 1) == ytmp && - x(j, 0) < xtmp) - valid = false; - } - } - } - - const KK_FLOAT rsq = - delx * delx + dely * dely + delz * delz; - if (rsq > cutsq) - valid = false; - - if (valid) { - s_jlist(team.team_rank(), idx) = j; - s_jtype(team.team_rank(), idx) = jtype; - s_r(team.team_rank(), idx) = sqrt(rsq); - m_fill++; - } - } - }, - atomi_nbrs_curbatch); - - // write non-zero entries of H to global memory - Kokkos::parallel_scan( - Kokkos::ThreadVectorRange(team, vector_length), - [&](const int &idx, int &m_fill, bool final) { - int j = s_jlist(team.team_rank(), idx); - if (final) { - if (j != -1) { - const int jtype = s_jtype(team.team_rank(), idx); - const KK_FLOAT r = s_r(team.team_rank(), idx); - const KK_FLOAT shldij = d_shield(itype, jtype); - - d_jlist[atomi_nbr_writeIdx + m_fill] = j; - d_val[atomi_nbr_writeIdx + m_fill] = - calculate_H_k(r, shldij); - } - } - - if (j != -1) { - m_fill++; - } - }); - atomi_nbrs_inH += atomi_nbrs_curbatch; - } - - Kokkos::single(Kokkos::PerThread(team), - [&]() { d_numnbrs[i] = atomi_nbrs_inH; }); - } - } - }); -} - -/* ---------------------------------------------------------------------- */ - -template -// NOLINTNEXTLINE -KOKKOS_INLINE_FUNCTION -KK_FLOAT FixQEqReaxFFKokkos::calculate_H_k(const KK_FLOAT &r, const KK_FLOAT &shld) const -{ - KK_FLOAT taper, denom; - - taper = d_tap[7] * r + d_tap[6]; - taper = taper * r + d_tap[5]; - taper = taper * r + d_tap[4]; - taper = taper * r + d_tap[3]; - taper = taper * r + d_tap[2]; - taper = taper * r + d_tap[1]; - taper = taper * r + d_tap[0]; - - denom = r * r * r + shld; - denom = cbrt(denom); - - return taper * static_cast(EV_TO_KCAL_PER_MOL) / denom; -} - -/* ---------------------------------------------------------------------- */ - template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION @@ -854,6 +541,29 @@ void FixQEqReaxFFKokkos::calculate_q() /* ---------------------------------------------------------------------- */ +template +// NOLINTNEXTLINE +KOKKOS_INLINE_FUNCTION +KK_FLOAT FixQEqReaxFFKokkos::calculate_H_k(const KK_FLOAT &r, const KK_FLOAT &shld) const +{ + KK_FLOAT taper, denom; + + taper = d_tap[7] * r + d_tap[6]; + taper = taper * r + d_tap[5]; + taper = taper * r + d_tap[4]; + taper = taper * r + d_tap[3]; + taper = taper * r + d_tap[2]; + taper = taper * r + d_tap[1]; + taper = taper * r + d_tap[0]; + + denom = r * r * r + shld; + denom = cbrt(denom); + + return taper * static_cast(EV_TO_KCAL_PER_MOL) / denom; +} + +/* ---------------------------------------------------------------------- */ + template void FixQEqReaxFFKokkos::sparse_matvec_kokkos(typename AT::t_kkfloat_1d_2 &d_xx_in) { @@ -881,15 +591,28 @@ void FixQEqReaxFFKokkos::sparse_matvec_kokkos(typename AT::t_kkfloat if (need_dup) dup_o.reset_except(d_o); - if (neighflag == HALF) - Kokkos::parallel_for(Kokkos::TeamPolicy>(leaguesize, teamsize, vectorsize), *this); - else if (neighflag == HALFTHREAD) - Kokkos::parallel_for(Kokkos::TeamPolicy>(leaguesize, teamsize, vectorsize), *this); + if (matrix_free) { + if (neighflag == HALF) + Kokkos::parallel_for(Kokkos::RangePolicy>(0,nn),*this); + else if (neighflag == HALFTHREAD) + Kokkos::parallel_for(Kokkos::RangePolicy>(0,nn),*this); + } else { + if (neighflag == HALF) + Kokkos::parallel_for(Kokkos::TeamPolicy>(leaguesize, teamsize, vectorsize), *this); + else if (neighflag == HALFTHREAD) + Kokkos::parallel_for(Kokkos::TeamPolicy>(leaguesize, teamsize, vectorsize), *this); + } if (need_dup) Kokkos::Experimental::contribute(d_o, dup_o); - } else // FULL - Kokkos::parallel_for(Kokkos::TeamPolicy (leaguesize, teamsize, vectorsize), *this); + } else { + // FULL + if (matrix_free) { + Kokkos::parallel_for(Kokkos::RangePolicy(0,nn),*this); + } else { + Kokkos::parallel_for(Kokkos::TeamPolicy (leaguesize, teamsize, vectorsize), *this); + } + } } /* ---------------------------------------------------------------------- */ @@ -972,7 +695,7 @@ void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Half // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full, const membertype_vec &team) const +void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full, const typename Kokkos::TeamPolicy::member_type &team) const { int k = team.league_rank() * team.team_size() + team.team_rank(); if (k < nn) { @@ -999,6 +722,154 @@ void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full, const /* ---------------------------------------------------------------------- */ +// half matrix-free form of the sparse mat-vec +template +template +// NOLINTNEXTLINE +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Half_MatrixFree, const int &ii) const +{ + // The o array is duplicated for OpenMP, atomic for GPU, and neither for Serial + auto v_o = ScatterViewHelper,decltype(dup_o),decltype(ndup_o)>::get(dup_o,ndup_o); + auto a_o = v_o.template access>(); + + const int i = d_ilist[ii]; + int j, jj, jtype; + + if (mask[i] & groupbit) { + const KK_FLOAT xtmp = x(i,0); + const KK_FLOAT ytmp = x(i,1); + const KK_FLOAT ztmp = x(i,2); + const int itype = type(i); + const tagint itag = tag(i); + const int jnum = d_numneigh[i]; + + KK_FLOAT2 doi = KK_FLOAT2(); + const KK_FLOAT2 xx_i = { d_xx(i,0), d_xx(i,1) }; + + for (jj = 0; jj < jnum; jj++) { + j = d_neighbors(i,jj); + j &= NEIGHMASK; + jtype = type(j); + + const KK_FLOAT delx = x(j,0) - xtmp; + const KK_FLOAT dely = x(j,1) - ytmp; + const KK_FLOAT delz = x(j,2) - ztmp; + + // always true + //if (neighflag != FULL) { + // skip half of the interactions + const tagint jtag = tag(j); + bool skip = [&]() -> bool { + if (j >= nlocal) { + if (itag > jtag) { + if ((itag+jtag) % 2 == 0) return true; + } else if (itag < jtag) { + if ((itag+jtag) % 2 == 1) return true; + } else { + if (x(j,2) < ztmp) return true; + if (x(j,2) == ztmp && x(j,1) < ytmp) return true; + if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) return true; + } + } + return false; + }(); + + if (!skip) { + const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz; + if (rsq <= cutsq) { + const KK_FLOAT r = sqrt(rsq); + const KK_FLOAT shldij = d_shield(itype, jtype); + + // we split this out here for mixed precision calculations + double h_k_val = calculate_H_k(r, shldij); + + const KK_FLOAT2 xx_j = { d_xx(j,0), d_xx(j,1) }; + if (!(converged & 1)) { + doi.v[0] += h_k_val * xx_j.v[0]; + a_o(j,0) += h_k_val * xx_i.v[0]; + } + + if (!(converged & 2)) { + doi.v[1] += h_k_val * xx_j.v[1]; + a_o(j,1) += h_k_val * xx_i.v[1]; + } + } + } + } + + if (!(converged & 1)) { + a_o(i,0) += doi.v[0]; + } + if (!(converged & 2)) { + a_o(i,1) += doi.v[1]; + } + } +} + + +/* ---------------------------------------------------------------------- */ + +// matrix-free form of the sparse mat-vec +template +// NOLINTNEXTLINE +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full_MatrixFree, const int &ii) const +{ + const int i = d_ilist[ii]; + int j, jj, jtype; + + if (mask[i] & groupbit) { + const KK_FLOAT xtmp = x(i,0); + const KK_FLOAT ytmp = x(i,1); + const KK_FLOAT ztmp = x(i,2); + const int itype = type(i); + const tagint itag = tag(i); + const int jnum = d_numneigh[i]; + + KK_FLOAT2 doi = KK_FLOAT2(); + + for (jj = 0; jj < jnum; jj++) { + j = d_neighbors(i,jj); + j &= NEIGHMASK; + jtype = type(j); + + const KK_FLOAT delx = x(j,0) - xtmp; + const KK_FLOAT dely = x(j,1) - ytmp; + const KK_FLOAT delz = x(j,2) - ztmp; + + const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz; + if (rsq <= cutsq) { + const KK_FLOAT r = sqrt(rsq); + const KK_FLOAT shldij = d_shield(itype, jtype); + + // we split this out here for mixed precision calculations + double h_k_val = calculate_H_k(r, shldij); + + const KK_FLOAT2 xx = { d_xx(j,0), d_xx(j,1) }; + if (!(converged & 1)) + doi.v[0] += h_k_val * xx.v[0]; + + if (!(converged & 2)) + doi.v[1] += h_k_val * xx.v[1]; + } + + } + + KK_FLOAT2 o = { d_o(i,0), d_o(i,1) }; + if (!(converged & 1)) + o.v[0] += doi.v[0]; + + if (!(converged & 2)) + o.v[1] += doi.v[1]; + + d_o(i,0) = o.v[0]; + d_o(i,1) = o.v[1]; + } +} + +/* ---------------------------------------------------------------------- */ + template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION @@ -1528,6 +1399,314 @@ void FixQEqReaxFFKokkos::get_chi_field() /* ---------------------------------------------------------------------- */ +template +void FixQEqReaxFFKokkosNeighborFunctor::update_after_allocation(const FixQEqReaxFFKokkos &qeqreax) +{ + d_firstnbr = qeqreax.d_firstnbr; + d_numnbrs = qeqreax.d_numnbrs; + d_jlist = qeqreax.d_jlist; + d_val = qeqreax.d_val; +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkosNeighborFunctor::operator()(TagQEqNeighborNumNeigh, const int& ii, bigint& totneigh) const +{ + totneigh += static_cast(d_numneigh[ii]); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +KK_FLOAT FixQEqReaxFFKokkosNeighborFunctor::calculate_H_k(const KK_FLOAT &r, const KK_FLOAT &shld) const +{ + KK_FLOAT taper, denom; + + taper = d_tap[7] * r + d_tap[6]; + taper = taper * r + d_tap[5]; + taper = taper * r + d_tap[4]; + taper = taper * r + d_tap[3]; + taper = taper * r + d_tap[2]; + taper = taper * r + d_tap[1]; + taper = taper * r + d_tap[0]; + + denom = r * r * r + shld; + denom = cbrt(denom); + + return taper * static_cast(EV_TO_KCAL_PER_MOL) / denom; +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkosNeighborFunctor::operator()(TagQEqNeighborComputeH, const int& ii, bigint &m_fill, const bool &final) const +{ + const int i = d_ilist[ii]; + int j,jj,jtype; + + if (mask[i] & groupbit) { + + const KK_FLOAT xtmp = x(i,0); + const KK_FLOAT ytmp = x(i,1); + const KK_FLOAT ztmp = x(i,2); + const int itype = type(i); + const tagint itag = tag(i); + const int jnum = d_numneigh[i]; + if (final) + d_firstnbr[i] = m_fill; + + for (jj = 0; jj < jnum; jj++) { + j = d_neighbors(i,jj); + j &= NEIGHMASK; + jtype = type(j); + + const KK_FLOAT delx = x(j,0) - xtmp; + const KK_FLOAT dely = x(j,1) - ytmp; + const KK_FLOAT delz = x(j,2) - ztmp; + + if (NEIGHFLAG != FULL) { + // skip half of the interactions + const tagint jtag = tag(j); + if (j >= nlocal) { + if (itag > jtag) { + if ((itag+jtag) % 2 == 0) continue; + } else if (itag < jtag) { + if ((itag+jtag) % 2 == 1) continue; + } else { + if (x(j,2) < ztmp) continue; + if (x(j,2) == ztmp && x(j,1) < ytmp) continue; + if (x(j,2) == ztmp && x(j,1) == ytmp && x(j,0) < xtmp) continue; + } + } + } + + const KK_FLOAT rsq = delx*delx + dely*dely + delz*delz; + if (rsq > cutsq) continue; + + if (final) { + const KK_FLOAT r = sqrt(rsq); + d_jlist(m_fill) = j; + const KK_FLOAT shldij = d_shield(itype,jtype); + d_val(m_fill) = calculate_H_k(r,shldij); + } + m_fill++; + } + if (final) + d_numnbrs[i] = int(m_fill - d_firstnbr[i]); + } +} + +// Calculate Qeq matrix H where H is a sparse matrix and H[i][j] represents the electrostatic interaction coefficients on atom-i with atom-j +// d_val - contains the non-zero entries of sparse matrix H +// d_numnbrs - d_numnbrs[i] contains the # of non-zero entries in the i-th row of H (which also represents the # of neighbor atoms with electrostatic interaction coefficients with atom-i) +// d_firstnbr- d_firstnbr[i] contains the beginning index from where the H matrix entries corresponding to row-i is stored in d_val +// d_jlist - contains the column index corresponding to each entry in d_val +template +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkosNeighborFunctor::operator()(TagQEqNeighborComputeH, const typename Kokkos::TeamPolicy::member_type &team) const +{ + + // scratch space setup + Kokkos::View, + Kokkos::MemoryTraits> + s_ilist(team.team_shmem(), atoms_per_team); + Kokkos::View, + Kokkos::MemoryTraits> + s_numnbrs(team.team_shmem(), atoms_per_team); + Kokkos::View, + Kokkos::MemoryTraits> + s_firstnbr(team.team_shmem(), atoms_per_team); + + Kokkos::View, + Kokkos::MemoryTraits> + s_jtype(team.team_shmem(), atoms_per_team, vector_length); + Kokkos::View, + Kokkos::MemoryTraits> + s_jlist(team.team_shmem(), atoms_per_team, vector_length); + Kokkos::View, + Kokkos::MemoryTraits> + s_r(team.team_shmem(), atoms_per_team, vector_length); + + // team of threads work on atoms with index in [firstatom, lastatom) + int firstatom = team.league_rank() * atoms_per_team; + int lastatom = + (firstatom + atoms_per_team < nn) ? (firstatom + atoms_per_team) : nn; + + // kokkos-thread-0 is used to load info from global memory into scratch space + if (team.team_rank() == 0) { + + // copy atom indices from d_ilist[firstatom:lastatom] to scratch space s_ilist[0:atoms_per_team] + // copy # of neighbor atoms for all the atoms with indices in d_ilist[firstatom:lastatom] from d_numneigh to scratch space s_numneigh[0:atoms_per_team] + // calculate total number of neighbor atoms for all atoms assigned to the current team of threads (Note - Total # of neighbor atoms here provides the + // upper bound space requirement to store the H matrix values corresponding to the atoms with indices in d_ilist[firstatom:lastatom]) + + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, atoms_per_team), + [&](const int &idx, int &totalnbrs, bool final) { + int ii = firstatom + idx; + + if (ii < nn) { + const int i = d_ilist[ii]; + int jnum = d_numneigh[i]; + + if (final) { + s_ilist[idx] = i; + s_numnbrs[idx] = jnum; + s_firstnbr[idx] = totalnbrs; + } + totalnbrs += jnum; + } else { + s_numnbrs[idx] = 0; + } + }); + } + + // barrier ensures that the data moved to scratch space is visible to all the + // threads of the corresponding team + team.team_barrier(); + + // calculate the global memory offset from where the H matrix values to be + // calculated by the current team will be stored in d_val + bigint team_firstnbr_idx = 0; + Kokkos::single(Kokkos::PerTeam(team), + [&](bigint &val) { + int totalnbrs = s_firstnbr[lastatom - firstatom - 1] + + s_numnbrs[lastatom - firstatom - 1]; + val = Kokkos::atomic_fetch_add(&d_mfill_offset(), totalnbrs); + }, + team_firstnbr_idx); + + // map the H matrix computation of each atom to kokkos-thread (one atom per + // kokkos-thread) neighbor computation for each atom is assigned to vector + // lanes of the corresponding thread + Kokkos::parallel_for(Kokkos::TeamThreadRange(team, atoms_per_team), + [&](const int &idx) { + int ii = firstatom + idx; + + if (ii < nn) { + const int i = s_ilist[idx]; + + if (mask[i] & groupbit) { + const KK_FLOAT xtmp = x(i, 0); + const KK_FLOAT ytmp = x(i, 1); + const KK_FLOAT ztmp = x(i, 2); + const int itype = type(i); + tagint itag = tag(i); // removed "const" to work around GCC 7 bug + int jnum = s_numnbrs[idx]; // removed "const" to work around GCC 7 bug + + // calculate the write-offset for atom-i's first neighbor + bigint atomi_firstnbr_idx = team_firstnbr_idx + s_firstnbr[idx]; + Kokkos::single(Kokkos::PerThread(team), + [&]() { d_firstnbr[i] = atomi_firstnbr_idx; }); + + // current # of neighbor atoms with non-zero electrostatic + // interaction coefficients with atom-i which represents the # of + // non-zero elements in row-i of H matrix + int atomi_nbrs_inH = 0; + + // calculate H matrix values corresponding to atom-i where neighbors + // are processed in batches and the batch size is vector_length + for (int jj_start = 0; jj_start < jnum; jj_start += vector_length) { + + bigint atomi_nbr_writeIdx = atomi_firstnbr_idx + atomi_nbrs_inH; + + // count the # of neighbor atoms with non-zero electrostatic + // interaction coefficients with atom-i in the current batch + int atomi_nbrs_curbatch = 0; + + // compute rsq, jtype, j and store in scratch space which is + // reused later + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, vector_length), + [&](const int &idx, int &m_fill) { + const int jj = jj_start + idx; + + // initialize: -1 represents no interaction with atom-j + // where j = d_neighbors(i,jj) + s_jlist(team.team_rank(), idx) = -1; + + if (jj < jnum) { + int j = d_neighbors(i, jj); + j &= NEIGHMASK; + const int jtype = type(j); + + const KK_FLOAT delx = x(j, 0) - xtmp; + const KK_FLOAT dely = x(j, 1) - ytmp; + const KK_FLOAT delz = x(j, 2) - ztmp; + + // valid nbr interaction + bool valid = true; + if (NEIGHFLAG != FULL) { + // skip half of the interactions + const tagint jtag = tag(j); + if (j >= nlocal) { + if (itag > jtag) { + if ((itag + jtag) % 2 == 0) + valid = false; + } else if (itag < jtag) { + if ((itag + jtag) % 2 == 1) + valid = false; + } else { + if (x(j, 2) < ztmp) + valid = false; + if (x(j, 2) == ztmp && x(j, 1) < ytmp) + valid = false; + if (x(j, 2) == ztmp && x(j, 1) == ytmp && + x(j, 0) < xtmp) + valid = false; + } + } + } + + const KK_FLOAT rsq = + delx * delx + dely * dely + delz * delz; + if (rsq > cutsq) + valid = false; + + if (valid) { + s_jlist(team.team_rank(), idx) = j; + s_jtype(team.team_rank(), idx) = jtype; + s_r(team.team_rank(), idx) = sqrt(rsq); + m_fill++; + } + } + }, + atomi_nbrs_curbatch); + + // write non-zero entries of H to global memory + Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team, vector_length), + [&](const int &idx, int &m_fill, bool final) { + int j = s_jlist(team.team_rank(), idx); + if (final) { + if (j != -1) { + const int jtype = s_jtype(team.team_rank(), idx); + const KK_FLOAT r = s_r(team.team_rank(), idx); + const KK_FLOAT shldij = d_shield(itype, jtype); + + d_jlist[atomi_nbr_writeIdx + m_fill] = j; + d_val[atomi_nbr_writeIdx + m_fill] = + calculate_H_k(r, shldij); + } + } + + if (j != -1) { + m_fill++; + } + }); + atomi_nbrs_inH += atomi_nbrs_curbatch; + } + + Kokkos::single(Kokkos::PerThread(team), + [&]() { d_numnbrs[i] = atomi_nbrs_inH; }); + } + } + }); +} + namespace LAMMPS_NS { template class FixQEqReaxFFKokkos; #ifdef LMP_KOKKOS_GPU diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.h b/src/KOKKOS/fix_qeq_reaxff_kokkos.h index 822cfa1cedb..424cd947706 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.h +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.h @@ -34,6 +34,10 @@ FixStyle(qeq/reax/kk/host,FixQEqReaxFFKokkos); namespace LAMMPS_NS { +// Forward declare a neighbor helper +template +struct FixQEqReaxFFKokkosNeighborFunctor; + struct TagQEqZero{}; struct TagQEqInitMatvec{}; struct TagQEqSparseMatvec1{}; @@ -41,8 +45,13 @@ struct TagQEqZeroQGhosts{}; template struct TagQEqSparseMatvec2_Half{}; - struct TagQEqSparseMatvec2_Full{}; + +// Matrix-free version of the sparse mat-vec operations +template +struct TagQEqSparseMatvec2_Half_MatrixFree{}; +struct TagQEqSparseMatvec2_Full_MatrixFree{}; + struct TagQEqNorm1{}; struct TagQEqDot1{}; struct TagQEqDot2{}; @@ -60,6 +69,7 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { public: typedef DeviceType device_type; typedef ArrayTypes AT; + static constexpr bool is_host = (ExecutionSpaceFromDevice::space == HostKK); typedef KK_double2 value_type; FixQEqReaxFFKokkos(class LAMMPS *, int, char **); ~FixQEqReaxFFKokkos() override; @@ -81,33 +91,31 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { KOKKOS_INLINE_FUNCTION void operator()(TagQEqInitMatvec, const int&) const; - template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void compute_h_item(int, bigint &, const bool &) const; + void operator()(TagQEqSparseMatvec1, const int&) const; - template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void compute_h_team(const typename Kokkos::TeamPolicy::member_type &team, int, int) const; + void operator()(TagQEqZeroQGhosts, const int&) const; + template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(TagQEqSparseMatvec1, const int&) const; + void operator()(TagQEqSparseMatvec2_Half, const typename Kokkos::TeamPolicy>::member_type &team) const; // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(TagQEqZeroQGhosts, const int&) const; + void operator()(TagQEqSparseMatvec2_Full, const typename Kokkos::TeamPolicy::member_type &team) const; template // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(TagQEqSparseMatvec2_Half, const typename Kokkos::TeamPolicy>::member_type &team) const; + void operator()(TagQEqSparseMatvec2_Half_MatrixFree, const int&) const; - typedef typename Kokkos::TeamPolicy::member_type membertype_vec; // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(TagQEqSparseMatvec2_Full, const membertype_vec &team) const; + void operator()(TagQEqSparseMatvec2_Full_MatrixFree, const int&) const; // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION @@ -312,53 +320,82 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { int pack_exchange(int, double *) override; int unpack_exchange(int, double *) override; void get_chi_field() override; + + // Let the neighbor count functor access some of my members + friend class FixQEqReaxFFKokkosNeighborFunctor; }; +// Custom functor to count various things about the number of neighbors +struct TagQEqNeighborNumNeigh{}; + +template +struct TagQEqNeighborComputeH{}; + template -struct FixQEqReaxFFKokkosNumNeighFunctor { - typedef DeviceType device_type; - typedef ArrayTypes AT; - typedef bigint value_type; - FixQEqReaxFFKokkos c; - FixQEqReaxFFKokkosNumNeighFunctor(FixQEqReaxFFKokkos* c_ptr):c(*c_ptr) { - c.cleanup_copy(); - }; -// NOLINTNEXTLINE - KOKKOS_INLINE_FUNCTION - void operator()(const int ii, bigint &totneigh) const { - c.num_neigh_item(ii, totneigh); - } -}; +struct FixQEqReaxFFKokkosNeighborFunctor { + using device_type = DeviceType; + using value_type = bigint; + using scratch_space = Kokkos::ScratchMemorySpace; + using AT = ArrayTypes; -template -struct FixQEqReaxFFKokkosComputeHFunctor { - int atoms_per_team, vector_length; - typedef bigint value_type; - typedef Kokkos::ScratchMemorySpace scratch_space; - FixQEqReaxFFKokkos c; + typename AT::t_kkfloat_1d_3_lr_const x; + typename AT::t_int_1d_const type, mask; + typename AT::t_tagint_1d_const tag; - FixQEqReaxFFKokkosComputeHFunctor(FixQEqReaxFFKokkos* c_ptr):c(*c_ptr) { - c.cleanup_copy(); - }; + typename AT::t_neighbors_2d_const d_neighbors; + typename AT::t_int_1d_randomread d_ilist, d_numneigh; - FixQEqReaxFFKokkosComputeHFunctor(FixQEqReaxFFKokkos *c_ptr, - int _atoms_per_team, int _vector_length) - : atoms_per_team(_atoms_per_team), vector_length(_vector_length), c(*c_ptr) { - c.cleanup_copy(); - }; + typename AT::t_bigint_1d d_firstnbr; + typename AT::t_int_1d d_numnbrs; + typename AT::t_int_1d d_jlist; + typename AT::t_kkfloat_1d d_val; + typename AT::t_kkfloat_2d_const d_shield; + typename AT::t_kkfloat_1d_const d_tap; -// NOLINTNEXTLINE + typename AT::t_bigint_scalar d_mfill_offset; + + int nlocal, nn, nmax; + int groupbit, neighflag; + KK_FLOAT cutsq; + + static constexpr int atoms_per_team = FixQEqReaxFFKokkos::compute_h_teamsize; + static constexpr int vector_length = FixQEqReaxFFKokkos::compute_h_vectorsize; + + // no default constructor + FixQEqReaxFFKokkosNeighborFunctor() = delete; + + FixQEqReaxFFKokkosNeighborFunctor(const FixQEqReaxFFKokkos &qeqreax) + : x(qeqreax.x), type(qeqreax.type), mask(qeqreax.mask), tag(qeqreax.tag), + d_neighbors(qeqreax.d_neighbors), d_ilist(qeqreax.d_ilist), d_numneigh(qeqreax.d_numneigh), + d_firstnbr(qeqreax.d_firstnbr), d_numnbrs(qeqreax.d_numnbrs), d_jlist(qeqreax.d_jlist), + d_val(qeqreax.d_val), d_shield(qeqreax.d_shield), d_tap(qeqreax.d_tap), d_mfill_offset(qeqreax.d_mfill_offset), + nlocal(qeqreax.nlocal), nn(qeqreax.nn), nmax(qeqreax.nmax), groupbit(qeqreax.groupbit), + neighflag(qeqreax.neighflag == FULL ? FULL : HALF), cutsq(qeqreax.cutsq) + { ; } + + void update_after_allocation(const FixQEqReaxFFKokkos &qeqreax); + + // Function to calculate H based on r and shld + // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()(const int ii, bigint &m_fill, const bool &final) const { - c.template compute_h_item(ii,m_fill,final); - } + KK_FLOAT calculate_H_k(const KK_FLOAT &r, const KK_FLOAT &shld) const; -// NOLINTNEXTLINE + // Count the number of neighbors w/in the cutoff radius + // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION - void operator()( - const typename Kokkos::TeamPolicy::member_type &team) const { - c.template compute_h_team(team, atoms_per_team, vector_length); - } + void operator()(TagQEqNeighborNumNeigh, const int& ii, bigint& neigh_count) const; + + // Calculate H deterministically w/a scan, CPU version + template + // NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + void operator()(TagQEqNeighborComputeH, const int& ii, bigint &m_fill, const bool &final) const; + + // Calculate H, GPU Team parallelism version + template + // NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + void operator()(TagQEqNeighborComputeH, const typename Kokkos::TeamPolicy::member_type &team) const; [[nodiscard]] size_t team_shmem_size(int /*team_size*/) const { size_t shmem_size = @@ -377,6 +414,7 @@ struct FixQEqReaxFFKokkosComputeHFunctor { vector_length); // s_r return shmem_size; } + }; } diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h index 239dab92868..e778cb15d5e 100644 --- a/src/KOKKOS/kokkos_type.h +++ b/src/KOKKOS/kokkos_type.h @@ -495,6 +495,12 @@ struct alignas(2*sizeof(double)) s_KK_double2 { s_KK_double2() { v[0] = v[1] = 0.0; } +// NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + s_KK_double2(const double &x, const double &y) { + v[0] = x; + v[1] = y; + } // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION @@ -514,6 +520,13 @@ struct alignas(2*sizeof(KK_FLOAT)) s_KK_FLOAT2 { v[0] = v[1] = 0; } +// NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + s_KK_FLOAT2(const KK_FLOAT &x, const KK_FLOAT &y) { + v[0] = x; + v[1] = y; + } + // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION void operator+=(const s_KK_FLOAT2 &rhs) { diff --git a/src/REAXFF/fix_qeq_reaxff.cpp b/src/REAXFF/fix_qeq_reaxff.cpp index 4ce4d0fa6ea..8d79544f28b 100644 --- a/src/REAXFF/fix_qeq_reaxff.cpp +++ b/src/REAXFF/fix_qeq_reaxff.cpp @@ -89,10 +89,14 @@ FixQEqReaxFF::FixQEqReaxFF(LAMMPS *lmp, int narg, char **arg) : dual_enabled = 0; + // matrix-free support only available for Kokkos backend + matrix_free = 0; // default to false + int iarg = 8; while (iarg < narg) { if (strcmp(arg[iarg],"dual") == 0) dual_enabled = 1; else if (strcmp(arg[iarg],"nowarn") == 0) maxwarn = 0; + else if (strcmp(arg[iarg],"matrix_free") == 0) matrix_free = 1; else if (strcmp(arg[iarg],"maxiter") == 0) { if (iarg+1 > narg-1) error->all(FLERR, iarg, "Illegal fix {} command", style); diff --git a/src/REAXFF/fix_qeq_reaxff.h b/src/REAXFF/fix_qeq_reaxff.h index 591dee6fc02..89ff441c821 100644 --- a/src/REAXFF/fix_qeq_reaxff.h +++ b/src/REAXFF/fix_qeq_reaxff.h @@ -63,6 +63,7 @@ class FixQEqReaxFF : public Fix { int n_cap, nmax, m_cap; int pack_flag; int nlevels_respa; + int matrix_free; // kokkos backend; whether or not to use the matrix-free form class NeighList *list; class PairReaxFF *reaxff; class FixEfield *efield; From 9af8e6dfede4262fde349ab2c93d0e7bbdad8052 Mon Sep 17 00:00:00 2001 From: Balint Joo Date: Tue, 17 Feb 2026 07:18:09 -0800 Subject: [PATCH 02/14] Added support for device-offloaded reverse fix comms to the Kokkos package --- src/KOKKOS/comm_kokkos.cpp | 82 +++++++++++++++++++++++++++- src/KOKKOS/comm_kokkos.h | 2 + src/KOKKOS/fix_qeq_reaxff_kokkos.cpp | 40 +++++++++++++- src/KOKKOS/fix_qeq_reaxff_kokkos.h | 14 +++++ src/KOKKOS/kokkos.cpp | 23 ++++++-- src/KOKKOS/kokkos.h | 2 + src/KOKKOS/kokkos_base.h | 6 ++ src/fix.cpp | 2 +- src/fix.h | 1 + 9 files changed, 163 insertions(+), 9 deletions(-) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 00cdc31f5b6..a4f1eea9968 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -113,6 +113,7 @@ void CommKokkos::init() reverse_pair_comm_legacy = lmp->kokkos->reverse_pair_comm_legacy; forward_fix_comm_legacy = lmp->kokkos->forward_fix_comm_legacy; reverse_comm_legacy = lmp->kokkos->reverse_comm_legacy; + reverse_fix_comm_legacy = lmp->kokkos->reverse_fix_comm_legacy; exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host; forward_comm_on_host = lmp->kokkos->forward_comm_on_host; reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host; @@ -444,10 +445,87 @@ void CommKokkos::forward_comm_device(Fix *fix, int size) void CommKokkos::reverse_comm(Fix *fix, int size) { - k_sendlist.sync_host(); - CommBrick::reverse_comm(fix, size); + if(fix->execution_space == Host || fix->execution_space == HostKK || !fix->reverse_comm_device || reverse_fix_comm_legacy ) { + k_sendlist.sync_host(); + CommBrick::reverse_comm(fix, size); + } + else { + k_sendlist.sync_device(); + reverse_comm_device(fix,size); + } } +/* ---------------------------------------------------------------------- */ + +template +void CommKokkos::reverse_comm_device(Fix *fix, int size) +{ + int iswap, n, nsize; + MPI_Request request; + DAT::tdual_double_1d k_buf_tmp; + + if (size) nsize = size; + else nsize = fix->comm_reverse; + KokkosBase* fixKKBase = dynamic_cast(fix); + + for (iswap = 0; iswap < nswap; iswap++) { + int n = MAX(max_buf_fix,nsize*sendnum[iswap]); + n = MAX(n,nsize*recvnum[iswap]); + if (n > max_buf_fix) + grow_buf_fix(n); + } + + // exchange data with another proc + // if other proc is self, just copy + + for (iswap = nswap-1; iswap >= 0; iswap--) { + + n = fixKKBase->pack_reverse_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send_fix); + + // exchange with another proc + // if self, set recv buffer to send buffer + + if (sendproc[iswap] != me) { + double* buf_send_fix; + double* buf_recv_fix; + if (lmp->kokkos->gpu_aware_flag) { + buf_send_fix = k_buf_send_fix.view().data(); + buf_recv_fix = k_buf_recv_fix.view().data(); + } else { + k_buf_send_fix.modify(); + k_buf_send_fix.sync(); + buf_send_fix = k_buf_send_fix.view_host().data(); + buf_recv_fix = k_buf_recv_fix.view_host().data(); + } + + if (sendnum[iswap]) { + DeviceType().fence(); + MPI_Irecv(buf_recv_fix,nsize*sendnum[iswap],MPI_DOUBLE, + sendproc[iswap],0,world,&request); + } + if (recvnum[iswap]) { + DeviceType().fence(); + MPI_Send(buf_send_fix,n,MPI_DOUBLE,recvproc[iswap],0,world); + } + if (sendnum[iswap]) { + MPI_Wait(&request,MPI_STATUS_IGNORE); + DeviceType().fence(); + } + + if (!lmp->kokkos->gpu_aware_flag) { + k_buf_recv_fix.modify(); + k_buf_recv_fix.sync(); + } + k_buf_tmp = k_buf_recv_fix; + } else k_buf_tmp = k_buf_send_fix; + + // unpack buffer + auto k_sendlist_iswap = Kokkos::subview(k_sendlist,iswap,Kokkos::ALL); + fixKKBase->unpack_reverse_comm_kokkos(sendnum[iswap], k_sendlist_iswap,k_buf_tmp); + DeviceType().fence(); + } + +} /* ---------------------------------------------------------------------- reverse communication invoked by a Fix with variable size data diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h index b91740304d2..875ed290d8a 100644 --- a/src/KOKKOS/comm_kokkos.h +++ b/src/KOKKOS/comm_kokkos.h @@ -30,6 +30,7 @@ class CommKokkos : public CommBrick { bool reverse_pair_comm_legacy; bool forward_fix_comm_legacy; bool reverse_comm_legacy; + bool reverse_fix_comm_legacy; bool exchange_comm_on_host; bool forward_comm_on_host; bool reverse_comm_on_host; @@ -64,6 +65,7 @@ class CommKokkos : public CommBrick { template void forward_comm_device(Pair *pair, int size=0); template void reverse_comm_device(Pair *pair, int size=0); template void forward_comm_device(Fix *fix, int size=0); + template void reverse_comm_device(Fix *fix, int size=0); template void exchange_device(); template void borders_device(); diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index 2ebd8663347..f0480bdb3e3 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -55,7 +55,7 @@ FixQEqReaxFFKokkos(LAMMPS *lmp, int narg, char **arg) : { kokkosable = 1; comm_forward = comm_reverse = 2; // fused - forward_comm_device = exchange_comm_device = sort_device = 1; + forward_comm_device = reverse_comm_device = exchange_comm_device = sort_device = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; @@ -1136,7 +1136,45 @@ void FixQEqReaxFFKokkos::unpack_forward_comm(int n, int first, doubl } /* ---------------------------------------------------------------------- */ +template +int FixQEqReaxFFKokkos::pack_reverse_comm_kokkos(int n, int first_in, DAT::tdual_double_1d &buf) +{ + first = first_in; + d_buf = buf.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); + return n*2; +} + +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkos::operator()(TagQEqPackReverseComm, const int &i) const +{ + //const auto d = d_o(i+first); + if (!( converged & 1)) d_buf[2*i] = d_o(i+first,0); + if (!( converged & 2)) d_buf[2*i+1] = d_o(i+first,1); +} + +/* ---------------------------------------------------------------------- */ +template +void FixQEqReaxFFKokkos::unpack_reverse_comm_kokkos(int n, DAT::tdual_int_1d k_sendlist, DAT::tdual_double_1d& buf) +{ + d_buf = buf.view(); + d_sendlist = k_sendlist.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); +} + +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxFFKokkos::operator()(TagQEqUnpackReverseComm, const int &i) const +{ + int j = d_sendlist(i); + + if ( !(converged & 1)) d_o(j,0) += d_buf[2*i]; + if ( !(converged & 2)) d_o(j,1) += d_buf[2*i+1]; +} + +/* ---------------------------------------------------------------------- */ template int FixQEqReaxFFKokkos::pack_reverse_comm(int n, int first, double *buf) { diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.h b/src/KOKKOS/fix_qeq_reaxff_kokkos.h index 424cd947706..aced8c082eb 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.h +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.h @@ -61,6 +61,8 @@ struct TagQEqSum2{}; struct TagQEqCalculateQ{}; struct TagQEqPackForwardComm{}; struct TagQEqUnpackForwardComm{}; +struct TagQEqPackReverseComm{}; +struct TagQEqUnpackReverseComm{}; struct TagQEqPackExchange{}; struct TagQEqUnpackExchange{}; @@ -157,6 +159,14 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { KOKKOS_INLINE_FUNCTION void operator()(TagQEqUnpackForwardComm, const int&) const; +// NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + void operator()(TagQEqPackReverseComm, const int&) const; + +// NOLINTNEXTLINE + KOKKOS_INLINE_FUNCTION + void operator()(TagQEqUnpackReverseComm, const int&) const; + // NOLINTNEXTLINE KOKKOS_INLINE_FUNCTION void operator()(TagQEqPackExchange, const int&) const; @@ -190,7 +200,11 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { void unpack_forward_comm_kokkos(int, int, DAT::tdual_double_1d&) override; int pack_forward_comm(int, int *, double *, int, int *) override; void unpack_forward_comm(int, int, double *) override; + + int pack_reverse_comm_kokkos(int, int, DAT::tdual_double_1d&) override; int pack_reverse_comm(int, int, double *) override; + + void unpack_reverse_comm_kokkos(int, DAT::tdual_int_1d, DAT::tdual_double_1d&) override; void unpack_reverse_comm(int, int *, double *) override; double memory_usage() override; void sparse_matvec_kokkos(typename AT::t_kkfloat_1d_2 &); diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 8827a73f6e8..2ea9612d673 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -68,6 +68,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) forward_pair_comm_changed = 0; reverse_pair_comm_changed = 0; forward_fix_comm_changed = 0; + reverse_fix_comm_changed = 0; reverse_comm_changed = 0; sort_changed = atom_map_changed = 0; @@ -287,7 +288,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) newtonflag = 0; exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 0; + forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 0; sort_legacy = 0; atom_map_legacy = 0; @@ -303,7 +304,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) newtonflag = 1; exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 1; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; sort_legacy = 1; atom_map_legacy = 1; @@ -488,17 +489,17 @@ void KokkosLMP::accelerator(int narg, char **arg) if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 1; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else if (strcmp(arg[iarg+1],"host") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1; } else if (strcmp(arg[iarg+1],"device") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = 0; + forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); @@ -551,6 +552,14 @@ void KokkosLMP::accelerator(int narg, char **arg) else error->all(FLERR,"Illegal package kokkos command"); forward_fix_comm_changed = 0; iarg += 2; + } else if (strcmp(arg[iarg],"comm/fix/reverse") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + if (strcmp(arg[iarg+1],"no") == 0) reverse_fix_comm_legacy = 1; + else if (strcmp(arg[iarg+1],"host") == 0) reverse_fix_comm_legacy = 1; + else if (strcmp(arg[iarg+1],"device") == 0) reverse_fix_comm_legacy = 0; + else error->all(FLERR,"Illegal package kokkos command"); + reverse_fix_comm_changed = 0; + iarg += 2; } else if (strcmp(arg[iarg],"comm/reverse") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); else if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_legacy = 1; @@ -652,6 +661,10 @@ void KokkosLMP::accelerator(int narg, char **arg) if (forward_fix_comm_legacy == 0) { forward_fix_comm_legacy = 1; forward_fix_comm_changed = 1; + } + if (reverse_fix_comm_legacy == 0) { + reverse_fix_comm_legacy = 1; + reverse_fix_comm_changed = 1; } if (reverse_comm_legacy == 0 && reverse_comm_on_host == 0) { reverse_comm_legacy = 1; diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index 0dee4dd650f..4e3af0bdbcd 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -32,6 +32,7 @@ class KokkosLMP : protected Pointers { int forward_pair_comm_legacy; int reverse_pair_comm_legacy; int forward_fix_comm_legacy; + int reverse_fix_comm_legacy; int reverse_comm_legacy; int sort_legacy; int atom_map_legacy; @@ -43,6 +44,7 @@ class KokkosLMP : protected Pointers { int forward_pair_comm_changed; int reverse_pair_comm_changed; int forward_fix_comm_changed; + int reverse_fix_comm_changed; int reverse_comm_changed; int sort_changed; int atom_map_changed; diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h index b931f2d2623..c00d7f66008 100644 --- a/src/KOKKOS/kokkos_base.h +++ b/src/KOKKOS/kokkos_base.h @@ -41,6 +41,12 @@ class KokkosBase { int, int *) {return 0;}; virtual void unpack_forward_comm_fix_kokkos(int, int, DAT::tdual_double_1d &) {} + + virtual int pack_reverse_comm_fix_kokkos(int, int, DAT::tdual_double_1d &) {return 0;}; + virtual void unpack_reverse_comm_fix_kokkos(int, DAT::tdual_int_2d, + int, DAT::tdual_double_1d &) {} + + virtual int pack_exchange_kokkos(const int & /*nsend*/, DAT::tdual_double_2d_lr & /*k_buf*/, DAT::tdual_int_1d /*k_sendlist*/, DAT::tdual_int_1d /*k_copylist*/, diff --git a/src/fix.cpp b/src/fix.cpp index 88d467e8d8e..dd77430a769 100644 --- a/src/fix.cpp +++ b/src/fix.cpp @@ -111,7 +111,7 @@ Fix::Fix(LAMMPS *lmp, int /*narg*/, char **arg) : datamask_modify = ALL_MASK; kokkosable = copymode = 0; - forward_comm_device = exchange_comm_device = sort_device = 0; + forward_comm_device = reverse_comm_device = exchange_comm_device = sort_device = 0; fuse_integrate_flag = 0; } diff --git a/src/fix.h b/src/fix.h index da65e469397..685399109a3 100644 --- a/src/fix.h +++ b/src/fix.h @@ -132,6 +132,7 @@ class Fix : protected Pointers { int kokkosable; // 1 if Kokkos fix int forward_comm_device; // 1 if forward comm on Device + int reverse_comm_device; // 1 if reverse comm on Device int exchange_comm_device; // 1 if exchange comm on Device int fuse_integrate_flag; // 1 if can fuse initial integrate with final integrate int sort_device; // 1 if sort on Device From 098cc4318fa1441bdc357504f16063d96d4211b1 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 17 Feb 2026 07:22:39 -0800 Subject: [PATCH 03/14] Update fix_qeq_reaxff.rst and package.rst for the new `matrix_free` keyword and the new `comm/fix/reverse` command line argument --- doc/src/fix_qeq_reaxff.rst | 12 +++++++++++- doc/src/package.rst | 14 ++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/doc/src/fix_qeq_reaxff.rst b/doc/src/fix_qeq_reaxff.rst index 74411383968..a68b7e54c9b 100644 --- a/doc/src/fix_qeq_reaxff.rst +++ b/doc/src/fix_qeq_reaxff.rst @@ -24,10 +24,11 @@ Syntax .. parsed-literal:: - keyword = *dual* or *maxiter* or *nowarn* + keyword = *dual* or *maxiter* or *nowarn* or *matrix_free* *dual* = process S and T matrix in parallel (only for qeq/reaxff/omp) *maxiter* N = limit the number of iterations to *N* *nowarn* = do not print a warning message if the maximum number of iterations was reached + *matrix_free* = use a matrix-free approach for applying the H matrix (only for qeq/reaxff/kk) Examples """""""" @@ -90,6 +91,15 @@ same fixed number of QEq iterations is desired, which can be achieved by using a very small tolerance and setting *maxiter* to the desired number of iterations. +The optional *matrix_free* keyword replaces the sequence of +explicitly constructing the H matrix then (repeatedly) applying it +with a matrix-free approach where the H matrix is effectively +regenerated each time it is applied. This trades performance for +reduced memory requirements because it avoids the overheads of +storing the matrix. This is only supported for the *qeq/reaxff/kk* +style and will work as expected for both full and half qeq neighbor +lists. + .. note:: In order to solve the self-consistent equations for electronegativity diff --git a/doc/src/package.rst b/doc/src/package.rst index eefecac4fd4..5a7b420e6a5 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -79,7 +79,7 @@ Syntax *no_affinity* values = none *kokkos* args = keyword value ... zero or more keyword/value pairs may be appended - keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *sort* or *atom/map* or *gpu/aware* or *pair/only* + keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *neigh/transpose* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* or *comm/pair/forward* or *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *comm/fix/reverse* or *sort* or *atom/map* or *gpu/aware* or *pair/only* *neigh* value = *full* or *half* full = full neighbor list half = half neighbor list built in thread-safe manner @@ -98,7 +98,7 @@ Syntax *binsize* value = size size = bin size for neighbor list construction (distance units) *comm* value = *no* or *host* or *device* - use value for comm/exchange and comm/forward and comm/pair/forward and comm/fix/forward and comm/reverse + use value for comm/exchange and comm/forward and comm/pair/forward and comm/fix/forward and comm/reverse and comm/fix/reverse *comm/exchange* value = *no* or *host* or *device* *comm/forward* value = *no* or *host* or *device* *comm/pair/forward* value = *no* or *device* @@ -110,6 +110,10 @@ Syntax *comm/pair/reverse* value = *no* or *device* *no* = perform communication pack/unpack in non-KOKKOS mode *device* = perform pack/unpack on device (e.g. on GPU) + *comm/fix/reverse* value = *no** or *host* or *device* + *no* = perform communication pack/unpack in non-KOKKOS mode + *host* = perform pack/unpack on host (e.g. with OpenMP threading) + *device* = perform pack/unpack on device (e.g. on GPU) *sort* value = *no* or *device* *no* = perform atom sorting in non-KOKKOS mode *device* = perform atom sorting on device (e.g. on GPU) @@ -545,7 +549,8 @@ rule of thumb may give too large a binsize and the default should be overridden with a smaller value. The *comm* and *comm/exchange* and *comm/forward* and *comm/pair/forward* -and *comm/fix/forward* and *comm/reverse* and *comm/pair/reverse* +and *comm/fix/forward* and *comm/reverse* and *comm/pair/reverse* and +*comm/fix/reverse* keywords determine whether the host or device performs the packing and unpacking of data when communicating per-atom data between processors. "Exchange" communication happens only on timesteps that neighbor lists @@ -573,7 +578,8 @@ keywords, if a value of *host* is used it will be automatically be changed to *no* since these keywords don't support *host* mode. The value of *no* will also always be used when running on the CPU, i.e. setting the value to *device* will have no effect if the pair/fix style is -running on the CPU. For the *comm/fix/forward* or *comm/pair/reverse* +running on the CPU. For the *comm/fix/forward* or *comm/pair/reverse* or +*comm/fix/reverse* keywords, not all styles support *device* mode and in that case will run in *no* mode instead. From 6f83af4a46fd67a6b47c8b3b7455886242096318 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 17 Feb 2026 10:45:49 -0800 Subject: [PATCH 04/14] Updated contributing authors lists --- src/KOKKOS/comm_kokkos.cpp | 5 +++++ src/KOKKOS/fix_qeq_reaxff_kokkos.cpp | 2 ++ 2 files changed, 7 insertions(+) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index a4f1eea9968..be58761a408 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -12,6 +12,11 @@ See the README file in the top-level LAMMPS directory. ------------------------------------------------------------------------- */ +/* ---------------------------------------------------------------------- + Contributing authors: Christian Trott (SNL), Stan Moore (SNL), + Lewis Russell (U. Strathclyde), Balint Joo (NVIDIA) +------------------------------------------------------------------------- */ + #include "comm_kokkos.h" #include "atom.h" diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index f0480bdb3e3..d9786be565b 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -23,6 +23,8 @@ - Fused the CG solve for "S" and "T" matrices - Improved the SpMV algorithm by using vector instead of team level parallelism on GPUs + Evan Weinberg (NVIDIA): Matrix-free representation + Balint Joo (NVIDIA): Device-resident reverse comms ------------------------------------------------------------------------- */ #include "fix_qeq_reaxff_kokkos.h" From a13673c012cc6c7007f3c8d9b08f69dd6c78e4f3 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Tue, 17 Feb 2026 11:04:54 -0800 Subject: [PATCH 05/14] Fix whitespace --- doc/src/package.rst | 2 +- src/KOKKOS/comm_kokkos.cpp | 5 ++--- src/KOKKOS/fix_qeq_reaxff_kokkos.cpp | 14 ++++++++------ src/KOKKOS/fix_qeq_reaxff_kokkos.h | 2 +- src/KOKKOS/kokkos.cpp | 2 +- 5 files changed, 13 insertions(+), 12 deletions(-) diff --git a/doc/src/package.rst b/doc/src/package.rst index 5a7b420e6a5..d5bb2eda7b7 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -550,7 +550,7 @@ overridden with a smaller value. The *comm* and *comm/exchange* and *comm/forward* and *comm/pair/forward* and *comm/fix/forward* and *comm/reverse* and *comm/pair/reverse* and -*comm/fix/reverse* +*comm/fix/reverse* keywords determine whether the host or device performs the packing and unpacking of data when communicating per-atom data between processors. "Exchange" communication happens only on timesteps that neighbor lists diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index be58761a408..723d79c2d63 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -450,11 +450,10 @@ void CommKokkos::forward_comm_device(Fix *fix, int size) void CommKokkos::reverse_comm(Fix *fix, int size) { - if(fix->execution_space == Host || fix->execution_space == HostKK || !fix->reverse_comm_device || reverse_fix_comm_legacy ) { + if(fix->execution_space == Host || fix->execution_space == HostKK || !fix->reverse_comm_device || reverse_fix_comm_legacy ) { k_sendlist.sync_host(); CommBrick::reverse_comm(fix, size); - } - else { + } else { k_sendlist.sync_device(); reverse_comm_device(fix,size); } diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index d9786be565b..124402bbcc0 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -1138,7 +1138,7 @@ void FixQEqReaxFFKokkos::unpack_forward_comm(int n, int first, doubl } /* ---------------------------------------------------------------------- */ -template +template int FixQEqReaxFFKokkos::pack_reverse_comm_kokkos(int n, int first_in, DAT::tdual_double_1d &buf) { first = first_in; @@ -1149,7 +1149,7 @@ int FixQEqReaxFFKokkos::pack_reverse_comm_kokkos(int n, int first_in template KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::operator()(TagQEqPackReverseComm, const int &i) const +void FixQEqReaxFFKokkos::operator()(TagQEqPackReverseComm, const int &i) const { //const auto d = d_o(i+first); if (!( converged & 1)) d_buf[2*i] = d_o(i+first,0); @@ -1157,17 +1157,18 @@ void FixQEqReaxFFKokkos::operator()(TagQEqPackReverseComm, const int } /* ---------------------------------------------------------------------- */ -template -void FixQEqReaxFFKokkos::unpack_reverse_comm_kokkos(int n, DAT::tdual_int_1d k_sendlist, DAT::tdual_double_1d& buf) + +template +void FixQEqReaxFFKokkos::unpack_reverse_comm_kokkos(int n, DAT::tdual_int_1d k_sendlist, DAT::tdual_double_1d& buf) { d_buf = buf.view(); d_sendlist = k_sendlist.view(); Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); -} +} template KOKKOS_INLINE_FUNCTION -void FixQEqReaxFFKokkos::operator()(TagQEqUnpackReverseComm, const int &i) const +void FixQEqReaxFFKokkos::operator()(TagQEqUnpackReverseComm, const int &i) const { int j = d_sendlist(i); @@ -1177,6 +1178,7 @@ void FixQEqReaxFFKokkos::operator()(TagQEqUnpackReverseComm, const i } /* ---------------------------------------------------------------------- */ + template int FixQEqReaxFFKokkos::pack_reverse_comm(int n, int first, double *buf) { diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.h b/src/KOKKOS/fix_qeq_reaxff_kokkos.h index aced8c082eb..9129f13fdd0 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.h +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.h @@ -204,7 +204,7 @@ class FixQEqReaxFFKokkos : public FixQEqReaxFF, public KokkosBase { int pack_reverse_comm_kokkos(int, int, DAT::tdual_double_1d&) override; int pack_reverse_comm(int, int, double *) override; - void unpack_reverse_comm_kokkos(int, DAT::tdual_int_1d, DAT::tdual_double_1d&) override; + void unpack_reverse_comm_kokkos(int, DAT::tdual_int_1d, DAT::tdual_double_1d&) override; void unpack_reverse_comm(int, int *, double *) override; double memory_usage() override; void sparse_matvec_kokkos(typename AT::t_kkfloat_1d_2 &); diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 2ea9612d673..24b04fbc23a 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -68,7 +68,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) forward_pair_comm_changed = 0; reverse_pair_comm_changed = 0; forward_fix_comm_changed = 0; - reverse_fix_comm_changed = 0; + reverse_fix_comm_changed = 0; reverse_comm_changed = 0; sort_changed = atom_map_changed = 0; From 7d31bb494c94e59c7ea63cf59cc8b2d0c43ee662 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 18 Feb 2026 08:36:41 -0700 Subject: [PATCH 06/14] whitespace --- src/KOKKOS/comm_kokkos.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 723d79c2d63..7e6ea556ab3 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -450,7 +450,8 @@ void CommKokkos::forward_comm_device(Fix *fix, int size) void CommKokkos::reverse_comm(Fix *fix, int size) { - if(fix->execution_space == Host || fix->execution_space == HostKK || !fix->reverse_comm_device || reverse_fix_comm_legacy ) { + if (fix->execution_space == Host || fix->execution_space == HostKK || + !fix->reverse_comm_device || reverse_fix_comm_legacy) { k_sendlist.sync_host(); CommBrick::reverse_comm(fix, size); } else { From 5e909e0d79df87393a9ca6facb96eedb1831579c Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 18 Feb 2026 08:41:52 -0700 Subject: [PATCH 07/14] Add missing code block to reset reverse_fix_comm flags --- src/KOKKOS/comm_kokkos.cpp | 3 ++- src/KOKKOS/kokkos.cpp | 19 ++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 7e6ea556ab3..02c18a4cac1 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -358,7 +358,8 @@ void CommKokkos::reverse_comm_device() void CommKokkos::forward_comm(Fix *fix, int size) { - if (fix->execution_space == Host || fix->execution_space == HostKK || !fix->forward_comm_device || forward_fix_comm_legacy) { + if (fix->execution_space == Host || fix->execution_space == HostKK || + !fix->forward_comm_device || forward_fix_comm_legacy) { k_sendlist.sync_host(); CommBrick::forward_comm(fix, size); } else { diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 24b04fbc23a..da76aad75d2 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -288,7 +288,8 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) newtonflag = 0; exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 0; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 0; sort_legacy = 0; atom_map_legacy = 0; @@ -304,7 +305,8 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) newtonflag = 1; exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 1; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; sort_legacy = 1; atom_map_legacy = 1; @@ -489,17 +491,20 @@ void KokkosLMP::accelerator(int narg, char **arg) if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 1; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else if (strcmp(arg[iarg+1],"host") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1; } else if (strcmp(arg[iarg+1],"device") == 0) { exchange_comm_legacy = forward_comm_legacy = reverse_comm_legacy = 0; - forward_pair_comm_legacy = reverse_pair_comm_legacy = forward_fix_comm_legacy = reverse_fix_comm_legacy = 0; + forward_pair_comm_legacy = reverse_pair_comm_legacy = + forward_fix_comm_legacy = reverse_fix_comm_legacy = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); @@ -706,6 +711,10 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_fix_comm_legacy = 0; forward_fix_comm_changed = 0; } + if (reverse_fix_comm_changed) { + reverse_fix_comm_legacy = 0; + reverse_fix_comm_changed = 0; + } if (reverse_comm_changed) { reverse_comm_legacy = 0; reverse_comm_changed = 0; From aba596855a04d9e0bcfea275025b9342d9997cb2 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 18 Feb 2026 08:49:49 -0700 Subject: [PATCH 08/14] more whitespace --- src/KOKKOS/fix_qeq_reaxff_kokkos.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index 124402bbcc0..1247ae315f2 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -362,8 +362,8 @@ void FixQEqReaxFFKokkos::allocate_array() if (efield) get_chi_field(); Kokkos::parallel_for(Kokkos::RangePolicy(0,nn),*this); - } + /* ---------------------------------------------------------------------- */ template @@ -809,7 +809,6 @@ void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Half_MatrixF } } - /* ---------------------------------------------------------------------- */ // matrix-free form of the sparse mat-vec @@ -1152,8 +1151,8 @@ KOKKOS_INLINE_FUNCTION void FixQEqReaxFFKokkos::operator()(TagQEqPackReverseComm, const int &i) const { //const auto d = d_o(i+first); - if (!( converged & 1)) d_buf[2*i] = d_o(i+first,0); - if (!( converged & 2)) d_buf[2*i+1] = d_o(i+first,1); + if (!(converged & 1)) d_buf[2*i] = d_o(i+first,0); + if (!(converged & 2)) d_buf[2*i+1] = d_o(i+first,1); } /* ---------------------------------------------------------------------- */ @@ -1172,9 +1171,8 @@ void FixQEqReaxFFKokkos::operator()(TagQEqUnpackReverseComm, const i { int j = d_sendlist(i); - if ( !(converged & 1)) d_o(j,0) += d_buf[2*i]; - if ( !(converged & 2)) d_o(j,1) += d_buf[2*i+1]; - + if (!(converged & 1)) d_o(j,0) += d_buf[2*i]; + if (!(converged & 2)) d_o(j,1) += d_buf[2*i+1]; } /* ---------------------------------------------------------------------- */ From fa940591400852f81084130ca6de9ce279b71e33 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 18 Feb 2026 09:11:35 -0700 Subject: [PATCH 09/14] Remove extra fence --- src/KOKKOS/comm_kokkos.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 02c18a4cac1..e65986f9cc4 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -528,7 +528,6 @@ void CommKokkos::reverse_comm_device(Fix *fix, int size) // unpack buffer auto k_sendlist_iswap = Kokkos::subview(k_sendlist,iswap,Kokkos::ALL); fixKKBase->unpack_reverse_comm_kokkos(sendnum[iswap], k_sendlist_iswap,k_buf_tmp); - DeviceType().fence(); } } From fa4e558889e29e855e1cd5aabe486c02fe7f2218 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 18 Feb 2026 08:22:15 -0800 Subject: [PATCH 10/14] Keyword replacement: matrix_free -> matfree --- doc/src/fix_qeq_reaxff.rst | 6 +++--- src/REAXFF/fix_qeq_reaxff.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/src/fix_qeq_reaxff.rst b/doc/src/fix_qeq_reaxff.rst index a68b7e54c9b..23482485613 100644 --- a/doc/src/fix_qeq_reaxff.rst +++ b/doc/src/fix_qeq_reaxff.rst @@ -24,11 +24,11 @@ Syntax .. parsed-literal:: - keyword = *dual* or *maxiter* or *nowarn* or *matrix_free* + keyword = *dual* or *maxiter* or *nowarn* or *matfree* *dual* = process S and T matrix in parallel (only for qeq/reaxff/omp) *maxiter* N = limit the number of iterations to *N* *nowarn* = do not print a warning message if the maximum number of iterations was reached - *matrix_free* = use a matrix-free approach for applying the H matrix (only for qeq/reaxff/kk) + *matfree* = use a matrix-free approach for applying the H matrix (only for qeq/reaxff/kk) Examples """""""" @@ -91,7 +91,7 @@ same fixed number of QEq iterations is desired, which can be achieved by using a very small tolerance and setting *maxiter* to the desired number of iterations. -The optional *matrix_free* keyword replaces the sequence of +The optional *matfree* keyword replaces the sequence of explicitly constructing the H matrix then (repeatedly) applying it with a matrix-free approach where the H matrix is effectively regenerated each time it is applied. This trades performance for diff --git a/src/REAXFF/fix_qeq_reaxff.cpp b/src/REAXFF/fix_qeq_reaxff.cpp index 8d79544f28b..6dd861837ed 100644 --- a/src/REAXFF/fix_qeq_reaxff.cpp +++ b/src/REAXFF/fix_qeq_reaxff.cpp @@ -96,7 +96,7 @@ FixQEqReaxFF::FixQEqReaxFF(LAMMPS *lmp, int narg, char **arg) : while (iarg < narg) { if (strcmp(arg[iarg],"dual") == 0) dual_enabled = 1; else if (strcmp(arg[iarg],"nowarn") == 0) maxwarn = 0; - else if (strcmp(arg[iarg],"matrix_free") == 0) matrix_free = 1; + else if (strcmp(arg[iarg],"matfree") == 0) matrix_free = 1; else if (strcmp(arg[iarg],"maxiter") == 0) { if (iarg+1 > narg-1) error->all(FLERR, iarg, "Illegal fix {} command", style); From da690cebb788031c807c675385960134b734d620 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 18 Feb 2026 08:48:14 -0800 Subject: [PATCH 11/14] Fixed identification of qeq/reax/kk --- src/KOKKOS/fix_qeq_reaxff_kokkos.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index 1247ae315f2..839ae242a86 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -79,7 +79,7 @@ FixQEqReaxFFKokkos(LAMMPS *lmp, int narg, char **arg) : converged = 0; if (comm->me == 0 && matrix_free) - utils::logmesg(lmp, "Using matrix-free form for fix/qeq/kk\n"); + utils::logmesg(lmp, "Using matrix-free form for fix qeq/reax/kk\n"); } /* ---------------------------------------------------------------------- */ From b6013f1ae6fbb91e4259045420c32a6a8b6da535 Mon Sep 17 00:00:00 2001 From: Evan Weinberg Date: Wed, 18 Feb 2026 09:08:58 -0800 Subject: [PATCH 12/14] Base class signature fix for unpack_reverse_comm_fix_kokkos (thanks copilot!) --- src/KOKKOS/kokkos_base.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h index c00d7f66008..6b07a7fd2bc 100644 --- a/src/KOKKOS/kokkos_base.h +++ b/src/KOKKOS/kokkos_base.h @@ -43,7 +43,7 @@ class KokkosBase { virtual int pack_reverse_comm_fix_kokkos(int, int, DAT::tdual_double_1d &) {return 0;}; - virtual void unpack_reverse_comm_fix_kokkos(int, DAT::tdual_int_2d, + virtual void unpack_reverse_comm_fix_kokkos(int, DAT::tdual_int_1d, int, DAT::tdual_double_1d &) {} From 82a0f07f6e225f274f0b7d92e9d87682854eba66 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 18 Feb 2026 10:41:06 -0700 Subject: [PATCH 13/14] Small doc tweak --- doc/src/fix_qeq_reaxff.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/doc/src/fix_qeq_reaxff.rst b/doc/src/fix_qeq_reaxff.rst index 23482485613..a536fc31181 100644 --- a/doc/src/fix_qeq_reaxff.rst +++ b/doc/src/fix_qeq_reaxff.rst @@ -92,13 +92,12 @@ by using a very small tolerance and setting *maxiter* to the desired number of iterations. The optional *matfree* keyword replaces the sequence of -explicitly constructing the H matrix then (repeatedly) applying it +explicitly constructing the H matrix, then (repeatedly) applying it with a matrix-free approach where the H matrix is effectively regenerated each time it is applied. This trades performance for reduced memory requirements because it avoids the overheads of storing the matrix. This is only supported for the *qeq/reaxff/kk* -style and will work as expected for both full and half qeq neighbor -lists. +style, with both full and half qeq neighbor lists supported. .. note:: From 8581d85e15a6421de560ac9f853f68f99b4e25e5 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Wed, 18 Feb 2026 10:41:20 -0700 Subject: [PATCH 14/14] Small changes suggested by copilot --- doc/src/package.rst | 2 +- src/KOKKOS/fix_qeq_reaxff_kokkos.cpp | 2 -- src/KOKKOS/fix_qeq_reaxff_kokkos.h | 2 +- src/KOKKOS/kokkos.cpp | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/doc/src/package.rst b/doc/src/package.rst index d5bb2eda7b7..f8f7b86ddd1 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -110,7 +110,7 @@ Syntax *comm/pair/reverse* value = *no* or *device* *no* = perform communication pack/unpack in non-KOKKOS mode *device* = perform pack/unpack on device (e.g. on GPU) - *comm/fix/reverse* value = *no** or *host* or *device* + *comm/fix/reverse* value = *no* or *host* or *device* *no* = perform communication pack/unpack in non-KOKKOS mode *host* = perform pack/unpack on host (e.g. with OpenMP threading) *device* = perform pack/unpack on device (e.g. on GPU) diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp index 839ae242a86..73202ba4279 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.cpp @@ -825,7 +825,6 @@ void FixQEqReaxFFKokkos::operator()(TagQEqSparseMatvec2_Full_MatrixF const KK_FLOAT ytmp = x(i,1); const KK_FLOAT ztmp = x(i,2); const int itype = type(i); - const tagint itag = tag(i); const int jnum = d_numneigh[i]; KK_FLOAT2 doi = KK_FLOAT2(); @@ -1150,7 +1149,6 @@ template KOKKOS_INLINE_FUNCTION void FixQEqReaxFFKokkos::operator()(TagQEqPackReverseComm, const int &i) const { - //const auto d = d_o(i+first); if (!(converged & 1)) d_buf[2*i] = d_o(i+first,0); if (!(converged & 2)) d_buf[2*i+1] = d_o(i+first,1); } diff --git a/src/KOKKOS/fix_qeq_reaxff_kokkos.h b/src/KOKKOS/fix_qeq_reaxff_kokkos.h index 9129f13fdd0..34c00036e6d 100644 --- a/src/KOKKOS/fix_qeq_reaxff_kokkos.h +++ b/src/KOKKOS/fix_qeq_reaxff_kokkos.h @@ -385,7 +385,7 @@ struct FixQEqReaxFFKokkosNeighborFunctor { d_val(qeqreax.d_val), d_shield(qeqreax.d_shield), d_tap(qeqreax.d_tap), d_mfill_offset(qeqreax.d_mfill_offset), nlocal(qeqreax.nlocal), nn(qeqreax.nn), nmax(qeqreax.nmax), groupbit(qeqreax.groupbit), neighflag(qeqreax.neighflag == FULL ? FULL : HALF), cutsq(qeqreax.cutsq) - { ; } + {} void update_after_allocation(const FixQEqReaxFFKokkos &qeqreax); diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index da76aad75d2..87db605acc9 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -667,7 +667,7 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_fix_comm_legacy = 1; forward_fix_comm_changed = 1; } - if (reverse_fix_comm_legacy == 0) { + if (reverse_fix_comm_legacy == 0) { reverse_fix_comm_legacy = 1; reverse_fix_comm_changed = 1; }