From 12048dbe497ca8f04b240118135b6b6a78623b52 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Tue, 3 Mar 2026 16:32:36 -0700 Subject: [PATCH 01/25] Allow specifying inner loops ranges as min/max index --- components/omega/src/infra/OmegaKokkos.h | 41 +++++++++++++++++++----- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h index a7ab8bca89da..7e582da1ff41 100644 --- a/components/omega/src/infra/OmegaKokkos.h +++ b/components/omega/src/infra/OmegaKokkos.h @@ -355,11 +355,18 @@ inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor, } // parallelForInner + +template +KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int MinIndex, + int MaxIndex, F &&Functor) { + const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); + Kokkos::parallel_for(Policy, std::forward(Functor)); +} + template KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound, F &&Functor) { - const auto Policy = TeamThreadRange(Team, UpperBound); - Kokkos::parallel_for(Policy, std::forward(Functor)); + parallelForInner(Team, 0, UpperBound - 1, std::forward(Functor)); } // This struct is used to get the right accumulator type to be used in @@ -413,23 +420,41 @@ inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor, } // parallelReduceInner + template -KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound, - F &&Functor, R &&...Reducers) { - const auto Policy = TeamThreadRange(Team, UpperBound); +KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex, + int MaxIndex, F &&Functor, + R &&...Reducers) { + const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); Kokkos::parallel_reduce(Policy, std::forward(Functor), std::forward(Reducers)...); } +template +KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound, + F &&Functor, R &&...Reducers) { + parallelReduceInner(Team, 0, UpperBound - 1, std::forward(Functor), + std::forward(Reducers)...); +} + // parallelScanInner + template -KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound, - F &&Functor, R &&...Reducers) { - const auto Policy = TeamThreadRange(Team, UpperBound); +KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex, + int MaxIndex, F &&Functor, + R &&...Reducers) { + const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); Kokkos::parallel_scan(Policy, std::forward(Functor), std::forward(Reducers)...); } +template +KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound, + F &&Functor, R &&...Reducers) { + parallelScanInner(Team, 0, UpperBound - 1, std::forward(Functor), + std::forward(Reducers)...); +} + } // end namespace OMEGA //===----------------------------------------------------------------------===// From 92bae1fe4b9dadc652f9bc66efa5888f0341e9f9 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Tue, 3 Mar 2026 16:33:18 -0700 Subject: [PATCH 02/25] Test new inner loops forms --- .../omega/test/infra/OmegaKokkosHiParTest.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index c7793bd13cc7..bdb605c70f36 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -355,7 +355,7 @@ Error testHiparFor2DFor1D(int N1, int N2) { HostArray3DI4 RefAH("RefA3H", N1, N2, N3); for (int J1 = 0; J1 < N1; ++J1) { for (int J2 = 0; J2 < N2; ++J2) { - for (int J3 = 0; J3 < J1 + J2; ++J3) { + for (int J3 = J1; J3 <= J1 + J2; ++J3) { RefAH(J1, J2, J3) = f3(J1, J2, J3, N1, N2, N3); } } @@ -365,7 +365,7 @@ Error testHiparFor2DFor1D(int N1, int N2) { parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { parallelForInner( - Team, J1 + J2, INNER_LAMBDA(int J3) { + Team, J1, J1 + J2, INNER_LAMBDA(int J3) { A(J1, J2, J3) = f3(J1, J2, J3, N1, N2, N3); }); }); @@ -389,7 +389,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) { for (int J2 = 0; J2 < N2; ++J2) { I4 Sum = 0; I4 Max = std::numeric_limits::min(); - for (int J3 = 0; J3 < J1 + J2; ++J3) { + for (int J3 = J1; J3 <= J1 + J2; ++J3) { Sum += f3(J1, J2, J3, N1, N2, N3); Max = std::max(Max, f3(J1, J2, J3, N1, N2, N3)); } @@ -404,7 +404,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) { {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { I4 Sum; parallelReduceInner( - Team, J1 + J2, + Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &Accum) { Accum += f3(J1, J2, J3, N1, N2, N3); }, @@ -413,7 +413,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) { I4 Max; parallelReduceInner( - Team, J1 + J2, + Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &Accum) { Accum = Kokkos::max(Accum, f3(J1, J2, J3, N1, N2, N3)); }, @@ -437,7 +437,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) { {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { I4 Sum, Max; parallelReduceInner( - Team, J1 + J2, + Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &AccumSum, I4 &AccumMax) { AccumSum += f3(J1, J2, J3, N1, N2, N3); AccumMax = Kokkos::max(AccumMax, f3(J1, J2, J3, N1, N2, N3)); @@ -464,7 +464,7 @@ Error testHiparFor2DScan1D(int N1, int N2) { for (int J1 = 0; J1 < N1; ++J1) { for (int J2 = 0; J2 < N2; ++J2) { I4 RSum = 0; - for (int J3 = 0; J3 < J1 + J2; ++J3) { + for (int J3 = J1; J3 <= J1 + J2; ++J3) { RefRSumH(J1, J2, J3) = RSum; RSum += f3(J1, J2, J3, N1, N2, N3); } @@ -475,7 +475,7 @@ Error testHiparFor2DScan1D(int N1, int N2) { parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { parallelScanInner( - Team, J1 + J2, INNER_LAMBDA(int J3, I4 &Accum, bool IsFinal) { + Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &Accum, bool IsFinal) { if (IsFinal) { RSum(J1, J2, J3) = Accum; } @@ -500,7 +500,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) { I4 RefMax = std::numeric_limits::min(); for (int J1 = 0; J1 < N1; ++J1) { for (int J2 = 0; J2 < N2; ++J2) { - for (int J3 = 0; J3 < J1 + J2; ++J3) { + for (int J3 = J1; J3 <= J1 + J2; ++J3) { RefSum += f3(J1, J2, J3, N1, N2, N3); RefMax = std::max(RefMax, f3(J1, J2, J3, N1, N2, N3)); } @@ -513,7 +513,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) { KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team, I4 &AccumOuter) { I4 SumInner; parallelReduceInner( - Team, J1 + J2, + Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &AccumInner) { AccumInner += f3(J1, J2, J3, N1, N2, N3); }, @@ -534,7 +534,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) { KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team, I4 &AccumOuter) { I4 MaxInner; parallelReduceInner( - Team, J1 + J2, + Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &AccumInner) { AccumInner = Kokkos::max(AccumInner, f3(J1, J2, J3, N1, N2, N3)); @@ -556,7 +556,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) { I4 &AccumMaxOuter) { I4 SumInner, MaxInner; parallelReduceInner( - Team, J1 + J2, + Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &AccumSumInner, I4 &AccumMaxInner) { AccumSumInner += f3(J1, J2, J3, N1, N2, N3); AccumMaxInner = From cdb41e5a9514e20dc9f114c39cc85cac747d3df0 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Fri, 13 Mar 2026 15:16:12 -0600 Subject: [PATCH 03/25] Split OmegaKokkos.h --- components/omega/src/infra/OmegaKokkos.h | 358 +++--------------- .../omega/src/infra/OmegaKokkosFlatPar.h | 115 ++++++ components/omega/src/infra/OmegaKokkosHiPar.h | 176 +++++++++ 3 files changed, 345 insertions(+), 304 deletions(-) create mode 100644 components/omega/src/infra/OmegaKokkosFlatPar.h create mode 100644 components/omega/src/infra/OmegaKokkosHiPar.h diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h index 7e582da1ff41..c8aacd9d91ed 100644 --- a/components/omega/src/infra/OmegaKokkos.h +++ b/components/omega/src/infra/OmegaKokkos.h @@ -1,13 +1,13 @@ #ifndef OMEGA_KOKKOS_H #define OMEGA_KOKKOS_H -//===-- base/OmegaKokkos.h - Omega extension of Kokkos ------*- C++ -*-===// +//===-- infra/OmegaKokkos.h - Omega extension of Kokkos ------*- C++ -*-===// // /// \file /// \brief Extends Kokkos for Omega /// /// This header extends Kokkos for Omega. // -//===----------------------------------------------------------------------===// +//===-------------------------------------------------------------------===// #include "DataTypes.h" #include "Error.h" @@ -19,6 +19,9 @@ namespace OMEGA { #define OMEGA_SCOPE(a, b) auto &a = b +using ExecSpace = MemSpace::execution_space; +using HostExecSpace = HostMemSpace::execution_space; + /// An enum is used to provide a shorthand for determining the type of /// field. These correspond to the supported Omega data types (Real will be /// identical to R4 or R8 depending on settings) @@ -70,23 +73,50 @@ template struct ArrayRank { static constexpr bool Is5D = T::rank == 5; }; -using ExecSpace = MemSpace::execution_space; -using HostExecSpace = HostMemSpace::execution_space; -using TeamPolicy = Kokkos::TeamPolicy; -using TeamMember = TeamPolicy::member_type; -using ScratchMemSpace = ExecSpace::scratch_memory_space; -using Kokkos::MemoryUnmanaged; -using Kokkos::PerTeam; -using Kokkos::TeamThreadRange; -using RealScratchArray = - Kokkos::View; - -/// team_size for hierarchical parallelism -#ifdef OMEGA_TARGET_DEVICE -constexpr int OMEGA_TEAMSIZE = 64; -#else -constexpr int OMEGA_TEAMSIZE = 1; -#endif +template +auto createHostMirrorCopy(const V &View) + -> Kokkos::View { + return Kokkos::create_mirror_view_and_copy(HostExecSpace(), View); +} + +template +auto createDeviceMirrorCopy(const V &View) + -> Kokkos::View { + return Kokkos::create_mirror_view_and_copy(ExecSpace(), View); +} + +// function alias to follow Camel Naming Convention +template void deepCopy(D &&Dst, S &&Src) { + Kokkos::deep_copy(std::forward(Dst), std::forward(Src)); +} + +template +void deepCopy(E &Space, D &Dst, const S &Src) { + Kokkos::deep_copy(Space, Dst, Src); +} + +// Check if two arrays are identical +template +bool arraysEqual(const ArrayTypeA &A, const ArrayTypeB &B) { + OMEGA_REQUIRE(A.span_is_contiguous() && B.span_is_contiguous(), + "arraysEqual works only for contiguous arrays"); + OMEGA_REQUIRE(A.size() == B.size(), + "arrayEqual can only compare arrays of equal size"); + + // This is a debug utility and not performance critical + // so just copy to the host and compare there + const auto AH = createHostMirrorCopy(A); + const auto BH = createHostMirrorCopy(B); + + bool Equal = true; + for (size_t I = 0; I < AH.size(); I++) { + if (AH.data()[I] != BH.data()[I]) { + Equal = false; + break; + } + } + return Equal; +} // Takes a functor that uses multidimensional indexing // and converts it into one that also accepts linear index @@ -169,293 +199,13 @@ template struct LinearIdxWrapper : F { #endif }; -template -auto createHostMirrorCopy(const V &View) - -> Kokkos::View { - return Kokkos::create_mirror_view_and_copy(HostExecSpace(), View); -} - -template -auto createDeviceMirrorCopy(const V &View) - -> Kokkos::View { - return Kokkos::create_mirror_view_and_copy(ExecSpace(), View); -} - -// function alias to follow Camel Naming Convention -template void deepCopy(D &&Dst, S &&Src) { - Kokkos::deep_copy(std::forward(Dst), std::forward(Src)); -} - -template -void deepCopy(E &Space, D &Dst, const S &Src) { - Kokkos::deep_copy(Space, Dst, Src); -} - -// Check if two arrays are identical -template -bool arraysEqual(const ArrayTypeA &A, const ArrayTypeB &B) { - OMEGA_REQUIRE(A.span_is_contiguous() && B.span_is_contiguous(), - "arraysEqual works only for contiguous arrays"); - OMEGA_REQUIRE(A.size() == B.size(), - "arrayEqual can only compare arrays of equal size"); - - // This is a debug utility and not performance critical - // so just copy to the host and compare there - const auto AH = createHostMirrorCopy(A); - const auto BH = createHostMirrorCopy(B); - - bool Equal = true; - for (size_t I = 0; I < AH.size(); I++) { - if (AH.data()[I] != BH.data()[I]) { - Equal = false; - break; - } - } - return Equal; -} - -using Bounds1D = Kokkos::RangePolicy>; - -#if OMEGA_LAYOUT_RIGHT - -template -using Bounds = Kokkos::MDRangePolicy< - ExecSpace, Kokkos::Rank, - Kokkos::IndexType>; - -#elif OMEGA_LAYOUT_LEFT - -template -using Bounds = Kokkos::MDRangePolicy< - ExecSpace, Kokkos::Rank, - Kokkos::IndexType>; - -#else - -#error "OMEGA Memory Layout is not defined." - -#endif - -// parallelFor: with label -template -inline void parallelFor(const std::string &Label, const int (&UpperBounds)[N], - F &&Functor) { - if constexpr (N == 1) { - const auto Policy = Bounds1D(0, UpperBounds[0]); - Kokkos::parallel_for(Label, Policy, std::forward(Functor)); - - } else { -#ifdef OMEGA_TARGET_DEVICE - // On device convert the functor to use one dimensional indexing and use - // 1D RangePolicy - auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; - int LinBound = 1; - for (int Rank = 0; Rank < N; ++Rank) { - LinBound *= UpperBounds[Rank]; - } - const auto Policy = Bounds1D(0, LinBound); - Kokkos::parallel_for(Label, Policy, std::move(LinFunctor)); -#else - // On host use MDRangePolicy - const int LowerBounds[N] = {0}; - const auto Policy = Bounds(LowerBounds, UpperBounds); - Kokkos::parallel_for(Label, Policy, std::forward(Functor)); -#endif - } -} - -// parallelFor: without label -template -inline void parallelFor(const int (&UpperBounds)[N], F &&Functor) { - parallelFor("", UpperBounds, std::forward(Functor)); -} - -// parallelReduce: with label -template -inline void parallelReduce(const std::string &Label, - const int (&UpperBounds)[N], F &&Functor, - R &&...Reducers) { - if constexpr (N == 1) { - const auto Policy = Bounds1D(0, UpperBounds[0]); - Kokkos::parallel_reduce(Label, Policy, std::forward(Functor), - std::forward(Reducers)...); - - } else { - -#ifdef OMEGA_TARGET_DEVICE - // On device convert the functor to use one dimensional indexing and use - // 1D RangePolicy - auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; - int LinBound = 1; - for (int Rank = 0; Rank < N; ++Rank) { - LinBound *= UpperBounds[Rank]; - } - const auto Policy = Bounds1D(0, LinBound); - Kokkos::parallel_reduce(Label, Policy, std::move(LinFunctor), - std::forward(Reducers)...); -#else - // On host use MDRangePolicy - const int LowerBounds[N] = {0}; - const auto Policy = Bounds(LowerBounds, UpperBounds); - Kokkos::parallel_reduce(Label, Policy, std::forward(Functor), - std::forward(Reducers)...); -#endif - } -} - -// parallelReduce: without label -template -inline void parallelReduce(const int (&UpperBounds)[N], F &&Functor, - R &&...Reducers) { - parallelReduce("", UpperBounds, std::forward(Functor), - std::forward(Reducers)...); -} - -/// Hierarchical parallelism wrappers - -#define INNER_LAMBDA [=] -// #define INNER_LAMBDA [&] - -KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) { - Team.team_barrier(); -} - -// parallelForOuter: with label -template -inline void parallelForOuter(const std::string &Label, - const int (&UpperBounds)[N], F &&Functor, - int ScratchValsPerTeam = 0) { - - auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; - int LinBound = 1; - for (int Rank = 0; Rank < N; ++Rank) { - LinBound *= UpperBounds[Rank]; - } - - auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE); - - if (ScratchValsPerTeam > 0) { - Policy.set_scratch_size( - 0, Kokkos::PerTeam(ScratchValsPerTeam * sizeof(Real))); - } - - Kokkos::parallel_for( - Label, Policy, KOKKOS_LAMBDA(const TeamMember &Team) { - const int TeamId = Team.league_rank(); - LinFunctor(TeamId, Team); - }); -} - -// parallelForOuter: without label -template -inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor, - int ScratchValsPerTeam = 0) { - parallelForOuter("", UpperBounds, std::forward(Functor), - ScratchValsPerTeam); -} - -// parallelForInner - -template -KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int MinIndex, - int MaxIndex, F &&Functor) { - const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); - Kokkos::parallel_for(Policy, std::forward(Functor)); -} - -template -KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound, - F &&Functor) { - parallelForInner(Team, 0, UpperBound - 1, std::forward(Functor)); -} - -// This struct is used to get the right accumulator type to be used in -// the outer parallel lambda based on the final reduction variable type. -// The final reduction variable can be either a reference to -// an arithmetic type (int&, Real&) or a Kokkos reducer (Kokkos::Max). -// We need to know this type because nvcc does not allow generic lambdas. -template struct AccumTypeHelper; - -template -struct AccumTypeHelper>> { - using Type = T; -}; - -template -struct AccumTypeHelper>> { - using Type = typename T::value_type; -}; - -template using AccumType = typename AccumTypeHelper::Type; - -// parallelReduceOuter: with label -template -inline void parallelReduceOuter(const std::string &Label, - const int (&UpperBounds)[N], F &&Functor, - R &&...Reducers) { - - auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; - int LinBound = 1; - for (int Rank = 0; Rank < N; ++Rank) { - LinBound *= UpperBounds[Rank]; - } - - auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE); - Kokkos::parallel_reduce( - Label, Policy, - KOKKOS_LAMBDA(const TeamMember &Team, - AccumType> &...Accums) { - const int TeamId = Team.league_rank(); - LinFunctor(TeamId, Team, Accums...); - }, - std::forward(Reducers)...); -} - -// parallelReduceOuter: without label -template -inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor, - R &&...Reducers) { - parallelReduceOuter("", UpperBounds, std::forward(Functor), - std::forward(Reducers)...); -} - -// parallelReduceInner - -template -KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex, - int MaxIndex, F &&Functor, - R &&...Reducers) { - const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); - Kokkos::parallel_reduce(Policy, std::forward(Functor), - std::forward(Reducers)...); -} - -template -KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound, - F &&Functor, R &&...Reducers) { - parallelReduceInner(Team, 0, UpperBound - 1, std::forward(Functor), - std::forward(Reducers)...); -} - -// parallelScanInner +} // end namespace OMEGA -template -KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex, - int MaxIndex, F &&Functor, - R &&...Reducers) { - const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); - Kokkos::parallel_scan(Policy, std::forward(Functor), - std::forward(Reducers)...); -} +// Flat parallelism wrappers +#include "OmegaKokkosFlatPar.h" -template -KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound, - F &&Functor, R &&...Reducers) { - parallelScanInner(Team, 0, UpperBound - 1, std::forward(Functor), - std::forward(Reducers)...); -} - -} // end namespace OMEGA +// Hierarchical parallelism wrappers +#include "OmegaKokkosHiPar.h" //===----------------------------------------------------------------------===// #endif diff --git a/components/omega/src/infra/OmegaKokkosFlatPar.h b/components/omega/src/infra/OmegaKokkosFlatPar.h new file mode 100644 index 000000000000..4077cbda25b3 --- /dev/null +++ b/components/omega/src/infra/OmegaKokkosFlatPar.h @@ -0,0 +1,115 @@ +#ifndef OMEGA_KOKKOS_FLATPAR_H +#define OMEGA_KOKKOS_FLATPAR_H +//===-- infra/OmegaKokkosFlatPar.h - Omega flat parallelism wrappers ------*- +// C++ -*-===// +// +/// \file +/// \brief Omega flat parallelism wrappers +/// +/// INTERNAL HEADER NOT MEANT TO BE INCLUDED DIRECTLY +// +//===--------------------------------------------------------------------------------===// + +namespace OMEGA { + +using Bounds1D = Kokkos::RangePolicy>; + +#if OMEGA_LAYOUT_RIGHT + +template +using Bounds = Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank, + Kokkos::IndexType>; + +#elif OMEGA_LAYOUT_LEFT + +template +using Bounds = Kokkos::MDRangePolicy< + ExecSpace, Kokkos::Rank, + Kokkos::IndexType>; + +#else + +#error "OMEGA Memory Layout is not defined." + +#endif + +// parallelFor: with label +template +inline void parallelFor(const std::string &Label, const int (&UpperBounds)[N], + F &&Functor) { + if constexpr (N == 1) { + const auto Policy = Bounds1D(0, UpperBounds[0]); + Kokkos::parallel_for(Label, Policy, std::forward(Functor)); + + } else { +#ifdef OMEGA_TARGET_DEVICE + // On device convert the functor to use one dimensional indexing and use + // 1D RangePolicy + auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; + int LinBound = 1; + for (int Rank = 0; Rank < N; ++Rank) { + LinBound *= UpperBounds[Rank]; + } + const auto Policy = Bounds1D(0, LinBound); + Kokkos::parallel_for(Label, Policy, std::move(LinFunctor)); +#else + // On host use MDRangePolicy + const int LowerBounds[N] = {0}; + const auto Policy = Bounds(LowerBounds, UpperBounds); + Kokkos::parallel_for(Label, Policy, std::forward(Functor)); +#endif + } +} + +// parallelFor: without label +template +inline void parallelFor(const int (&UpperBounds)[N], F &&Functor) { + parallelFor("", UpperBounds, std::forward(Functor)); +} + +// parallelReduce: with label +template +inline void parallelReduce(const std::string &Label, + const int (&UpperBounds)[N], F &&Functor, + R &&...Reducers) { + if constexpr (N == 1) { + const auto Policy = Bounds1D(0, UpperBounds[0]); + Kokkos::parallel_reduce(Label, Policy, std::forward(Functor), + std::forward(Reducers)...); + + } else { + +#ifdef OMEGA_TARGET_DEVICE + // On device convert the functor to use one dimensional indexing and use + // 1D RangePolicy + auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; + int LinBound = 1; + for (int Rank = 0; Rank < N; ++Rank) { + LinBound *= UpperBounds[Rank]; + } + const auto Policy = Bounds1D(0, LinBound); + Kokkos::parallel_reduce(Label, Policy, std::move(LinFunctor), + std::forward(Reducers)...); +#else + // On host use MDRangePolicy + const int LowerBounds[N] = {0}; + const auto Policy = Bounds(LowerBounds, UpperBounds); + Kokkos::parallel_reduce(Label, Policy, std::forward(Functor), + std::forward(Reducers)...); +#endif + } +} + +// parallelReduce: without label +template +inline void parallelReduce(const int (&UpperBounds)[N], F &&Functor, + R &&...Reducers) { + parallelReduce("", UpperBounds, std::forward(Functor), + std::forward(Reducers)...); +} + +} // end namespace OMEGA + +//===----------------------------------------------------------------------===// +#endif diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h new file mode 100644 index 000000000000..a71e0926990d --- /dev/null +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -0,0 +1,176 @@ +#ifndef OMEGA_KOKKOS_HIPAR_H +#define OMEGA_KOKKOS_HIPAR_H +//===-- infra/OmegaKokkosHiPar.h - Omega hierarchical parallelism wrappers +//------*- C++ -*-===// +// +/// \file +/// \brief Omega hierarchical parallelism wrappers +/// +/// INTERNAL HEADER NOT MEANT TO BE INCLUDED DIRECTLY +// +//===--------------------------------------------------------------------------------------===// + +namespace OMEGA { + +using TeamPolicy = Kokkos::TeamPolicy; +using TeamMember = TeamPolicy::member_type; +using ScratchMemSpace = ExecSpace::scratch_memory_space; +using Kokkos::MemoryUnmanaged; +using Kokkos::PerTeam; +using Kokkos::TeamThreadRange; +using RealScratchArray = + Kokkos::View; + +/// team_size for hierarchical parallelism +#ifdef OMEGA_TARGET_DEVICE +constexpr int OMEGA_TEAMSIZE = 64; +#else +constexpr int OMEGA_TEAMSIZE = 1; +#endif + +#define INNER_LAMBDA [=] +// #define INNER_LAMBDA [&] + +KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) { + Team.team_barrier(); +} + +// parallelForOuter: with label +template +inline void parallelForOuter(const std::string &Label, + const int (&UpperBounds)[N], F &&Functor, + int ScratchValsPerTeam = 0) { + + auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; + int LinBound = 1; + for (int Rank = 0; Rank < N; ++Rank) { + LinBound *= UpperBounds[Rank]; + } + + auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE); + + if (ScratchValsPerTeam > 0) { + Policy.set_scratch_size( + 0, Kokkos::PerTeam(ScratchValsPerTeam * sizeof(Real))); + } + + Kokkos::parallel_for( + Label, Policy, KOKKOS_LAMBDA(const TeamMember &Team) { + const int TeamId = Team.league_rank(); + LinFunctor(TeamId, Team); + }); +} + +// parallelForOuter: without label +template +inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor, + int ScratchValsPerTeam = 0) { + parallelForOuter("", UpperBounds, std::forward(Functor), + ScratchValsPerTeam); +} + +// This struct is used to get the right accumulator type to be used in +// the outer parallel lambda based on the final reduction variable type. +// The final reduction variable can be either a reference to +// an arithmetic type (int&, Real&) or a Kokkos reducer (Kokkos::Max). +// We need to know this type because nvcc does not allow generic lambdas. +template struct AccumTypeHelper; + +template +struct AccumTypeHelper>> { + using Type = T; +}; + +template +struct AccumTypeHelper>> { + using Type = typename T::value_type; +}; + +template using AccumType = typename AccumTypeHelper::Type; + +// parallelReduceOuter: with label +template +inline void parallelReduceOuter(const std::string &Label, + const int (&UpperBounds)[N], F &&Functor, + R &&...Reducers) { + + auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; + int LinBound = 1; + for (int Rank = 0; Rank < N; ++Rank) { + LinBound *= UpperBounds[Rank]; + } + + auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE); + Kokkos::parallel_reduce( + Label, Policy, + KOKKOS_LAMBDA(const TeamMember &Team, + AccumType> &...Accums) { + const int TeamId = Team.league_rank(); + LinFunctor(TeamId, Team, Accums...); + }, + std::forward(Reducers)...); +} + +// parallelReduceOuter: without label +template +inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor, + R &&...Reducers) { + parallelReduceOuter("", UpperBounds, std::forward(Functor), + std::forward(Reducers)...); +} + +// parallelForInner + +template +KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int MinIndex, + int MaxIndex, F &&Functor) { + const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); + Kokkos::parallel_for(Policy, std::forward(Functor)); +} + +template +KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound, + F &&Functor) { + parallelForInner(Team, 0, UpperBound - 1, std::forward(Functor)); +} + +// parallelReduceInner + +template +KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex, + int MaxIndex, F &&Functor, + R &&...Reducers) { + const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); + Kokkos::parallel_reduce(Policy, std::forward(Functor), + std::forward(Reducers)...); +} + +template +KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound, + F &&Functor, R &&...Reducers) { + parallelReduceInner(Team, 0, UpperBound - 1, std::forward(Functor), + std::forward(Reducers)...); +} + +// parallelScanInner + +template +KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex, + int MaxIndex, F &&Functor, + R &&...Reducers) { + const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); + Kokkos::parallel_scan(Policy, std::forward(Functor), + std::forward(Reducers)...); +} + +template +KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound, + F &&Functor, R &&...Reducers) { + parallelScanInner(Team, 0, UpperBound - 1, std::forward(Functor), + std::forward(Reducers)...); +} + +} // end namespace OMEGA + +//===----------------------------------------------------------------------===// +#endif From 906c2cc11c610a773ce6113278b45dbb2838e1a6 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Tue, 17 Mar 2026 12:12:03 -0600 Subject: [PATCH 04/25] RealScratchArray > ArrayScratch1DReal --- components/omega/src/infra/OmegaKokkosHiPar.h | 2 +- components/omega/src/ocn/VertAdv.cpp | 18 +++++++++--------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index a71e0926990d..d9ec635d3a16 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -18,7 +18,7 @@ using ScratchMemSpace = ExecSpace::scratch_memory_space; using Kokkos::MemoryUnmanaged; using Kokkos::PerTeam; using Kokkos::TeamThreadRange; -using RealScratchArray = +using ArrayScratch1DReal = Kokkos::View; /// team_size for hierarchical parallelism diff --git a/components/omega/src/ocn/VertAdv.cpp b/components/omega/src/ocn/VertAdv.cpp index 343136b10d34..635a13e2c853 100644 --- a/components/omega/src/ocn/VertAdv.cpp +++ b/components/omega/src/ocn/VertAdv.cpp @@ -379,7 +379,7 @@ void VertAdv::computeVerticalVelocity( parallelForOuter( "computeVerticalVelocity", {NCellsOwned}, KOKKOS_LAMBDA(int ICell, const TeamMember &Team) { - RealScratchArray DivHU(Team.team_scratch(0), LocNVertLayers); + ArrayScratch1DReal DivHU(Team.team_scratch(0), LocNVertLayers); const Real InvAreaCell = 1._Real / LocAreaCell(ICell); @@ -520,7 +520,7 @@ void VertAdv::computeVelocityVAdvTend( // Allocate scratch space for W times Du/Dz at vertical interfaces // between edges - RealScratchArray WDuDzEdge(Team.team_scratch(0), LocNVertLayersP1); + ArrayScratch1DReal WDuDzEdge(Team.team_scratch(0), LocNVertLayersP1); // Flux is zero at top and bottom Kokkos::single( @@ -829,13 +829,13 @@ void VertAdv::computeFCTVAdvTend( const I4 KMax = MaxLayerCell(ICell); I4 KRange = vertRangeChunked(KMin, KMax); - RealScratchArray InvNewProvThick(Team.team_scratch(0), - LocNVertLayers); - RealScratchArray WorkTend(Team.team_scratch(0), LocNVertLayers); - RealScratchArray FlxIn(Team.team_scratch(0), LocNVertLayers); - RealScratchArray FlxOut(Team.team_scratch(0), LocNVertLayers); - RealScratchArray RescaledFlux(Team.team_scratch(0), - LocNVertLayers + 1); + ArrayScratch1DReal InvNewProvThick(Team.team_scratch(0), + LocNVertLayers); + ArrayScratch1DReal WorkTend(Team.team_scratch(0), LocNVertLayers); + ArrayScratch1DReal FlxIn(Team.team_scratch(0), LocNVertLayers); + ArrayScratch1DReal FlxOut(Team.team_scratch(0), LocNVertLayers); + ArrayScratch1DReal RescaledFlux(Team.team_scratch(0), + LocNVertLayers + 1); parallelForInner( Team, KRange, INNER_LAMBDA(int KChunk) { From 6fedbe6c00bba232a7c4006cdca5c465b4337281 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Tue, 17 Mar 2026 12:14:47 -0600 Subject: [PATCH 05/25] Do not bring MemoryUnmanaged into OMEGA namespace --- components/omega/src/base/TriDiagSolvers.h | 5 +++-- components/omega/src/infra/OmegaKokkosHiPar.h | 1 - 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/components/omega/src/base/TriDiagSolvers.h b/components/omega/src/base/TriDiagSolvers.h index f024a76aa2aa..5b3294ab5080 100644 --- a/components/omega/src/base/TriDiagSolvers.h +++ b/components/omega/src/base/TriDiagSolvers.h @@ -35,8 +35,9 @@ using TriDiagDiffSolver = ThomasDiffusionSolver; #endif // Type of real array of size (NRow, VecLength) in the scratch memory space -using TriDiagScratchArray = Kokkos::View; +using TriDiagScratchArray = + Kokkos::View; // Scratch data for general tridiagonal solver struct TriDiagScratch { diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index d9ec635d3a16..c269fb48d430 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -15,7 +15,6 @@ namespace OMEGA { using TeamPolicy = Kokkos::TeamPolicy; using TeamMember = TeamPolicy::member_type; using ScratchMemSpace = ExecSpace::scratch_memory_space; -using Kokkos::MemoryUnmanaged; using Kokkos::PerTeam; using Kokkos::TeamThreadRange; using ArrayScratch1DReal = From 009b1acbe0b2591069a3e6203c53c716c56248ba Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Tue, 17 Mar 2026 13:48:25 -0600 Subject: [PATCH 06/25] Introduce LaunchConfig and TeamScratch --- components/omega/src/infra/OmegaKokkos.h | 19 +++-- components/omega/src/infra/OmegaKokkosHiPar.h | 78 +++++++++++++++---- components/omega/src/ocn/VertAdv.cpp | 19 ++--- 3 files changed, 86 insertions(+), 30 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h index c8aacd9d91ed..2ca20c49fd3f 100644 --- a/components/omega/src/infra/OmegaKokkos.h +++ b/components/omega/src/infra/OmegaKokkos.h @@ -124,16 +124,17 @@ template struct LinearIdxWrapper : F { static_assert(Rank >= 1 && Rank <= 5, "LinearIdxWrapper supports ranks 1-5"); using F::operator(); - LinearIdxWrapper(F &&Functor, const int (&Bounds)[Rank]) - : F(std::move(Functor)) { - computeStrides(Bounds); + template + LinearIdxWrapper(F &&Functor, Array &&Bounds) : F(std::move(Functor)) { + computeStrides(std::forward(Bounds)); } - LinearIdxWrapper(const F &Functor, const int (&Bounds)[Rank]) : F(Functor) { - computeStrides(Bounds); + template + LinearIdxWrapper(const F &Functor, Array &&Bounds) : F(Functor) { + computeStrides(std::forward(Bounds)); } - void computeStrides(const int (&Bounds)[Rank]) { + template void computeStrides(Array &&Bounds) { if constexpr (Rank > 1) { Strides[Rank - 2] = Bounds[Rank - 1]; for (int I = Rank - 3; I >= 0; --I) { @@ -199,6 +200,12 @@ template struct LinearIdxWrapper : F { #endif }; +// Deduction guides for deducing Rank +template +LinearIdxWrapper(F, const int (&)[Rank]) -> LinearIdxWrapper; +template +LinearIdxWrapper(F, std::array) -> LinearIdxWrapper; + } // end namespace OMEGA // Flat parallelism wrappers diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index c269fb48d430..a57dd448eae2 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -30,27 +30,64 @@ constexpr int OMEGA_TEAMSIZE = 1; #define INNER_LAMBDA [=] // #define INNER_LAMBDA [&] +template struct TeamScratch { + size_t BytesPerTeam = 0; + + TeamScratch() = default; + + template TeamScratch(const int (&NVals)[N]) { + static_assert(N == sizeof...(T)); + int I = 0; + ((BytesPerTeam += sizeof(T) * NVals[I++]), ...); + } + + TeamScratch(int NVals) : TeamScratch({{NVals}}) {} +}; + +template struct LaunchConfig { + std::array UpperBounds; + int TeamSize; + size_t ScratchBytesPerTeam; + + template + LaunchConfig(const int (&UpperBoundsIn)[N], int TeamSize, + const TeamScratch &Scratch) + : TeamSize(TeamSize), ScratchBytesPerTeam(Scratch.BytesPerTeam) { + std::copy(std::begin(UpperBoundsIn), std::end(UpperBoundsIn), + std::begin(UpperBounds)); + } + + template + LaunchConfig(const int (&UpperBounds)[N], const TeamScratch &Scratch) + : LaunchConfig(UpperBounds, OMEGA_TEAMSIZE, Scratch) {} + + LaunchConfig(const int (&UpperBounds)[N], int TeamSize) + : LaunchConfig(UpperBounds, TeamSize, TeamScratch<>{}) {} + + LaunchConfig(const int (&UpperBounds)[N]) + : LaunchConfig(UpperBounds, OMEGA_TEAMSIZE, TeamScratch<>{}) {} +}; + KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) { Team.team_barrier(); } -// parallelForOuter: with label +// parallelForOuter: with label and with launch config template inline void parallelForOuter(const std::string &Label, - const int (&UpperBounds)[N], F &&Functor, - int ScratchValsPerTeam = 0) { + const LaunchConfig &Config, F &&Functor) { - auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; - int LinBound = 1; + auto LinFunctor = + LinearIdxWrapper{std::forward(Functor), Config.UpperBounds}; + int LinBound = 1; for (int Rank = 0; Rank < N; ++Rank) { - LinBound *= UpperBounds[Rank]; + LinBound *= Config.UpperBounds[Rank]; } - auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE); + auto Policy = TeamPolicy(LinBound, Config.TeamSize); - if (ScratchValsPerTeam > 0) { - Policy.set_scratch_size( - 0, Kokkos::PerTeam(ScratchValsPerTeam * sizeof(Real))); + if (Config.ScratchBytesPerTeam > 0) { + Policy.set_scratch_size(0, Kokkos::PerTeam(Config.ScratchBytesPerTeam)); } Kokkos::parallel_for( @@ -60,12 +97,23 @@ inline void parallelForOuter(const std::string &Label, }); } -// parallelForOuter: without label +// parallelForOuter: without label and with launch config +template +inline void parallelForOuter(const LaunchConfig &Config, F &&Functor) { + parallelForOuter("", Config, std::forward(Functor)); +} + +// parallelForOuter: with label and with array bounds +template +inline void parallelForOuter(const std::string &Label, + const int (&UpperBounds)[N], F &&Functor) { + parallelForOuter(Label, LaunchConfig(UpperBounds), std::forward(Functor)); +} + +// parallelForOuter: without label and with array bounds template -inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor, - int ScratchValsPerTeam = 0) { - parallelForOuter("", UpperBounds, std::forward(Functor), - ScratchValsPerTeam); +inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor) { + parallelForOuter("", LaunchConfig(UpperBounds), std::forward(Functor)); } // This struct is used to get the right accumulator type to be used in diff --git a/components/omega/src/ocn/VertAdv.cpp b/components/omega/src/ocn/VertAdv.cpp index 635a13e2c853..081f5a38d6bc 100644 --- a/components/omega/src/ocn/VertAdv.cpp +++ b/components/omega/src/ocn/VertAdv.cpp @@ -377,7 +377,8 @@ void VertAdv::computeVerticalVelocity( // Loop over all cells owned by the task parallelForOuter( - "computeVerticalVelocity", {NCellsOwned}, + "computeVerticalVelocity", + LaunchConfig({NCellsOwned}, TeamScratch(NVertLayers)), KOKKOS_LAMBDA(int ICell, const TeamMember &Team) { ArrayScratch1DReal DivHU(Team.team_scratch(0), LocNVertLayers); @@ -431,8 +432,7 @@ void VertAdv::computeVerticalVelocity( LocVertVel(ICell, KRev) = Accum; } }); - }, - NVertLayers); + }); // TODO: currently assuming TotalVerticalVelocity = VerticalVelocity, i.e. // purely from divergence of horizontal velocity. Need to add optional @@ -510,7 +510,8 @@ void VertAdv::computeVelocityVAdvTend( // Loop over every owned edge parallelForOuter( - "computeVelocityVAdvTend", {NEdgesOwned}, + "computeVelocityVAdvTend", + LaunchConfig({NEdgesOwned}, TeamScratch(NVertLayersP1)), KOKKOS_LAMBDA(int IEdge, const TeamMember &Team) { const I4 Cell1 = LocCOnE(IEdge, 0); const I4 Cell2 = LocCOnE(IEdge, 1); @@ -565,8 +566,7 @@ void VertAdv::computeVelocityVAdvTend( (WDuDzEdge(K) + WDuDzEdge(K + 1)); } }); - }, - NVertLayersP1); + }); } // end computeVelocityVAdvTend @@ -823,7 +823,9 @@ void VertAdv::computeFCTVAdvTend( OMEGA_SCOPE(LocEps, Eps); parallelForOuter( - "computeFCTVAdvTend", {NTracers, NCellsOwned}, + "computeFCTVAdvTend", + LaunchConfig({NTracers, NCellsOwned}, + TeamScratch(5 * NVertLayers + 1)), KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) { const I4 KMin = MinLayerCell(ICell); const I4 KMax = MaxLayerCell(ICell); @@ -951,8 +953,7 @@ void VertAdv::computeFCTVAdvTend( } }); // TODO: Monotonicity and diagnostic checks - }, - 5 * NVertLayers + 1); + }); } // end computeFTCVAdvTend From 5148fa4bc18872782c036aec140cb768b5217b03 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Tue, 17 Mar 2026 15:46:38 -0600 Subject: [PATCH 07/25] Use LaunchConfig to convert tridiag solver to use wrappers --- components/omega/src/base/TriDiagSolvers.h | 68 +++---- components/omega/src/infra/OmegaKokkosHiPar.h | 1 - .../omega/test/base/TriDiagSolversTest.cpp | 185 +++++++++--------- 3 files changed, 119 insertions(+), 135 deletions(-) diff --git a/components/omega/src/base/TriDiagSolvers.h b/components/omega/src/base/TriDiagSolvers.h index 5b3294ab5080..0b9a947667b6 100644 --- a/components/omega/src/base/TriDiagSolvers.h +++ b/components/omega/src/base/TriDiagSolvers.h @@ -57,11 +57,10 @@ struct ThomasSolver { // Create a Kokkos team policy for solving NBatch systems of size NRow // and set scratch size - static TeamPolicy makeTeamPolicy(int NBatch, int NRow) { - TeamPolicy Policy((NBatch + VecLength - 1) / VecLength, 1, 1); - Policy.set_scratch_size( - 0, Kokkos::PerTeam(4 * NRow * VecLength * sizeof(Real))); - return Policy; + static LaunchConfig<1> makeLaunchConfig(int NBatch, int NRow) { + const int NTeams = (NBatch + VecLength - 1) / VecLength; + const int NScratch = 4 * NRow * VecLength; + return LaunchConfig({NTeams}, 1, TeamScratch(NScratch)); } // Solve the system defined in the scratch data argument `Scratch` @@ -101,11 +100,11 @@ struct ThomasSolver { const int NBatch = X.extent_int(0); const int NRow = X.extent_int(1); - TeamPolicy Policy = makeTeamPolicy(NBatch, NRow); + auto LConfig = makeLaunchConfig(NBatch, NRow); - Kokkos::parallel_for( - Policy, KOKKOS_LAMBDA(const TeamMember &Member) { - const int IStart = Member.league_rank() * VecLength; + parallelForOuter( + LConfig, KOKKOS_LAMBDA(const int IChunk, const TeamMember &Member) { + const int IStart = IChunk * VecLength; TriDiagScratch Scratch(Member, NRow); @@ -140,11 +139,9 @@ struct PCRSolver { // Create a Kokkos team policy for solving NBatch systems of size NRow // and set scratch size - static TeamPolicy makeTeamPolicy(int NBatch, int NRow) { - TeamPolicy Policy(NBatch, NRow, 1); - Policy.set_scratch_size( - 0, Kokkos::PerTeam(4 * NRow * VecLength * sizeof(Real))); - return Policy; + static LaunchConfig<1> makeLaunchConfig(int NBatch, int NRow) { + const int NScratch = 4 * NRow * VecLength; + return LaunchConfig({NBatch}, NRow, TeamScratch(NScratch)); } // Solve the system defined in the scratch data argument `Scratch` @@ -218,13 +215,12 @@ struct PCRSolver { static void solve(const Array2DReal &DL, const Array2DReal &D, const Array2DReal &DU, const Array2DReal &X) { - const int NBatch = X.extent_int(0); - const int NRow = X.extent_int(1); - TeamPolicy Policy = makeTeamPolicy(NBatch, NRow); + const int NBatch = X.extent_int(0); + const int NRow = X.extent_int(1); + auto LConfig = makeLaunchConfig(NBatch, NRow); - Kokkos::parallel_for( - Policy, KOKKOS_LAMBDA(const TeamMember &Member) { - const int I = Member.league_rank(); + parallelForOuter( + LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Member) { const int K = Member.team_rank(); TriDiagScratch Scratch(Member, NRow); @@ -264,11 +260,10 @@ struct ThomasDiffusionSolver { // Create a Kokkos team policy for solving NBatch systems of size NRow // and set scratch size - static TeamPolicy makeTeamPolicy(int NBatch, int NRow) { - TeamPolicy Policy((NBatch + VecLength - 1) / VecLength, 1, 1); - Policy.set_scratch_size( - 0, Kokkos::PerTeam(4 * NRow * VecLength * sizeof(Real))); - return Policy; + static LaunchConfig<1> makeLaunchConfig(int NBatch, int NRow) { + const int NTeams = (NBatch + VecLength - 1) / VecLength; + const int NScratch = 4 * NRow * VecLength; + return LaunchConfig({NTeams}, 1, TeamScratch(NScratch)); } // Solve the system defined in the scratch data argument `Scratch` @@ -327,11 +322,11 @@ struct ThomasDiffusionSolver { const int NBatch = X.extent_int(0); const int NRow = X.extent_int(1); - TeamPolicy Policy = makeTeamPolicy(NBatch, NRow); + auto LConfig = makeLaunchConfig(NBatch, NRow); - Kokkos::parallel_for( - Policy, KOKKOS_LAMBDA(const TeamMember &Member) { - const int IStart = Member.league_rank() * VecLength; + parallelForOuter( + LConfig, KOKKOS_LAMBDA(int IChunk, const TeamMember &Member) { + const int IStart = IChunk * VecLength; TriDiagDiffScratch Scratch(Member, NRow); @@ -365,11 +360,9 @@ struct PCRDiffusionSolver { // Create a Kokkos team policy for solving NBatch systems of size NRow // and set scratch size - static TeamPolicy makeTeamPolicy(int NBatch, int NRow) { - TeamPolicy Policy(NBatch, NRow, 1); - Policy.set_scratch_size( - 0, Kokkos::PerTeam(4 * NRow * VecLength * sizeof(Real))); - return Policy; + static LaunchConfig<1> makeLaunchConfig(int NBatch, int NRow) { + const int NScratch = 4 * NRow * VecLength; + return LaunchConfig({NBatch}, NRow, TeamScratch(NScratch)); } // Solve the system defined in the scratch data argument `Scratch` @@ -461,10 +454,9 @@ struct PCRDiffusionSolver { const int NBatch = X.extent_int(0); const int NRow = X.extent_int(1); - TeamPolicy Policy = makeTeamPolicy(NBatch, NRow); - Kokkos::parallel_for( - Policy, KOKKOS_LAMBDA(const TeamMember &Member) { - const int I = Member.league_rank(); + auto LConfig = makeLaunchConfig(NBatch, NRow); + parallelForOuter( + LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Member) { const int K = Member.team_rank(); TriDiagDiffScratch Scratch(Member, NRow); diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index a57dd448eae2..b18c6e3ac97a 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -16,7 +16,6 @@ using TeamPolicy = Kokkos::TeamPolicy; using TeamMember = TeamPolicy::member_type; using ScratchMemSpace = ExecSpace::scratch_memory_space; using Kokkos::PerTeam; -using Kokkos::TeamThreadRange; using ArrayScratch1DReal = Kokkos::View; diff --git a/components/omega/test/base/TriDiagSolversTest.cpp b/components/omega/test/base/TriDiagSolversTest.cpp index f6b08f986c9e..99236b40fe93 100644 --- a/components/omega/test/base/TriDiagSolversTest.cpp +++ b/components/omega/test/base/TriDiagSolversTest.cpp @@ -165,46 +165,44 @@ Real runDiffManufactured(int NCells) { U(ICell) = manufacturedSolution(XCell(ICell), 0); }); - TeamPolicy Policy = TriDiagDiffSolver::makeTeamPolicy(1, NCells); + auto LConfig = TriDiagDiffSolver::makeLaunchConfig(1, NCells); // Integrate in time with backward Euler for (int Step = 0; Step < NSteps; ++Step) { const Real Time = Step * TimeStep; const Real TimeNext = (Step + 1) * TimeStep; - Kokkos::parallel_for( - Policy, KOKKOS_LAMBDA(const TeamMember &Member) { + parallelForOuter( + LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) { TriDiagDiffScratch Scratch(Member, NCells); // Setup the system to be solved - Kokkos::parallel_for( - TeamThreadRange(Member, NCells), [=](int ICell) { - for (int IVec = 0; IVec < VecLength; ++IVec) { - - // Forcing term from the manufactured solution - const Real F = - manufacturedForcing(XCell(ICell), TimeNext); - - Scratch.H(ICell, IVec) = LayerThick(ICell); - - if (ICell == NCells - 1) { - // Boundary condition - const Real XBnd = XVertex(ICell + 1); - const Real BoundaryCoeff = - -(2 + Kokkos::sin(XBnd)) * Kokkos::tan(XBnd); - Scratch.H(ICell, IVec) -= TimeStep * BoundaryCoeff; - Scratch.G(ICell, IVec) = 0; - } else { - const Real AvgLayerThick = - (LayerThick(ICell + 1) + LayerThick(ICell)) / 2; - Scratch.G(ICell, IVec) = - Diffusivity(ICell + 1) * TimeStep / AvgLayerThick; - } - // RHS - Scratch.X(ICell, IVec) = - LayerThick(ICell) * (U(ICell) + TimeStep * F); - } - }); + parallelForInner(Member, NCells, [=](int ICell) { + for (int IVec = 0; IVec < VecLength; ++IVec) { + + // Forcing term from the manufactured solution + const Real F = manufacturedForcing(XCell(ICell), TimeNext); + + Scratch.H(ICell, IVec) = LayerThick(ICell); + + if (ICell == NCells - 1) { + // Boundary condition + const Real XBnd = XVertex(ICell + 1); + const Real BoundaryCoeff = + -(2 + Kokkos::sin(XBnd)) * Kokkos::tan(XBnd); + Scratch.H(ICell, IVec) -= TimeStep * BoundaryCoeff; + Scratch.G(ICell, IVec) = 0; + } else { + const Real AvgLayerThick = + (LayerThick(ICell + 1) + LayerThick(ICell)) / 2; + Scratch.G(ICell, IVec) = + Diffusivity(ICell + 1) * TimeStep / AvgLayerThick; + } + // RHS + Scratch.X(ICell, IVec) = + LayerThick(ICell) * (U(ICell) + TimeStep * F); + } + }); // Solve the system Member.team_barrier(); @@ -212,9 +210,9 @@ Real runDiffManufactured(int NCells) { Member.team_barrier(); // Store the solution - Kokkos::parallel_for( - TeamThreadRange(Member, NCells), - [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); }); + parallelForInner(Member, NCells, [=](int ICell) { + U(ICell) = Scratch.X(ICell, 0); + }); }); } @@ -318,45 +316,42 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) { for (int Step = 0; Step < NSteps; ++Step) { if (UseGeneralSolver) { - TeamPolicy Policy = TriDiagSolver::makeTeamPolicy(1, NCells); + auto LConfig = TriDiagSolver::makeLaunchConfig(1, NCells); - Kokkos::parallel_for( - Policy, KOKKOS_LAMBDA(const TeamMember &Member) { + parallelForOuter( + LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) { TriDiagScratch Scratch(Member, NCells); // Setup the system to be solved in the form expected by the // general tridiagonal solver - Kokkos::parallel_for( - TeamThreadRange(Member, NCells), [=](int ICell) { - for (int IVec = 0; IVec < VecLength; ++IVec) { - - if (ICell < NCells - 1) { - const Real AvgLayerThick = - (LayerThick(ICell + 1) + LayerThick(ICell)) / - 2; - Scratch.DU(ICell, IVec) = -Diffusivity(ICell + 1) * - TimeStep / AvgLayerThick; - } else { - Scratch.DU(ICell, IVec) = 0; - } - - if (ICell > 0) { - const Real AvgLayerThick = - (LayerThick(ICell) + LayerThick(ICell - 1)) / - 2; - Scratch.DL(ICell, IVec) = - -Diffusivity(ICell) * TimeStep / AvgLayerThick; - } else { - Scratch.DL(ICell, IVec) = 0; - } - - Scratch.D(ICell, IVec) = LayerThick(ICell) - - Scratch.DU(ICell, IVec) - - Scratch.DL(ICell, IVec); - - Scratch.X(ICell, IVec) = LayerThick(ICell) * U(ICell); - } - }); + parallelForInner(Member, NCells, [=](int ICell) { + for (int IVec = 0; IVec < VecLength; ++IVec) { + + if (ICell < NCells - 1) { + const Real AvgLayerThick = + (LayerThick(ICell + 1) + LayerThick(ICell)) / 2; + Scratch.DU(ICell, IVec) = + -Diffusivity(ICell + 1) * TimeStep / AvgLayerThick; + } else { + Scratch.DU(ICell, IVec) = 0; + } + + if (ICell > 0) { + const Real AvgLayerThick = + (LayerThick(ICell) + LayerThick(ICell - 1)) / 2; + Scratch.DL(ICell, IVec) = + -Diffusivity(ICell) * TimeStep / AvgLayerThick; + } else { + Scratch.DL(ICell, IVec) = 0; + } + + Scratch.D(ICell, IVec) = LayerThick(ICell) - + Scratch.DU(ICell, IVec) - + Scratch.DL(ICell, IVec); + + Scratch.X(ICell, IVec) = LayerThick(ICell) * U(ICell); + } + }); // Solve the system Member.team_barrier(); @@ -364,38 +359,36 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) { Member.team_barrier(); // Save the solution - Kokkos::parallel_for( - TeamThreadRange(Member, NCells), - [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); }); + parallelForInner(Member, NCells, [=](int ICell) { + U(ICell) = Scratch.X(ICell, 0); + }); }); } else { - TeamPolicy Policy = TriDiagDiffSolver::makeTeamPolicy(1, NCells); + auto LConfig = TriDiagDiffSolver::makeLaunchConfig(1, NCells); - Kokkos::parallel_for( - Policy, KOKKOS_LAMBDA(const TeamMember &Member) { + parallelForOuter( + LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) { TriDiagDiffScratch Scratch(Member, NCells); // Setup the system to be solved in the form expected by the // specialized diffusion tridiagonal solver - Kokkos::parallel_for( - TeamThreadRange(Member, NCells), [=](int ICell) { - for (int IVec = 0; IVec < VecLength; ++IVec) { - - Scratch.H(ICell, IVec) = LayerThick(ICell); - - if (ICell < NCells - 1) { - const Real AvgLayerThick = - (LayerThick(ICell + 1) + LayerThick(ICell)) / - 2; - Scratch.G(ICell, IVec) = Diffusivity(ICell + 1) * - TimeStep / AvgLayerThick; - } else { - Scratch.G(ICell, IVec) = 0; - } - - Scratch.X(ICell, IVec) = LayerThick(ICell) * U(ICell); - } - }); + parallelForInner(Member, NCells, [=](int ICell) { + for (int IVec = 0; IVec < VecLength; ++IVec) { + + Scratch.H(ICell, IVec) = LayerThick(ICell); + + if (ICell < NCells - 1) { + const Real AvgLayerThick = + (LayerThick(ICell + 1) + LayerThick(ICell)) / 2; + Scratch.G(ICell, IVec) = + Diffusivity(ICell + 1) * TimeStep / AvgLayerThick; + } else { + Scratch.G(ICell, IVec) = 0; + } + + Scratch.X(ICell, IVec) = LayerThick(ICell) * U(ICell); + } + }); // Solve the system Member.team_barrier(); @@ -403,9 +396,9 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) { Member.team_barrier(); // Store the solution - Kokkos::parallel_for( - TeamThreadRange(Member, NCells), - [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); }); + parallelForInner(Member, NCells, [=](int ICell) { + U(ICell) = Scratch.X(ICell, 0); + }); }); } } From d966d8a752f5c68f8d2ab153ce339ed1a8b711a2 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Fri, 20 Mar 2026 10:51:04 -0600 Subject: [PATCH 08/25] Add Range struct for inner loop ranges --- components/omega/src/infra/OmegaKokkosHiPar.h | 32 +++++++++++-------- .../omega/test/infra/OmegaKokkosHiParTest.cpp | 17 +++++----- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index b18c6e3ac97a..e2d6a88adb7f 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -165,28 +165,33 @@ inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor, std::forward(Reducers)...); } +// Inclusive range of indices +struct Range { + int First; + int Last; +}; + // parallelForInner template -KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int MinIndex, - int MaxIndex, F &&Functor) { - const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); +KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, Range Rng, + F &&Functor) { + const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1); Kokkos::parallel_for(Policy, std::forward(Functor)); } template KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound, F &&Functor) { - parallelForInner(Team, 0, UpperBound - 1, std::forward(Functor)); + parallelForInner(Team, Range{0, UpperBound - 1}, std::forward(Functor)); } // parallelReduceInner template -KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex, - int MaxIndex, F &&Functor, - R &&...Reducers) { - const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); +KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, Range Rng, + F &&Functor, R &&...Reducers) { + const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1); Kokkos::parallel_reduce(Policy, std::forward(Functor), std::forward(Reducers)...); } @@ -194,17 +199,16 @@ KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex, template KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound, F &&Functor, R &&...Reducers) { - parallelReduceInner(Team, 0, UpperBound - 1, std::forward(Functor), + parallelReduceInner(Team, Range{0, UpperBound - 1}, std::forward(Functor), std::forward(Reducers)...); } // parallelScanInner template -KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex, - int MaxIndex, F &&Functor, - R &&...Reducers) { - const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1); +KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, Range Rng, + F &&Functor, R &&...Reducers) { + const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1); Kokkos::parallel_scan(Policy, std::forward(Functor), std::forward(Reducers)...); } @@ -212,7 +216,7 @@ KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex, template KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound, F &&Functor, R &&...Reducers) { - parallelScanInner(Team, 0, UpperBound - 1, std::forward(Functor), + parallelScanInner(Team, Range{0, UpperBound - 1}, std::forward(Functor), std::forward(Reducers)...); } diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index bdb605c70f36..8bb7f833658d 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -365,7 +365,7 @@ Error testHiparFor2DFor1D(int N1, int N2) { parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { parallelForInner( - Team, J1, J1 + J2, INNER_LAMBDA(int J3) { + Team, Range{J1, J1 + J2}, INNER_LAMBDA(int J3) { A(J1, J2, J3) = f3(J1, J2, J3, N1, N2, N3); }); }); @@ -404,7 +404,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) { {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { I4 Sum; parallelReduceInner( - Team, J1, J1 + J2, + Team, Range{J1, J1 + J2}, INNER_LAMBDA(int J3, I4 &Accum) { Accum += f3(J1, J2, J3, N1, N2, N3); }, @@ -413,7 +413,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) { I4 Max; parallelReduceInner( - Team, J1, J1 + J2, + Team, Range{J1, J1 + J2}, INNER_LAMBDA(int J3, I4 &Accum) { Accum = Kokkos::max(Accum, f3(J1, J2, J3, N1, N2, N3)); }, @@ -437,7 +437,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) { {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { I4 Sum, Max; parallelReduceInner( - Team, J1, J1 + J2, + Team, Range{J1, J1 + J2}, INNER_LAMBDA(int J3, I4 &AccumSum, I4 &AccumMax) { AccumSum += f3(J1, J2, J3, N1, N2, N3); AccumMax = Kokkos::max(AccumMax, f3(J1, J2, J3, N1, N2, N3)); @@ -475,7 +475,8 @@ Error testHiparFor2DScan1D(int N1, int N2) { parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { parallelScanInner( - Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &Accum, bool IsFinal) { + Team, Range{J1, J1 + J2}, + INNER_LAMBDA(int J3, I4 &Accum, bool IsFinal) { if (IsFinal) { RSum(J1, J2, J3) = Accum; } @@ -513,7 +514,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) { KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team, I4 &AccumOuter) { I4 SumInner; parallelReduceInner( - Team, J1, J1 + J2, + Team, Range{J1, J1 + J2}, INNER_LAMBDA(int J3, I4 &AccumInner) { AccumInner += f3(J1, J2, J3, N1, N2, N3); }, @@ -534,7 +535,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) { KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team, I4 &AccumOuter) { I4 MaxInner; parallelReduceInner( - Team, J1, J1 + J2, + Team, Range{J1, J1 + J2}, INNER_LAMBDA(int J3, I4 &AccumInner) { AccumInner = Kokkos::max(AccumInner, f3(J1, J2, J3, N1, N2, N3)); @@ -556,7 +557,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) { I4 &AccumMaxOuter) { I4 SumInner, MaxInner; parallelReduceInner( - Team, J1, J1 + J2, + Team, Range{J1, J1 + J2}, INNER_LAMBDA(int J3, I4 &AccumSumInner, I4 &AccumMaxInner) { AccumSumInner += f3(J1, J2, J3, N1, N2, N3); AccumMaxInner = From 4330b6ca301c6a71edbdd2f4938eb67ba2ac2910 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Fri, 20 Mar 2026 12:20:23 -0600 Subject: [PATCH 09/25] Allow LaunchConfig in parallelReduceOuter --- components/omega/src/infra/OmegaKokkosHiPar.h | 54 +++++++++++++------ 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index e2d6a88adb7f..bc148678c5b0 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -74,19 +74,19 @@ KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) { // parallelForOuter: with label and with launch config template inline void parallelForOuter(const std::string &Label, - const LaunchConfig &Config, F &&Functor) { + const LaunchConfig &LConfig, F &&Functor) { auto LinFunctor = - LinearIdxWrapper{std::forward(Functor), Config.UpperBounds}; + LinearIdxWrapper{std::forward(Functor), LConfig.UpperBounds}; int LinBound = 1; for (int Rank = 0; Rank < N; ++Rank) { - LinBound *= Config.UpperBounds[Rank]; + LinBound *= LConfig.UpperBounds[Rank]; } - auto Policy = TeamPolicy(LinBound, Config.TeamSize); + auto Policy = TeamPolicy(LinBound, LConfig.TeamSize); - if (Config.ScratchBytesPerTeam > 0) { - Policy.set_scratch_size(0, Kokkos::PerTeam(Config.ScratchBytesPerTeam)); + if (LConfig.ScratchBytesPerTeam > 0) { + Policy.set_scratch_size(0, Kokkos::PerTeam(LConfig.ScratchBytesPerTeam)); } Kokkos::parallel_for( @@ -98,8 +98,8 @@ inline void parallelForOuter(const std::string &Label, // parallelForOuter: without label and with launch config template -inline void parallelForOuter(const LaunchConfig &Config, F &&Functor) { - parallelForOuter("", Config, std::forward(Functor)); +inline void parallelForOuter(const LaunchConfig &LConfig, F &&Functor) { + parallelForOuter("", LConfig, std::forward(Functor)); } // parallelForOuter: with label and with array bounds @@ -134,19 +134,24 @@ struct AccumTypeHelper>> { template using AccumType = typename AccumTypeHelper::Type; -// parallelReduceOuter: with label +// parallelReduceOuter: with label and with launch config template inline void parallelReduceOuter(const std::string &Label, - const int (&UpperBounds)[N], F &&Functor, + const LaunchConfig &LConfig, F &&Functor, R &&...Reducers) { - auto LinFunctor = LinearIdxWrapper{std::forward(Functor), UpperBounds}; - int LinBound = 1; + auto LinFunctor = + LinearIdxWrapper{std::forward(Functor), LConfig.UpperBounds}; + int LinBound = 1; for (int Rank = 0; Rank < N; ++Rank) { - LinBound *= UpperBounds[Rank]; + LinBound *= LConfig.UpperBounds[Rank]; + } + + auto Policy = TeamPolicy(LinBound, LConfig.TeamSize); + if (LConfig.ScratchBytesPerTeam > 0) { + Policy.set_scratch_size(0, Kokkos::PerTeam(LConfig.ScratchBytesPerTeam)); } - auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE); Kokkos::parallel_reduce( Label, Policy, KOKKOS_LAMBDA(const TeamMember &Team, @@ -157,11 +162,28 @@ inline void parallelReduceOuter(const std::string &Label, std::forward(Reducers)...); } -// parallelReduceOuter: without label +// parallelReduceOuter: without label and with launch config +template +inline void parallelReduceOuter(const LaunchConfig &LConfig, F &&Functor, + R &&...Reducers) { + parallelReduceOuter("", LConfig, std::forward(Functor), + std::forward(Reducers)...); +} + +// parallelReduceOuter: with label and with array bounds +template +inline void parallelReduceOuter(const std::string Label, + const int (&UpperBounds)[N], F &&Functor, + R &&...Reducers) { + parallelReduceOuter(Label, LaunchConfig(UpperBounds), + std::forward(Functor), std::forward(Reducers)...); +} + +// parallelReduceOuter: without label and with array bounds template inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor, R &&...Reducers) { - parallelReduceOuter("", UpperBounds, std::forward(Functor), + parallelReduceOuter("", LaunchConfig(UpperBounds), std::forward(Functor), std::forward(Reducers)...); } From 9cfc61bda6a928f385ee81bd887ccbbb91d5a26f Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Fri, 20 Mar 2026 13:36:30 -0600 Subject: [PATCH 10/25] Add parallelSearchInner --- components/omega/src/infra/OmegaKokkosHiPar.h | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index bc148678c5b0..c627a2216624 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -242,6 +242,44 @@ KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound, std::forward(Reducers)...); } +// parallelSearchInner +// Given a functor taking an index and returning a bool this function +// returns the first index in the range [0, UpperBound) for which the input +// functor returns true. If no such index is found it returns -1 +template +KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound, + F &&Functor, int &Idx) { + static_assert(std::is_same_v, bool>, + "parallelSearchInner requires a functor that takes an int and " + "returns bool"); + + // There are different implementations for host and device since the + // parallel_reduce version doesn't return early leading to performance loss + // on CPUs +#ifndef OMEGA_TARGET_DEVICE + Idx = -1; + for (int I = 0; I < UpperBound; ++I) { + if (Functor(I)) { + Idx = I; + break; + } + } +#else + const auto Policy = TeamThreadRange(Team, UpperBound); + Kokkos::parallel_reduce( + Policy, + INNER_LAMBDA(int I, int &Accum) { + if (I <= Accum && Functor(I)) { + Accum = I; + } + }, + Kokkos::Min(Idx)); + if (Idx == Kokkos::reduction_identity::min()) { + Idx = -1; + } +#endif +} + } // end namespace OMEGA //===----------------------------------------------------------------------===// From bdd4c37b7ffe305ea33313b4ed431d11d39b15d4 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 5 Feb 2026 12:27:43 -0700 Subject: [PATCH 11/25] Add test for parallelSearchInner --- .../omega/test/infra/OmegaKokkosHiParTest.cpp | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index 8bb7f833658d..f4789333eb50 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -262,6 +262,110 @@ Error testHiparReduce1DReduce1D(int N1) { return Err; } +Error testHiparFor1DSearch1D(int N2) { + Error Err; + + const int Threshold = N2 / 2; + const int N1 = 3 * N2 + 3; + + HostArray2DI4 DataH("DataH", N1, N2); + + for (int J1 = 0; J1 < 3 * N2; ++J1) { + if (J1 < N2 + 10) { + for (int J2 = 0; J2 < N2; ++J2) { + DataH(J1, J2) = Threshold - (J1 - J2); + } + } else { + for (int J2 = 0; J2 < N2; ++J2) { + DataH(J1, J2) = Threshold - (J1 / 4 - J2); + } + } + } + + // Ensure these patterns are in the input data + for (int J2 = 0; J2 < N2; ++J2) { + // Everything above threshold + DataH(3 * N2, J2) = Threshold + 1; + // Everything below threshold + DataH(3 * N2 + 1, J2) = Threshold - 1; + // Multiple non-consecutive values above threshold + DataH(3 * N2 + 2, J2) = Threshold - 3 + J2 % 4; + } + + auto DataD = createDeviceMirrorCopy(DataH); + + HostArray1DI4 RefIdxH("RefIdxH", N1); + Array1DI4 IdxD("IdxD", N1); + + // test searching full range + + for (int J1 = 0; J1 < N1; ++J1) { + int Idx = -1; + for (int J2 = 0; J2 < N2; ++J2) { + if (DataH(J1, J2) >= Threshold) { + Idx = J2; + break; + } + } + RefIdxH(J1) = Idx; + } + + parallelForOuter( + {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + parallelSearchInner( + Team, N2, + INNER_LAMBDA(int J2) { return DataD(J1, J2) >= Threshold; }, + IdxD(J1)); + }); + + if (!arraysEqual(IdxD, RefIdxH)) { + Err += Error(ErrorCode::Fail, + errorMsg("parallelFor1DSearch1D Full FAIL", N1)); + } + + deepCopy(RefIdxH, 0); + deepCopy(IdxD, 0); + + // test searching limited range + + if (N2 / 4 > 0) { + + for (int J1 = 0; J1 < N1; ++J1) { + int Idx = -1; + const int Start = N2 / 4 - J1 % (N2 / 4); + const int End = 3 * N2 / 4 + J1 % (N2 / 4); + for (int J2 = Start; J2 < End; ++J2) { + if (DataH(J1, J2) >= Threshold) { + Idx = J2; + break; + } + } + RefIdxH(J1) = Idx; + } + + parallelForOuter( + {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + const int Start = N2 / 4 - J1 % (N2 / 4); + const int End = 3 * N2 / 4 + J1 % (N2 / 4); + int SearchIdx; + parallelSearchInner( + Team, End - Start, + INNER_LAMBDA(int J2) { + return DataD(J1, J2 + Start) >= Threshold; + }, + SearchIdx); + IdxD(J1) = SearchIdx == -1 ? SearchIdx : SearchIdx + Start; + }); + + if (!arraysEqual(IdxD, RefIdxH)) { + Err += Error(ErrorCode::Fail, + errorMsg("parallelFor1DSearch1D Limited FAIL", N1)); + } + } + + return Err; +} + Error testHiparFor1DMultiple1D(int N1, int N2) { Error Err; @@ -688,6 +792,7 @@ int main(int argc, char **argv) { #if !defined(KOKKOS_ENABLE_SYCL) || KOKKOS_VERSION_GREATER_EQUAL(4, 7, 1) Err += testHiparReduce1DReduce1D(N1); #endif + Err += testHiparFor1DSearch1D(N1); Err += testHiparFor1DMultiple1D(1, N1); Err += testHiparFor1DMultiple1D((N1 + 1) / 2, N1); From 8a68967a0075cf6c670563613e086deddcad47f6 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 5 Feb 2026 16:04:28 -0700 Subject: [PATCH 12/25] Add docs for parallelSearchInner --- .../omega/doc/devGuide/ParallelLoops.md | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md index 2ada95cfdb71..9a8251d4228c 100644 --- a/components/omega/doc/devGuide/ParallelLoops.md +++ b/components/omega/doc/devGuide/ParallelLoops.md @@ -115,6 +115,7 @@ The following inner iteration patterns are supported in Omega: - `parallelForInner` - `parallelReduceInner` - `parallelScanInner` +- `parallelSearchInner` To provide even more flexibility, the outer loops support iterating over a multi-dimensional range. Currently, the inner loops are limited to one dimension. @@ -277,3 +278,25 @@ Moreover, this example illustrates that the final scan value can be obtained by an additional argument `FinalScanValue`. Labels are not supported by `parallelScanInner` and only one-dimensional index range can be used. In contrast to `parallelReduceInner`, `parallelScanInner` supports only sum-based scans and only one scan variable. + +### parallelSearchInner +To search an index range in parallel for the first index where a given condition occurs Omega +provides the `parallelSearchInner` function. +For example, the following code finds, for each row of a matrix, the first column index where +the matrix element is above a certain threshold. If no element matches the condition then +`parallelSearchInner` returns `-1`. +```c++ + Array2DReal M("M", N1, N2); + Array1DI3 ThresholdIdx("ThresholdIdx", N1); + parallelForOuter( + {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + + int Idx; + parallelSearchInner(Team, N2, INNER_LAMBDA(Int J2) { + return M(J1, J2) > Threshold; + }, Idx); + + ThresholdIdx(J1) = Idx; + }); +``` +Labels are not supported by `parallelSearchInner` and only one-dimensional index range can be used. From e1835b44f2591b09ff4b3e2224c15a2f24689665 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Wed, 25 Feb 2026 16:40:53 -0700 Subject: [PATCH 13/25] Incorporate copilot suggestions Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../omega/doc/devGuide/ParallelLoops.md | 25 ++++++++++--------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md index 9a8251d4228c..7d4a5ae190ab 100644 --- a/components/omega/doc/devGuide/ParallelLoops.md +++ b/components/omega/doc/devGuide/ParallelLoops.md @@ -192,7 +192,7 @@ a 3D array in parallel using hierarchical parallelism. Array3DReal A("A", N1, N2, N3); parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { - parallelForInner(Team, N3, INNER_LAMBDA(Int J3) { + parallelForInner(Team, N3, INNER_LAMBDA(int J3) { A(J1, J2, J3) = J1 + J2 + J3; }); }); @@ -204,7 +204,7 @@ diagonal of a square matrix one can do: Array2DReal M("M", N, N); parallelForOuter( {N}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { - parallelForInner(Team, J1, INNER_LAMBDA(Int J2) { + parallelForInner(Team, J1, INNER_LAMBDA(int J2) { M(J1, J2) = J1 + J2; }); }); @@ -220,7 +220,7 @@ in a 2D array might be done as follows. parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { Real SumD3; - parallelReduceInner(Team, N3, INNER_LAMBDA(Int J3, Real &Accum) { + parallelReduceInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum) { Accum += A(J1, J2, J3); }, SumD3); B(J1, J2) = SumD3; @@ -234,10 +234,10 @@ For example, to additionally compute and store maxima along the third dimension parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { Real SumD3, MaxD3; - parallelReduceInner(Team, N3, INNER_LAMBDA(Int J3, Real &AccumSum, Real &AccumMax) { + parallelReduceInner(Team, N3, INNER_LAMBDA(int J3, Real &AccumSum, Real &AccumMax) { AccumSum += A(J1, J2, J3); - AccumMax = Kokkos::Max(AccumMax, A(J1, J2, J3)); - }, SumN3, MaxN3); + AccumMax = Kokkos::max(AccumMax, A(J1, J2, J3)); + }, SumD3, Kokkos::Max(MaxD3)); B(J1, J2) = SumD3; C(J1, J2) = MaxD3; }); @@ -254,7 +254,7 @@ be done as follows. Array3DReal D("D", N1, N2, N3); parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { - parallelScanInner(Team, N1, INNER_LAMBDA(Int J3, Real &Accum, bool IsFinal) { + parallelScanInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum, bool IsFinal) { Accum += A(J1, J2, J3); if (IsFinal) { D(J1, J2, J3) = Accum; @@ -267,7 +267,7 @@ before the `if` statement. That is, it performs an inclusive scan. To compute an simply move the addition after the `if` statement. ```c++ Real FinalScanValue; - parallelScanInner(Team, N1, INNER_LAMBDA(Int J3, Real &Accum, bool IsFinal) { + parallelScanInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum, bool IsFinal) { if (IsFinal) { D(J1, J2, J3) = Accum; } @@ -280,19 +280,20 @@ and only one-dimensional index range can be used. In contrast to `parallelReduce `parallelScanInner` supports only sum-based scans and only one scan variable. ### parallelSearchInner -To search an index range in parallel for the first index where a given condition occurs Omega -provides the `parallelSearchInner` function. +To search an index range in parallel for the first index at which a given condition occurs, +Omega provides the `parallelSearchInner` function. For example, the following code finds, for each row of a matrix, the first column index where the matrix element is above a certain threshold. If no element matches the condition then `parallelSearchInner` returns `-1`. ```c++ Array2DReal M("M", N1, N2); - Array1DI3 ThresholdIdx("ThresholdIdx", N1); + Array1DI4 ThresholdIdx("ThresholdIdx", N1); + const Real Threshold = 0.5; parallelForOuter( {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { int Idx; - parallelSearchInner(Team, N2, INNER_LAMBDA(Int J2) { + parallelSearchInner(Team, N2, INNER_LAMBDA(int J2) { return M(J1, J2) > Threshold; }, Idx); From 2eeeb91b2009b8d2b73c1d02dd2d683bfaa79b40 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Fri, 20 Mar 2026 16:11:13 -0600 Subject: [PATCH 14/25] Add a test using LaunchConfig and TeamScratch --- components/omega/src/infra/OmegaKokkosHiPar.h | 2 + .../omega/test/infra/OmegaKokkosHiParTest.cpp | 71 +++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index c627a2216624..232bc1100182 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -18,6 +18,8 @@ using ScratchMemSpace = ExecSpace::scratch_memory_space; using Kokkos::PerTeam; using ArrayScratch1DReal = Kokkos::View; +using ArrayScratch1DI4 = + Kokkos::View; /// team_size for hierarchical parallelism #ifdef OMEGA_TARGET_DEVICE diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index f4789333eb50..2e74f01b9dbb 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -366,6 +366,75 @@ Error testHiparFor1DSearch1D(int N2) { return Err; } +Error testHiparLaunchConfig1D(int N1, int N2) { + Error Err; + + HostArray2DReal RefOutH("RefOutH", N1, N2 - 3); + + for (int J1 = 0; J1 < N1; ++J1) { + HostArray1DI4 ScratchAH("ScratchAH", N2); + + for (int J2 = 0; J2 < N2; ++J2) { + ScratchAH(J2) = f2(J1, J2, N1, N2) * f2(J1, J2, N1, N2); + } + + HostArray1DReal ScratchBH("ScratchBH", N2 - 2); + + for (int J2 = 1; J2 < N2 - 1; ++J2) { + ScratchBH(J2 - 1) = + 1._Real / (1._Real + ScratchAH(J2 + 1) - ScratchAH(J2 - 1)); + } + + for (int J2 = 0; J2 < N2 - 3; ++J2) { + RefOutH(J1, J2) = ScratchBH(J2) / ScratchBH(J2 + 1); + } + } + + Array2DReal OutD("OutD", N1, N2 - 3); + +#ifdef OMEGA_DEVICE + const int TeamSize = 32; +#else + const int TeamSize = 1; +#endif + + auto LConfig = + LaunchConfig({N1}, TeamSize, TeamScratch({N2, N2 - 2})); + parallelForOuter( + LConfig, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + ArrayScratch1DI4 ScratchA(Team.team_scratch(0), N2); + + parallelForInner( + Team, N2, INNER_LAMBDA(int J2) { + ScratchA(J2) = f2(J1, J2, N1, N2) * f2(J1, J2, N1, N2); + }); + + teamBarrier(Team); + + ArrayScratch1DReal ScratchB(Team.team_scratch(0), N2 - 2); + + parallelForInner( + Team, Range{1, N2 - 2}, INNER_LAMBDA(int J2) { + ScratchB(J2 - 1) = + 1._Real / (1._Real + ScratchA(J2 + 1) - ScratchA(J2 - 1)); + }); + + teamBarrier(Team); + + parallelForInner( + Team, N2 - 3, INNER_LAMBDA(int J2) { + OutD(J1, J2) = ScratchB(J2) / ScratchB(J2 + 1); + }); + }); + + if (!arraysEqual(OutD, RefOutH)) { + Err += Error(ErrorCode::Fail, + errorMsg("parallelForLaunchConfig1D FAIL", N1, N2)); + } + + return Err; +} + Error testHiparFor1DMultiple1D(int N1, int N2) { Error Err; @@ -794,6 +863,8 @@ int main(int argc, char **argv) { #endif Err += testHiparFor1DSearch1D(N1); + Err += testHiparLaunchConfig1D(2 * N1, N1 + 3); + Err += testHiparFor1DMultiple1D(1, N1); Err += testHiparFor1DMultiple1D((N1 + 1) / 2, N1); Err += testHiparFor1DMultiple1D(2 * N1, N1); From e8ef30081c0971da1f0835673fe52ea89409d939 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Fri, 20 Mar 2026 16:20:52 -0600 Subject: [PATCH 15/25] Simplify and add comment to TeamScratch --- components/omega/src/infra/OmegaKokkosHiPar.h | 12 ++++++------ components/omega/test/infra/OmegaKokkosHiParTest.cpp | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index 232bc1100182..0caa6be49d1f 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -31,18 +31,18 @@ constexpr int OMEGA_TEAMSIZE = 1; #define INNER_LAMBDA [=] // #define INNER_LAMBDA [&] +// Helper struct for providing information about scratch memory requirements +// TeamScratch(4, 8) stores the number of bytes needed for +// 4 values of type Real and 8 vals of type I4 template struct TeamScratch { size_t BytesPerTeam = 0; TeamScratch() = default; - template TeamScratch(const int (&NVals)[N]) { - static_assert(N == sizeof...(T)); - int I = 0; - ((BytesPerTeam += sizeof(T) * NVals[I++]), ...); + template TeamScratch(ArgT... Args) { + static_assert(sizeof...(ArgT) == sizeof...(T)); + ((BytesPerTeam += sizeof(T) * Args), ...); } - - TeamScratch(int NVals) : TeamScratch({{NVals}}) {} }; template struct LaunchConfig { diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index 2e74f01b9dbb..f243e3d6b6c8 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -399,7 +399,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) { #endif auto LConfig = - LaunchConfig({N1}, TeamSize, TeamScratch({N2, N2 - 2})); + LaunchConfig({N1}, TeamSize, TeamScratch(N2, N2 - 2)); parallelForOuter( LConfig, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { ArrayScratch1DI4 ScratchA(Team.team_scratch(0), N2); From db67c73d9eb376dfc6fcfe6310a86d2b7ab53430 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 26 Mar 2026 15:41:56 -0600 Subject: [PATCH 16/25] Support Range in parallelSearchInner --- components/omega/src/infra/OmegaKokkosHiPar.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index 0caa6be49d1f..720b52086bf4 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -249,7 +249,7 @@ KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound, // returns the first index in the range [0, UpperBound) for which the input // functor returns true. If no such index is found it returns -1 template -KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound, +KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, Range Rng, F &&Functor, int &Idx) { static_assert(std::is_same_v, bool>, "parallelSearchInner requires a functor that takes an int and " @@ -260,14 +260,14 @@ KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound, // on CPUs #ifndef OMEGA_TARGET_DEVICE Idx = -1; - for (int I = 0; I < UpperBound; ++I) { + for (int I = Rng.First; I <= Rng.Last; ++I) { if (Functor(I)) { Idx = I; break; } } #else - const auto Policy = TeamThreadRange(Team, UpperBound); + const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1); Kokkos::parallel_reduce( Policy, INNER_LAMBDA(int I, int &Accum) { @@ -282,6 +282,13 @@ KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound, #endif } +template +KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound, + F &&Functor, int &Idx) { + parallelSearchInner(Team, Range{0, UpperBound - 1}, std::forward(Functor), + Idx); +} + } // end namespace OMEGA //===----------------------------------------------------------------------===// From 41efca9eff84b4ac8f16b8ed17f212bafded7f6e Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Fri, 27 Mar 2026 16:33:21 -0600 Subject: [PATCH 17/25] Add teamScratch --- components/omega/src/infra/OmegaKokkosHiPar.h | 4 ++++ components/omega/test/infra/OmegaKokkosHiParTest.cpp | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index 720b52086bf4..0f3266e5dc5a 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -73,6 +73,10 @@ KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) { Team.team_barrier(); } +KOKKOS_INLINE_FUNCTION decltype(auto) teamScratch(const TeamMember &Team) { + return Team.team_scratch(0); +} + // parallelForOuter: with label and with launch config template inline void parallelForOuter(const std::string &Label, diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index f243e3d6b6c8..7f3a09da6ed8 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -402,7 +402,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) { LaunchConfig({N1}, TeamSize, TeamScratch(N2, N2 - 2)); parallelForOuter( LConfig, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { - ArrayScratch1DI4 ScratchA(Team.team_scratch(0), N2); + ArrayScratch1DI4 ScratchA(teamScratch(Team), N2); parallelForInner( Team, N2, INNER_LAMBDA(int J2) { @@ -411,7 +411,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) { teamBarrier(Team); - ArrayScratch1DReal ScratchB(Team.team_scratch(0), N2 - 2); + ArrayScratch1DReal ScratchB(teamScratch(Team), N2 - 2); parallelForInner( Team, Range{1, N2 - 2}, INNER_LAMBDA(int J2) { From 41bf4a9e116bc064cf3612c95ee46330fe6074bb Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Fri, 27 Mar 2026 17:03:49 -0600 Subject: [PATCH 18/25] Add namespace to TeamThreadRange --- components/omega/src/infra/OmegaKokkosHiPar.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index 0f3266e5dc5a..6a2093e4994b 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -204,7 +204,7 @@ struct Range { template KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, Range Rng, F &&Functor) { - const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1); + const auto Policy = Kokkos::TeamThreadRange(Team, Rng.First, Rng.Last + 1); Kokkos::parallel_for(Policy, std::forward(Functor)); } @@ -219,7 +219,7 @@ KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound, template KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, Range Rng, F &&Functor, R &&...Reducers) { - const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1); + const auto Policy = Kokkos::TeamThreadRange(Team, Rng.First, Rng.Last + 1); Kokkos::parallel_reduce(Policy, std::forward(Functor), std::forward(Reducers)...); } @@ -236,7 +236,7 @@ KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound, template KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, Range Rng, F &&Functor, R &&...Reducers) { - const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1); + const auto Policy = Kokkos::TeamThreadRange(Team, Rng.First, Rng.Last + 1); Kokkos::parallel_scan(Policy, std::forward(Functor), std::forward(Reducers)...); } @@ -271,7 +271,7 @@ KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, Range Rng, } } #else - const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1); + const auto Policy = Kokkos::TeamThreadRange(Team, Rng.First, Rng.Last + 1); Kokkos::parallel_reduce( Policy, INNER_LAMBDA(int I, int &Accum) { From 5145145906b350a7cd1722bf99992de66239de44 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Wed, 1 Apr 2026 11:06:47 -0600 Subject: [PATCH 19/25] Fix tests --- components/omega/test/infra/OmegaKokkosHiParTest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index 7f3a09da6ed8..01587e7ed558 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -399,7 +399,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) { #endif auto LConfig = - LaunchConfig({N1}, TeamSize, TeamScratch(N2, N2 - 2)); + LaunchConfig({N1}, TeamSize, TeamScratch(N2 - 2, N2)); parallelForOuter( LConfig, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { ArrayScratch1DI4 ScratchA(teamScratch(Team), N2); From dbbb9089ff1b44f9d8a2c73da88d9448b05ab314 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 19 Mar 2026 15:25:09 -0600 Subject: [PATCH 20/25] Update parallel loops docs --- .../omega/doc/devGuide/ParallelLoops.md | 97 +++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md index 7d4a5ae190ab..a9c0d0e0e22c 100644 --- a/components/omega/doc/devGuide/ParallelLoops.md +++ b/components/omega/doc/devGuide/ParallelLoops.md @@ -150,6 +150,26 @@ To do that Kokkos provides the `single` function. To execute a statement once pe }); ``` +### Inner Iteration Ranges + +There are two ways of specifying the iteration range of an inner loop. +The first takes the total number of iterations `N` as the second argument +```c++ + parallelForInner(Team, N, INNER_LAMBDA (int K) { + }); +``` +and the loop index `K` takes values from `0` up to and including `N - 1`. +The second way uses a helper struct `Range` to provide a range of valid indices +```c++ + parallelForInner(Team, Range{N1, N2}, INNER_LAMBDA (int K) { + }); +``` +Note that this range is inclusive, i.e. the loop index `K` takes values from `N1` up to and including `N2`. +This means that `Range{0, N}` specifies a diffrent range than the first example. +For simplicity, most examples in this document use the first way of specyfying the range, +but a `Range` argument can be passed to all inner iteration patters. + + ### parallelForOuter To start outer iterations over a multidimensional index range the `parallelForOuter` wrapper is available. A call to `parallelForOuter` might look as follows. @@ -301,3 +321,80 @@ the matrix element is above a certain threshold. If no element matches the condi }); ``` Labels are not supported by `parallelSearchInner` and only one-dimensional index range can be used. + +### Launch Config + +While specyfing loop bounds is enough to start an outer parallel loop, sometimes more control over the underlaying +Kokkos `TeamPolicy` is desired. The most common use case is utilizing scratch memory, a concept discussed more +thoroughly in the next sub-section. To enable more control, outer loops can be launched by providing +a `LaunchConfig` struct as the first argument, which is composed of three parts: +- loop bounds, +- team size, +- amount of scratch memory. + +For example, the following snippet launches a loop iterating over a two-dimensional index range +with team size of 32 and enough scratch memory for 8 `Real` values and 4 `I4` values per team. +```c++ + auto LConfig = LaunchConfig({N1, N2}, 32, TeamScratch(8, 4)); + parallelForOuter(LConfig, + KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { + }); +``` +It is not necessary to provide all three arguments to `LaunchConfig`. If you want the default team size, +or you don't need any scratch memory, you can use the follwing constructors. +```c++ + auto LConfig1 = LaunchConfig({N1, N2}, TeamScratch(8, 4)); + auto LConfig2 = LaunchConfig({N1, N2}, 32); +``` +For simplicity, most examples in this document use the simple form of launching outer loops with just the bounds, +but `LaunchConfig` can be used for all types of outer parallel loops. +Inner parallel loops cannot use `LaunchConfig`. + +### Team Scratch Memory + +In hierarchical code, it is often useful to have some amount of scratch memory private to each team. +Scratch memory enables reuse of expensive to compute data in inner loops. +To enable scratch memory, the outer loops needs to be launched with the `LaunchConfig` parameter described above, +configured with the requested number of scratch values. +Inside the outer loop, unmanaged scratch arrays can be created from a pool of memory accesible +by calling the `teamScratch(Team)` function. +Scratch arrays have a different type than normal Omega arrays, for example `ArrayScratch1DReal` is the +type of a 1D scratch array of Reals. They also cannot have labels. + +As an example, the following code uses scratch memory to compute an expensive function on elements of a 2D array `A`. +It then computes finite differences along the second dimension of the scratch array, and stores them in `A`. +By using scratch memory, the expensive function is only computed once for every element, and there is no need for global memory allocation. +```c++ + Array2DReal A("A", N1, N2); + parallelForOuter( + LaunchConfig({N1}, TeamScratch(N2)), + KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + + ArrayScratch1DReal SA(teamScratch(Team), N2); + + parallelForInner(Team, N2, INNER_LAMBDA (int J2) { + SA(J2) = expensiveFunc(A(J1, J2)); + }); + + teamBarrier(Team); + + parallelForInner(Team, N2, INNER_LAMBDA (int J2) { + + const int J2M1 = Kokkos::max(J2 - 1, 0); + const int J2P1 = Kokkos::min(J2 + 1, N2 - 1); + + A(J1, J2) = SA(J2P1) - SA(J2M1); + }); + }); +``` +You can create multiple scratch arrays of different types, as in the following code. +```c++ + parallelForOuter( + LaunchConfig({N1}, TeamScratch(4, 8)), + KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + ArrayScratch1DI4 ScratchI4(teamScratch(Team), 8); + ArrayScratch1DReal ScratchReal(teamScratch(Team), 4); + }); +``` +As the above example illustrates, the order in which the arrays are created inside the outer region +doesn't need to match the order of arguments to `TeamScratch`. From 12b39ae36f911ec024247f657d4b0aa424b86ce6 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Wed, 1 Apr 2026 14:31:17 -0600 Subject: [PATCH 21/25] Simplify par search tests using Range --- components/omega/test/infra/OmegaKokkosHiParTest.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index 01587e7ed558..b5b08de34908 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -349,12 +349,10 @@ Error testHiparFor1DSearch1D(int N2) { const int End = 3 * N2 / 4 + J1 % (N2 / 4); int SearchIdx; parallelSearchInner( - Team, End - Start, - INNER_LAMBDA(int J2) { - return DataD(J1, J2 + Start) >= Threshold; - }, + Team, Range{Start, End - 1}, + INNER_LAMBDA(int J2) { return DataD(J1, J2) >= Threshold; }, SearchIdx); - IdxD(J1) = SearchIdx == -1 ? SearchIdx : SearchIdx + Start; + IdxD(J1) = SearchIdx; }); if (!arraysEqual(IdxD, RefIdxH)) { From 8eb573adbf95fcff736db6b139691dc8699c0fe1 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 2 Apr 2026 13:47:22 -0600 Subject: [PATCH 22/25] Fix scratch alignment issue --- components/omega/src/infra/OmegaKokkosHiPar.h | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index 6a2093e4994b..c810d6f880d7 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -16,10 +16,13 @@ using TeamPolicy = Kokkos::TeamPolicy; using TeamMember = TeamPolicy::member_type; using ScratchMemSpace = ExecSpace::scratch_memory_space; using Kokkos::PerTeam; -using ArrayScratch1DReal = - Kokkos::View; -using ArrayScratch1DI4 = - Kokkos::View; + +template +using ArrayScratch1D = + Kokkos::View; + +using ArrayScratch1DReal = ArrayScratch1D; +using ArrayScratch1DI4 = ArrayScratch1D; /// team_size for hierarchical parallelism #ifdef OMEGA_TARGET_DEVICE @@ -41,7 +44,7 @@ template struct TeamScratch { template TeamScratch(ArgT... Args) { static_assert(sizeof...(ArgT) == sizeof...(T)); - ((BytesPerTeam += sizeof(T) * Args), ...); + ((BytesPerTeam += ArrayScratch1D::shmem_size(Args)), ...); } }; From 6753a0b7e10f7fcf5a7b99f4361dfdeb6e1103d3 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 2 Apr 2026 15:02:23 -0600 Subject: [PATCH 23/25] Add copilot fixes Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- components/omega/doc/devGuide/ParallelLoops.md | 12 ++++++------ components/omega/src/infra/OmegaKokkos.h | 4 +++- components/omega/src/infra/OmegaKokkosHiPar.h | 2 +- components/omega/test/infra/OmegaKokkosHiParTest.cpp | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md index a9c0d0e0e22c..804f04ad49b7 100644 --- a/components/omega/doc/devGuide/ParallelLoops.md +++ b/components/omega/doc/devGuide/ParallelLoops.md @@ -165,9 +165,9 @@ The second way uses a helper struct `Range` to provide a range of valid indices }); ``` Note that this range is inclusive, i.e. the loop index `K` takes values from `N1` up to and including `N2`. -This means that `Range{0, N}` specifies a diffrent range than the first example. -For simplicity, most examples in this document use the first way of specyfying the range, -but a `Range` argument can be passed to all inner iteration patters. +This means that `Range{0, N}` specifies a different range than the first example. +For simplicity, most examples in this document use the first way of specifying the range, +but a `Range` argument can be passed to all inner iteration patterns. ### parallelForOuter @@ -324,7 +324,7 @@ Labels are not supported by `parallelSearchInner` and only one-dimensional index ### Launch Config -While specyfing loop bounds is enough to start an outer parallel loop, sometimes more control over the underlaying +While specifying loop bounds is enough to start an outer parallel loop, sometimes more control over the underlying Kokkos `TeamPolicy` is desired. The most common use case is utilizing scratch memory, a concept discussed more thoroughly in the next sub-section. To enable more control, outer loops can be launched by providing a `LaunchConfig` struct as the first argument, which is composed of three parts: @@ -341,7 +341,7 @@ with team size of 32 and enough scratch memory for 8 `Real` values and 4 `I4` va }); ``` It is not necessary to provide all three arguments to `LaunchConfig`. If you want the default team size, -or you don't need any scratch memory, you can use the follwing constructors. +or you don't need any scratch memory, you can use the following constructors. ```c++ auto LConfig1 = LaunchConfig({N1, N2}, TeamScratch(8, 4)); auto LConfig2 = LaunchConfig({N1, N2}, 32); @@ -356,7 +356,7 @@ In hierarchical code, it is often useful to have some amount of scratch memory p Scratch memory enables reuse of expensive to compute data in inner loops. To enable scratch memory, the outer loops needs to be launched with the `LaunchConfig` parameter described above, configured with the requested number of scratch values. -Inside the outer loop, unmanaged scratch arrays can be created from a pool of memory accesible +Inside the outer loop, unmanaged scratch arrays can be created from a pool of memory accessible by calling the `teamScratch(Team)` function. Scratch arrays have a different type than normal Omega arrays, for example `ArrayScratch1DReal` is the type of a 1D scratch array of Reals. They also cannot have labels. diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h index 2ca20c49fd3f..9c5466574e19 100644 --- a/components/omega/src/infra/OmegaKokkos.h +++ b/components/omega/src/infra/OmegaKokkos.h @@ -11,6 +11,8 @@ #include "DataTypes.h" #include "Error.h" +#include +#include #include #include #include @@ -101,7 +103,7 @@ bool arraysEqual(const ArrayTypeA &A, const ArrayTypeB &B) { OMEGA_REQUIRE(A.span_is_contiguous() && B.span_is_contiguous(), "arraysEqual works only for contiguous arrays"); OMEGA_REQUIRE(A.size() == B.size(), - "arrayEqual can only compare arrays of equal size"); + "arraysEqual can only compare arrays of equal size"); // This is a debug utility and not performance critical // so just copy to the host and compare there diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h index c810d6f880d7..f100cfdb5025 100644 --- a/components/omega/src/infra/OmegaKokkosHiPar.h +++ b/components/omega/src/infra/OmegaKokkosHiPar.h @@ -181,7 +181,7 @@ inline void parallelReduceOuter(const LaunchConfig &LConfig, F &&Functor, // parallelReduceOuter: with label and with array bounds template -inline void parallelReduceOuter(const std::string Label, +inline void parallelReduceOuter(const std::string &Label, const int (&UpperBounds)[N], F &&Functor, R &&...Reducers) { parallelReduceOuter(Label, LaunchConfig(UpperBounds), diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index b5b08de34908..dbffd9617f07 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -390,7 +390,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) { Array2DReal OutD("OutD", N1, N2 - 3); -#ifdef OMEGA_DEVICE +#ifdef OMEGA_TARGET_DEVICE const int TeamSize = 32; #else const int TeamSize = 1; From 23b78fc7e5dfec8b06583227484e9d9a03c5002e Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 2 Apr 2026 17:10:59 -0600 Subject: [PATCH 24/25] Use Range in inner loops where possible Co-authored-by: mwarusz <5665111+mwarusz@users.noreply.github.com> --- components/omega/src/ocn/Tendencies.cpp | 33 +++++-------- components/omega/src/ocn/VertCoord.cpp | 49 +++++++------------ .../omega/src/timeStepping/TimeStepper.cpp | 48 +++++++----------- 3 files changed, 47 insertions(+), 83 deletions(-) diff --git a/components/omega/src/ocn/Tendencies.cpp b/components/omega/src/ocn/Tendencies.cpp index 3c3aacb8dde5..680ad0f381e8 100644 --- a/components/omega/src/ocn/Tendencies.cpp +++ b/components/omega/src/ocn/Tendencies.cpp @@ -379,15 +379,12 @@ void Tendencies::computeThicknessTendenciesOnly( parallelForOuter( {Mesh->NCellsAll}, KOKKOS_LAMBDA(int ICell, const TeamMember &Team) { - const int KMin = MinLayerCell(ICell); - const int KMax = MaxLayerCell(ICell); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerCell(ICell); + const int KMax = MaxLayerCell(ICell); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; - LocLayerThicknessTend(ICell, K) = 0; - }); + Team, Range{KMin, KMax}, + INNER_LAMBDA(int K) { LocLayerThicknessTend(ICell, K) = 0; }); }); // Compute thickness flux divergence @@ -454,15 +451,12 @@ void Tendencies::computeVelocityTendenciesOnly( parallelForOuter( {Mesh->NEdgesAll}, KOKKOS_LAMBDA(int IEdge, const TeamMember &Team) { - const int KMin = MinLayerEdgeBot(IEdge); - const int KMax = MaxLayerEdgeTop(IEdge); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerEdgeBot(IEdge); + const int KMax = MaxLayerEdgeTop(IEdge); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; - LocNormalVelocityTend(IEdge, K) = 0; - }); + Team, Range{KMin, KMax}, + INNER_LAMBDA(int K) { LocNormalVelocityTend(IEdge, K) = 0; }); }); // Compute potential vorticity horizontal advection @@ -663,14 +657,11 @@ void Tendencies::computeTracerTendenciesOnly( parallelForOuter( {NTracers, Mesh->NCellsAll}, KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) { - const int KMin = MinLayerCell(ICell); - const int KMax = MaxLayerCell(ICell); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerCell(ICell); + const int KMax = MaxLayerCell(ICell); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; - LocTracerTend(L, ICell, K) = 0; - }); + Team, Range{KMin, KMax}, + INNER_LAMBDA(int K) { LocTracerTend(L, ICell, K) = 0; }); }); // compute tracer horizotal advection diff --git a/components/omega/src/ocn/VertCoord.cpp b/components/omega/src/ocn/VertCoord.cpp index b481439202b8..b6a2adc30b49 100644 --- a/components/omega/src/ocn/VertCoord.cpp +++ b/components/omega/src/ocn/VertCoord.cpp @@ -768,11 +768,8 @@ void VertCoord::setMasks() { const I4 KMax = LocMaxLyrEdgeTop(IEdge); parallelForInner( - Team, KMax - KMin + 1, INNER_LAMBDA(int K) { - I4 KLyr = KMin + K; - - LocEdgeMask(IEdge, KLyr) = 1._Real; - }); + Team, Range{KMin, KMax}, + INNER_LAMBDA(int K) { LocEdgeMask(IEdge, K) = 1._Real; }); }); EdgeMaskH = createHostMirrorCopy(EdgeMask); @@ -791,11 +788,8 @@ void VertCoord::setMasks() { const I4 KMax = LocMaxLyrCell(ICell); parallelForInner( - Team, KMax - KMin + 1, INNER_LAMBDA(int K) { - I4 KLyr = KMin + K; - - LocCellMask(ICell, KLyr) = 1._Real; - }); + Team, Range{KMin, KMax}, + INNER_LAMBDA(int K) { LocCellMask(ICell, K) = 1._Real; }); }); CellMaskH = createHostMirrorCopy(CellMask); @@ -815,11 +809,8 @@ void VertCoord::setMasks() { const I4 KMax = LocMaxLyrVrtxBot(IVertex); parallelForInner( - Team, KMax - KMin + 1, INNER_LAMBDA(int K) { - I4 KLyr = KMin + K; - - LocVrtxMask(IVertex, KLyr) = 1._Real; - }); + Team, Range{KMin, KMax}, + INNER_LAMBDA(int K) { LocVrtxMask(IVertex, K) = 1._Real; }); }); VertexMaskH = createHostMirrorCopy(VertexMask); @@ -868,21 +859,18 @@ void VertCoord::computePressure( parallelForOuter( "computePressure", {NCellsAll}, KOKKOS_LAMBDA(int ICell, const TeamMember &Team) { - const I4 KMin = LocMinLayerCell(ICell); - const I4 KMax = LocMaxLayerCell(ICell); - const I4 KRange = vertRange(KMin, KMax); - + const I4 KMin = LocMinLayerCell(ICell); + const I4 KMax = LocMaxLayerCell(ICell); LocPressInterf(ICell, KMin) = SurfacePressure(ICell); parallelScanInner( - Team, KRange, INNER_LAMBDA(int K, Real &Accum, bool IsFinal) { - const I4 KLyr = K + KMin; - Real Increment = Gravity * RhoSw * LayerThickness(ICell, KLyr); + Team, Range{KMin, KMax}, INNER_LAMBDA(int K, Real &Accum, bool IsFinal) { + Real Increment = Gravity * RhoSw * LayerThickness(ICell, K); Accum += Increment; if (IsFinal) { - LocPressInterf(ICell, KLyr + 1) = + LocPressInterf(ICell, K + 1) = SurfacePressure(ICell) + Accum; - LocPressMid(ICell, KLyr) = + LocPressMid(ICell, K) = SurfacePressure(ICell) + Accum - 0.5 * Increment; } }); @@ -982,10 +970,8 @@ void VertCoord::computeTargetThickness() { parallelForOuter( "computeTargetThickness", {NCellsAll}, KOKKOS_LAMBDA(int ICell, const TeamMember &Team) { - const I4 KMin = LocMinLayerCell(ICell); - const I4 KMax = LocMaxLayerCell(ICell); - const I4 KRange = vertRange(KMin, KMax); - + const I4 KMin = LocMinLayerCell(ICell); + const I4 KMax = LocMaxLayerCell(ICell); Real Coeff = (LocPressInterf(ICell, KMax + 1) - LocPressInterf(ICell, KMin)) / (Gravity * RhoSw); @@ -993,11 +979,10 @@ void VertCoord::computeTargetThickness() { Real SumWh = 0; Real SumRefH = 0; parallelReduceInner( - Team, KRange, + Team, Range{KMin, KMax}, INNER_LAMBDA(const int K, Real &LocalWh, Real &LocalSum) { - const I4 KLyr = K + KMin; - const Real RefLayerThick = LocRefLayerThick(ICell, KLyr); - LocalWh += LocVertCoordMvmtWgts(KLyr) * RefLayerThick; + const Real RefLayerThick = LocRefLayerThick(ICell, K); + LocalWh += LocVertCoordMvmtWgts(K) * RefLayerThick; LocalSum += RefLayerThick; }, SumWh, SumRefH); diff --git a/components/omega/src/timeStepping/TimeStepper.cpp b/components/omega/src/timeStepping/TimeStepper.cpp index 7ee026c384b7..f52a8696547e 100644 --- a/components/omega/src/timeStepping/TimeStepper.cpp +++ b/components/omega/src/timeStepping/TimeStepper.cpp @@ -402,13 +402,11 @@ void TimeStepper::updateThicknessByTend(OceanState *State1, int TimeLevel1, parallelForOuter( "updateThickByTend", {Mesh->NCellsAll}, KOKKOS_LAMBDA(int ICell, const TeamMember &Team) { - const int KMin = MinLayerCell(ICell); - const int KMax = MaxLayerCell(ICell); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerCell(ICell); + const int KMax = MaxLayerCell(ICell); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; + Team, Range{KMin, KMax}, INNER_LAMBDA(int K) { LayerThick1(ICell, K) = LayerThick2(ICell, K) + CoeffSeconds * LayerThickTend(ICell, K); @@ -437,13 +435,11 @@ void TimeStepper::updateVelocityByTend(OceanState *State1, int TimeLevel1, parallelForOuter( "updateVelByTend", {Mesh->NEdgesAll}, KOKKOS_LAMBDA(int IEdge, const TeamMember &Team) { - const int KMin = MinLayerEdgeBot(IEdge); - const int KMax = MaxLayerEdgeTop(IEdge); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerEdgeBot(IEdge); + const int KMax = MaxLayerEdgeTop(IEdge); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; + Team, Range{KMin, KMax}, INNER_LAMBDA(int K) { NormalVel1(IEdge, K) = NormalVel2(IEdge, K) + CoeffSeconds * NormalVelTend(IEdge, K); }); @@ -484,12 +480,10 @@ void TimeStepper::updateTracersByTend(const Array3DReal &NextTracers, parallelForOuter( "updateTracersByTend", {NTracers, Mesh->NCellsAll}, KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) { - const int KMin = MinLayerCell(ICell); - const int KMax = MaxLayerCell(ICell); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerCell(ICell); + const int KMax = MaxLayerCell(ICell); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; + Team, Range{KMin, KMax}, INNER_LAMBDA(int K) { NextTracers(L, ICell, K) = (CurTracers(L, ICell, K) * LayerThick2(ICell, K) + CoeffSeconds * TracerTend(L, ICell, K)) / @@ -512,12 +506,10 @@ void TimeStepper::weightTracers(const Array3DReal &NextTracers, parallelForOuter( "weightTracers", {NTracers, Mesh->NCellsAll}, KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) { - const int KMin = MinLayerCell(ICell); - const int KMax = MaxLayerCell(ICell); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerCell(ICell); + const int KMax = MaxLayerCell(ICell); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; + Team, Range{KMin, KMax}, INNER_LAMBDA(int K) { NextTracers(L, ICell, K) = CurTracers(L, ICell, K) * CurThickness(ICell, K); }); @@ -541,12 +533,10 @@ void TimeStepper::accumulateTracersUpdate(const Array3DReal &AccumTracer, parallelForOuter( "accumulateTracersUpdate", {NTracers, Mesh->NCellsAll}, KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) { - const int KMin = MinLayerCell(ICell); - const int KMax = MaxLayerCell(ICell); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerCell(ICell); + const int KMax = MaxLayerCell(ICell); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; + Team, Range{KMin, KMax}, INNER_LAMBDA(int K) { AccumTracer(L, ICell, K) += CoeffSeconds * TracerTend(L, ICell, K); }); @@ -567,12 +557,10 @@ void TimeStepper::finalizeTracersUpdate(const Array3DReal &NextTracers, parallelForOuter( "finalizeTracersUpdate", {NTracers, Mesh->NCellsAll}, KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) { - const int KMin = MinLayerCell(ICell); - const int KMax = MaxLayerCell(ICell); - const int KRange = vertRange(KMin, KMax); + const int KMin = MinLayerCell(ICell); + const int KMax = MaxLayerCell(ICell); parallelForInner( - Team, KRange, INNER_LAMBDA(int KChunk) { - const int K = KMin + KChunk; + Team, Range{KMin, KMax}, INNER_LAMBDA(int K) { NextTracers(L, ICell, K) /= NextThick(ICell, K); }); }); From 62496e83c575355ebdf04a540f404b925df3b2f1 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 2 Apr 2026 17:09:00 -0600 Subject: [PATCH 25/25] Use Team, teamBarrier, and teamScratch everywhere Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- components/omega/src/base/TriDiagSolvers.h | 68 +++++++++---------- components/omega/src/ocn/VertAdv.cpp | 23 +++---- .../omega/test/base/TriDiagSolversTest.cpp | 42 ++++++------ 3 files changed, 66 insertions(+), 67 deletions(-) diff --git a/components/omega/src/base/TriDiagSolvers.h b/components/omega/src/base/TriDiagSolvers.h index 0b9a947667b6..095c53d47d2c 100644 --- a/components/omega/src/base/TriDiagSolvers.h +++ b/components/omega/src/base/TriDiagSolvers.h @@ -47,9 +47,9 @@ struct TriDiagScratch { TriDiagScratchArray X; // rhs on input, contains solution after calling solve // Constructor takes team member and system size - KOKKOS_FUNCTION TriDiagScratch(const TeamMember &Member, int NRow) - : DL(Member.team_scratch(0), NRow), D(Member.team_scratch(0), NRow), - DU(Member.team_scratch(0), NRow), X(Member.team_scratch(0), NRow) {} + KOKKOS_FUNCTION TriDiagScratch(const TeamMember &Team, int NRow) + : DL(teamScratch(Team), NRow), D(teamScratch(Team), NRow), + DU(teamScratch(Team), NRow), X(teamScratch(Team), NRow) {} }; // Thomas algorithm solver for general tridiagonal systems @@ -66,7 +66,7 @@ struct ThomasSolver { // Solve the system defined in the scratch data argument `Scratch` // This a team-level function that needs to be called inside a // parallel loop using TeamPolicy, hence it has a team member argument - static void KOKKOS_FUNCTION solve(const TeamMember &Member, + static void KOKKOS_FUNCTION solve(const TeamMember &Team, const TriDiagScratch &Scratch) { const int NRow = Scratch.X.extent_int(0); @@ -103,10 +103,10 @@ struct ThomasSolver { auto LConfig = makeLaunchConfig(NBatch, NRow); parallelForOuter( - LConfig, KOKKOS_LAMBDA(const int IChunk, const TeamMember &Member) { + LConfig, KOKKOS_LAMBDA(const int IChunk, const TeamMember &Team) { const int IStart = IChunk * VecLength; - TriDiagScratch Scratch(Member, NRow); + TriDiagScratch Scratch(Team, NRow); for (int K = 0; K < NRow; ++K) { for (int IVec = 0; IVec < VecLength; ++IVec) { @@ -120,7 +120,7 @@ struct ThomasSolver { } } - solve(Member, Scratch); + solve(Team, Scratch); for (int IVec = 0; IVec < VecLength; ++IVec) { for (int K = 0; K < NRow; ++K) { @@ -147,12 +147,12 @@ struct PCRSolver { // Solve the system defined in the scratch data argument `Scratch` // This a team-level function that needs to be called inside a // parallel loop using TeamPolicy, hence it has a team member argument - static void KOKKOS_FUNCTION solve(const TeamMember &Member, + static void KOKKOS_FUNCTION solve(const TeamMember &Team, const TriDiagScratch &Scratch) { const int NRow = Scratch.X.extent_int(0); // Row index = Thread index - const int K = Member.team_rank(); + const int K = Team.team_rank(); // Number of reduction levels const int NLevels = Kokkos::ceil(Kokkos::log2(NRow)); @@ -178,7 +178,7 @@ struct PCRSolver { const Real NewDL = alpha * Scratch.DL(Kmh, 0); const Real NewDU = gamma * Scratch.DU(Kph, 0); - Member.team_barrier(); + teamBarrier(Team); // Store new system coefficients Scratch.D(K, 0) = NewD; @@ -186,7 +186,7 @@ struct PCRSolver { Scratch.DL(K, 0) = NewDL; Scratch.DU(K, 0) = NewDU; - Member.team_barrier(); + teamBarrier(Team); } const int Stride = 1 << (NLevels - 1); @@ -220,21 +220,21 @@ struct PCRSolver { auto LConfig = makeLaunchConfig(NBatch, NRow); parallelForOuter( - LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Member) { - const int K = Member.team_rank(); + LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Team) { + const int K = Team.team_rank(); - TriDiagScratch Scratch(Member, NRow); + TriDiagScratch Scratch(Team, NRow); Scratch.DL(K, 0) = DL(I, K); Scratch.D(K, 0) = D(I, K); Scratch.DU(K, 0) = DU(I, K); Scratch.X(K, 0) = X(I, K); - Member.team_barrier(); + teamBarrier(Team); - solve(Member, Scratch); + solve(Team, Scratch); - Member.team_barrier(); + teamBarrier(Team); X(I, K) = Scratch.X(K, 0); }); @@ -250,9 +250,9 @@ struct TriDiagDiffScratch { TriDiagScratchArray X; // rhs on input, contains solution after calling solve TriDiagScratchArray Alpha; // internal workspace - KOKKOS_FUNCTION TriDiagDiffScratch(const TeamMember &Member, int NRow) - : G(Member.team_scratch(0), NRow), H(Member.team_scratch(0), NRow), - X(Member.team_scratch(0), NRow), Alpha(Member.team_scratch(0), NRow) {} + KOKKOS_FUNCTION TriDiagDiffScratch(const TeamMember &Team, int NRow) + : G(teamScratch(Team), NRow), H(teamScratch(Team), NRow), + X(teamScratch(Team), NRow), Alpha(teamScratch(Team), NRow) {} }; // Thomas algorithm solver for diffusion-type tridiagonal systems @@ -269,7 +269,7 @@ struct ThomasDiffusionSolver { // Solve the system defined in the scratch data argument `Scratch` // This a team-level function that needs to be called inside a // parallel loop using TeamPolicy, hence it has a team member argument - static void KOKKOS_FUNCTION solve(const TeamMember &Member, + static void KOKKOS_FUNCTION solve(const TeamMember &Team, const TriDiagDiffScratch &Scratch) { const int NRow = Scratch.X.extent_int(0); @@ -325,10 +325,10 @@ struct ThomasDiffusionSolver { auto LConfig = makeLaunchConfig(NBatch, NRow); parallelForOuter( - LConfig, KOKKOS_LAMBDA(int IChunk, const TeamMember &Member) { + LConfig, KOKKOS_LAMBDA(int IChunk, const TeamMember &Team) { const int IStart = IChunk * VecLength; - TriDiagDiffScratch Scratch(Member, NRow); + TriDiagDiffScratch Scratch(Team, NRow); for (int K = 0; K < NRow; ++K) { for (int IVec = 0; IVec < VecLength; ++IVec) { @@ -341,7 +341,7 @@ struct ThomasDiffusionSolver { } } - solve(Member, Scratch); + solve(Team, Scratch); for (int IVec = 0; IVec < VecLength; ++IVec) { for (int K = 0; K < NRow; ++K) { @@ -368,12 +368,12 @@ struct PCRDiffusionSolver { // Solve the system defined in the scratch data argument `Scratch` // This a team-level function that needs to be called inside a // parallel loop using TeamPolicy, hence it has a team member argument - static void KOKKOS_FUNCTION solve(const TeamMember &Member, + static void KOKKOS_FUNCTION solve(const TeamMember &Team, const TriDiagDiffScratch &Scratch) { const int NRow = Scratch.X.extent_int(0); // Row index = Thread index - const int K = Member.team_rank(); + const int K = Team.team_rank(); // Number of reduction levels const int NLevels = Kokkos::ceil(Kokkos::log2(NRow)); @@ -406,14 +406,14 @@ struct PCRDiffusionSolver { const Real NewH = Scratch.H(K, 0) + Alpha * Scratch.H(Kmh, 0) + Beta * Scratch.H(Kph, 0); - Member.team_barrier(); + teamBarrier(Team); // Store new system coefficients Scratch.H(K, 0) = NewH; Scratch.G(K, 0) = NewG; Scratch.X(K, 0) = NewX; - Member.team_barrier(); + teamBarrier(Team); } const int Stride = 1 << (NLevels - 1); @@ -456,20 +456,20 @@ struct PCRDiffusionSolver { auto LConfig = makeLaunchConfig(NBatch, NRow); parallelForOuter( - LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Member) { - const int K = Member.team_rank(); + LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Team) { + const int K = Team.team_rank(); - TriDiagDiffScratch Scratch(Member, NRow); + TriDiagDiffScratch Scratch(Team, NRow); Scratch.G(K, 0) = G(I, K); Scratch.H(K, 0) = H(I, K); Scratch.X(K, 0) = X(I, K); - Member.team_barrier(); + teamBarrier(Team); - solve(Member, Scratch); + solve(Team, Scratch); - Member.team_barrier(); + teamBarrier(Team); X(I, K) = Scratch.X(K, 0); }); diff --git a/components/omega/src/ocn/VertAdv.cpp b/components/omega/src/ocn/VertAdv.cpp index 081f5a38d6bc..e951cbf02428 100644 --- a/components/omega/src/ocn/VertAdv.cpp +++ b/components/omega/src/ocn/VertAdv.cpp @@ -380,7 +380,7 @@ void VertAdv::computeVerticalVelocity( "computeVerticalVelocity", LaunchConfig({NCellsOwned}, TeamScratch(NVertLayers)), KOKKOS_LAMBDA(int ICell, const TeamMember &Team) { - ArrayScratch1DReal DivHU(Team.team_scratch(0), LocNVertLayers); + ArrayScratch1DReal DivHU(teamScratch(Team), LocNVertLayers); const Real InvAreaCell = 1._Real / LocAreaCell(ICell); @@ -411,7 +411,7 @@ void VertAdv::computeVerticalVelocity( } }); - Team.team_barrier(); + teamBarrier(Team); // Set velocity through top and bottom interfaces to zero Kokkos::single( @@ -521,7 +521,7 @@ void VertAdv::computeVelocityVAdvTend( // Allocate scratch space for W times Du/Dz at vertical interfaces // between edges - ArrayScratch1DReal WDuDzEdge(Team.team_scratch(0), LocNVertLayersP1); + ArrayScratch1DReal WDuDzEdge(teamScratch(Team), LocNVertLayersP1); // Flux is zero at top and bottom Kokkos::single( @@ -551,7 +551,7 @@ void VertAdv::computeVelocityVAdvTend( } }); - Team.team_barrier(); + teamBarrier(Team); KRange = vertRangeChunked(KMin, KMax); // Average W*Du/Dz from interfaces to layer midpoints @@ -831,12 +831,11 @@ void VertAdv::computeFCTVAdvTend( const I4 KMax = MaxLayerCell(ICell); I4 KRange = vertRangeChunked(KMin, KMax); - ArrayScratch1DReal InvNewProvThick(Team.team_scratch(0), - LocNVertLayers); - ArrayScratch1DReal WorkTend(Team.team_scratch(0), LocNVertLayers); - ArrayScratch1DReal FlxIn(Team.team_scratch(0), LocNVertLayers); - ArrayScratch1DReal FlxOut(Team.team_scratch(0), LocNVertLayers); - ArrayScratch1DReal RescaledFlux(Team.team_scratch(0), + ArrayScratch1DReal InvNewProvThick(teamScratch(Team), LocNVertLayers); + ArrayScratch1DReal WorkTend(teamScratch(Team), LocNVertLayers); + ArrayScratch1DReal FlxIn(teamScratch(Team), LocNVertLayers); + ArrayScratch1DReal FlxOut(teamScratch(Team), LocNVertLayers); + ArrayScratch1DReal RescaledFlux(teamScratch(Team), LocNVertLayers + 1); parallelForInner( @@ -914,7 +913,7 @@ void VertAdv::computeFCTVAdvTend( } }); - Team.team_barrier(); + teamBarrier(Team); KRange = vertRangeChunked(KMin + 1, KMax); @@ -937,7 +936,7 @@ void VertAdv::computeFCTVAdvTend( } }); - Team.team_barrier(); + teamBarrier(Team); // Accumulate total FCT vertical advection tendency KRange = vertRangeChunked(KMin, KMax); diff --git a/components/omega/test/base/TriDiagSolversTest.cpp b/components/omega/test/base/TriDiagSolversTest.cpp index 99236b40fe93..97aac1107e20 100644 --- a/components/omega/test/base/TriDiagSolversTest.cpp +++ b/components/omega/test/base/TriDiagSolversTest.cpp @@ -173,11 +173,11 @@ Real runDiffManufactured(int NCells) { const Real TimeNext = (Step + 1) * TimeStep; parallelForOuter( - LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) { - TriDiagDiffScratch Scratch(Member, NCells); + LConfig, KOKKOS_LAMBDA(int, const TeamMember &Team) { + TriDiagDiffScratch Scratch(Team, NCells); // Setup the system to be solved - parallelForInner(Member, NCells, [=](int ICell) { + parallelForInner(Team, NCells, [=](int ICell) { for (int IVec = 0; IVec < VecLength; ++IVec) { // Forcing term from the manufactured solution @@ -205,12 +205,12 @@ Real runDiffManufactured(int NCells) { }); // Solve the system - Member.team_barrier(); - TriDiagDiffSolver::solve(Member, Scratch); - Member.team_barrier(); + teamBarrier(Team); + TriDiagDiffSolver::solve(Team, Scratch); + teamBarrier(Team); // Store the solution - parallelForInner(Member, NCells, [=](int ICell) { + parallelForInner(Team, NCells, [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); }); }); @@ -319,12 +319,12 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) { auto LConfig = TriDiagSolver::makeLaunchConfig(1, NCells); parallelForOuter( - LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) { - TriDiagScratch Scratch(Member, NCells); + LConfig, KOKKOS_LAMBDA(int, const TeamMember &Team) { + TriDiagScratch Scratch(Team, NCells); // Setup the system to be solved in the form expected by the // general tridiagonal solver - parallelForInner(Member, NCells, [=](int ICell) { + parallelForInner(Team, NCells, [=](int ICell) { for (int IVec = 0; IVec < VecLength; ++IVec) { if (ICell < NCells - 1) { @@ -354,12 +354,12 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) { }); // Solve the system - Member.team_barrier(); - TriDiagSolver::solve(Member, Scratch); - Member.team_barrier(); + teamBarrier(Team); + TriDiagSolver::solve(Team, Scratch); + teamBarrier(Team); // Save the solution - parallelForInner(Member, NCells, [=](int ICell) { + parallelForInner(Team, NCells, [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); }); }); @@ -367,12 +367,12 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) { auto LConfig = TriDiagDiffSolver::makeLaunchConfig(1, NCells); parallelForOuter( - LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) { - TriDiagDiffScratch Scratch(Member, NCells); + LConfig, KOKKOS_LAMBDA(int, const TeamMember &Team) { + TriDiagDiffScratch Scratch(Team, NCells); // Setup the system to be solved in the form expected by the // specialized diffusion tridiagonal solver - parallelForInner(Member, NCells, [=](int ICell) { + parallelForInner(Team, NCells, [=](int ICell) { for (int IVec = 0; IVec < VecLength; ++IVec) { Scratch.H(ICell, IVec) = LayerThick(ICell); @@ -391,12 +391,12 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) { }); // Solve the system - Member.team_barrier(); - TriDiagDiffSolver::solve(Member, Scratch); - Member.team_barrier(); + teamBarrier(Team); + TriDiagDiffSolver::solve(Team, Scratch); + teamBarrier(Team); // Store the solution - parallelForInner(Member, NCells, [=](int ICell) { + parallelForInner(Team, NCells, [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); }); });