From 12048dbe497ca8f04b240118135b6b6a78623b52 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Tue, 3 Mar 2026 16:32:36 -0700
Subject: [PATCH 01/25] Allow specifying inner loop ranges as min/max indices

---
 components/omega/src/infra/OmegaKokkos.h | 41 +++++++++++++++++++-----
 1 file changed, 33 insertions(+), 8 deletions(-)
diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h
index a7ab8bca89da..7e582da1ff41 100644
--- a/components/omega/src/infra/OmegaKokkos.h
+++ b/components/omega/src/infra/OmegaKokkos.h
@@ -355,11 +355,18 @@ inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor,
 }
 
 // parallelForInner
+
+template <class F>
+KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int MinIndex,
+                                      int MaxIndex, F &&Functor) {
+   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
+   Kokkos::parallel_for(Policy, std::forward<F>(Functor));
+}
+
 template <class F>
 KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound,
                                       F &&Functor) {
-   const auto Policy = TeamThreadRange(Team, UpperBound);
-   Kokkos::parallel_for(Policy, std::forward<F>(Functor));
+   parallelForInner(Team, 0, UpperBound - 1, std::forward<F>(Functor));
 }
 
 // This struct is used to get the right accumulator type to be used in
@@ -413,23 +420,41 @@ inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor,
 }
 
 // parallelReduceInner
+
 template <class F, class... R>
-KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound,
-                                         F &&Functor, R &&...Reducers) {
-   const auto Policy = TeamThreadRange(Team, UpperBound);
+KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex,
+                                         int MaxIndex, F &&Functor,
+                                         R &&...Reducers) {
+   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
    Kokkos::parallel_reduce(Policy, std::forward<F>(Functor),
                            std::forward<R>(Reducers)...);
 }
 
+template <class F, class... R>
+KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound,
+                                         F &&Functor, R &&...Reducers) {
+   parallelReduceInner(Team, 0, UpperBound - 1, std::forward<F>(Functor),
+                       std::forward<R>(Reducers)...);
+}
+
 // parallelScanInner
+
 template <class F, class... R>
-KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound,
-                                       F &&Functor, R &&...Reducers) {
-   const auto Policy = TeamThreadRange(Team, UpperBound);
+KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex,
+                                       int MaxIndex, F &&Functor,
+                                       R &&...Reducers) {
+   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
    Kokkos::parallel_scan(Policy, std::forward<F>(Functor),
                          std::forward<R>(Reducers)...);
 }
 
+template <class F, class... R>
+KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound,
+                                       F &&Functor, R &&...Reducers) {
+   parallelScanInner(Team, 0, UpperBound - 1, std::forward<F>(Functor),
+                     std::forward<R>(Reducers)...);
+}
+
 } // end namespace OMEGA
 
 //===----------------------------------------------------------------------===//

From 92bae1fe4b9dadc652f9bc66efa5888f0341e9f9 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Tue, 3 Mar 2026 16:33:18 -0700
Subject: [PATCH 02/25] Test new inner loop forms

---
 .../omega/test/infra/OmegaKokkosHiParTest.cpp | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index c7793bd13cc7..bdb605c70f36 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -355,7 +355,7 @@ Error testHiparFor2DFor1D(int N1, int N2) {
    HostArray3DI4 RefAH("RefA3H", N1, N2, N3);
    for (int J1 = 0; J1 < N1; ++J1) {
       for (int J2 = 0; J2 < N2; ++J2) {
-         for (int J3 = 0; J3 < J1 + J2; ++J3) {
+         for (int J3 = J1; J3 <= J1 + J2; ++J3) {
             RefAH(J1, J2, J3) = f3(J1, J2, J3, N1, N2, N3);
          }
       }
@@ -365,7 +365,7 @@ Error testHiparFor2DFor1D(int N1, int N2) {
    parallelForOuter(
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
           parallelForInner(
-              Team, J1 + J2, INNER_LAMBDA(int J3) {
+              Team, J1, J1 + J2, INNER_LAMBDA(int J3) {
                  A(J1, J2, J3) = f3(J1, J2, J3, N1, N2, N3);
               });
        });
@@ -389,7 +389,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) {
       for (int J2 = 0; J2 < N2; ++J2) {
          I4 Sum = 0;
          I4 Max = std::numeric_limits<I4>::min();
-         for (int J3 = 0; J3 < J1 + J2; ++J3) {
+         for (int J3 = J1; J3 <= J1 + J2; ++J3) {
             Sum += f3(J1, J2, J3, N1, N2, N3);
             Max = std::max(Max, f3(J1, J2, J3, N1, N2, N3));
          }
@@ -404,7 +404,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) {
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
           I4 Sum;
           parallelReduceInner(
-              Team, J1 + J2,
+              Team, J1, J1 + J2,
               INNER_LAMBDA(int J3, I4 &Accum) {
                  Accum += f3(J1, J2, J3, N1, N2, N3);
               },
@@ -413,7 +413,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) {
 
           I4 Max;
           parallelReduceInner(
-              Team, J1 + J2,
+              Team, J1, J1 + J2,
               INNER_LAMBDA(int J3, I4 &Accum) {
                  Accum = Kokkos::max(Accum, f3(J1, J2, J3, N1, N2, N3));
               },
@@ -437,7 +437,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) {
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
           I4 Sum, Max;
           parallelReduceInner(
-              Team, J1 + J2,
+              Team, J1, J1 + J2,
               INNER_LAMBDA(int J3, I4 &AccumSum, I4 &AccumMax) {
                  AccumSum += f3(J1, J2, J3, N1, N2, N3);
                  AccumMax = Kokkos::max(AccumMax, f3(J1, J2, J3, N1, N2, N3));
@@ -464,7 +464,7 @@ Error testHiparFor2DScan1D(int N1, int N2) {
    for (int J1 = 0; J1 < N1; ++J1) {
       for (int J2 = 0; J2 < N2; ++J2) {
          I4 RSum = 0;
-         for (int J3 = 0; J3 < J1 + J2; ++J3) {
+         for (int J3 = J1; J3 <= J1 + J2; ++J3) {
             RefRSumH(J1, J2, J3) = RSum;
             RSum += f3(J1, J2, J3, N1, N2, N3);
          }
@@ -475,7 +475,7 @@ Error testHiparFor2DScan1D(int N1, int N2) {
    parallelForOuter(
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
           parallelScanInner(
-              Team, J1 + J2, INNER_LAMBDA(int J3, I4 &Accum, bool IsFinal) {
+              Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &Accum, bool IsFinal) {
                  if (IsFinal) {
                     RSum(J1, J2, J3) = Accum;
                  }
@@ -500,7 +500,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) {
    I4 RefMax = std::numeric_limits<I4>::min();
    for (int J1 = 0; J1 < N1; ++J1) {
       for (int J2 = 0; J2 < N2; ++J2) {
-         for (int J3 = 0; J3 < J1 + J2; ++J3) {
+         for (int J3 = J1; J3 <= J1 + J2; ++J3) {
             RefSum += f3(J1, J2, J3, N1, N2, N3);
             RefMax = std::max(RefMax, f3(J1, J2, J3, N1, N2, N3));
          }
@@ -513,7 +513,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) {
        KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team, I4 &AccumOuter) {
           I4 SumInner;
           parallelReduceInner(
-              Team, J1 + J2,
+              Team, J1, J1 + J2,
               INNER_LAMBDA(int J3, I4 &AccumInner) {
                  AccumInner += f3(J1, J2, J3, N1, N2, N3);
               },
@@ -534,7 +534,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) {
        KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team, I4 &AccumOuter) {
           I4 MaxInner;
           parallelReduceInner(
-              Team, J1 + J2,
+              Team, J1, J1 + J2,
               INNER_LAMBDA(int J3, I4 &AccumInner) {
                  AccumInner =
                      Kokkos::max(AccumInner, f3(J1, J2, J3, N1, N2, N3));
@@ -556,7 +556,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) {
                      I4 &AccumMaxOuter) {
           I4 SumInner, MaxInner;
           parallelReduceInner(
-              Team, J1 + J2,
+              Team, J1, J1 + J2,
               INNER_LAMBDA(int J3, I4 &AccumSumInner, I4 &AccumMaxInner) {
                  AccumSumInner += f3(J1, J2, J3, N1, N2, N3);
                  AccumMaxInner =

From cdb41e5a9514e20dc9f114c39cc85cac747d3df0 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Fri, 13 Mar 2026 15:16:12 -0600
Subject: [PATCH 03/25] Split OmegaKokkos.h

---
 components/omega/src/infra/OmegaKokkos.h      | 358 +++---------------
 .../omega/src/infra/OmegaKokkosFlatPar.h      | 115 ++++++
 components/omega/src/infra/OmegaKokkosHiPar.h | 176 +++++++++
 3 files changed, 345 insertions(+), 304 deletions(-)
 create mode 100644 components/omega/src/infra/OmegaKokkosFlatPar.h
 create mode 100644 components/omega/src/infra/OmegaKokkosHiPar.h

diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h
index 7e582da1ff41..c8aacd9d91ed 100644
--- a/components/omega/src/infra/OmegaKokkos.h
+++ b/components/omega/src/infra/OmegaKokkos.h
@@ -1,13 +1,13 @@
 #ifndef OMEGA_KOKKOS_H
 #define OMEGA_KOKKOS_H
-//===-- base/OmegaKokkos.h - Omega extension of Kokkos ------*- C++ -*-===//
+//===-- infra/OmegaKokkos.h - Omega extension of Kokkos ------*- C++ -*-===//
 //
 /// \file
 /// \brief Extends Kokkos for Omega
 ///
 /// This header extends Kokkos for Omega.
 //
-//===----------------------------------------------------------------------===//
+//===-------------------------------------------------------------------===//
 
 #include "DataTypes.h"
 #include "Error.h"
@@ -19,6 +19,9 @@ namespace OMEGA {
 
 #define OMEGA_SCOPE(a, b) auto &a = b
 
+using ExecSpace     = MemSpace::execution_space;
+using HostExecSpace = HostMemSpace::execution_space;
+
 /// An enum is used to provide a shorthand for determining the type of
 /// field. These correspond to the supported Omega data types (Real will be
 /// identical to R4 or R8 depending on settings)
@@ -70,23 +73,50 @@ template <class T> struct ArrayRank {
    static constexpr bool Is5D = T::rank == 5;
 };
 
-using ExecSpace       = MemSpace::execution_space;
-using HostExecSpace   = HostMemSpace::execution_space;
-using TeamPolicy      = Kokkos::TeamPolicy<ExecSpace>;
-using TeamMember      = TeamPolicy::member_type;
-using ScratchMemSpace = ExecSpace::scratch_memory_space;
-using Kokkos::MemoryUnmanaged;
-using Kokkos::PerTeam;
-using Kokkos::TeamThreadRange;
-using RealScratchArray =
-    Kokkos::View<Real *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
-
-/// team_size for hierarchical parallelism
-#ifdef OMEGA_TARGET_DEVICE
-constexpr int OMEGA_TEAMSIZE = 64;
-#else
-constexpr int OMEGA_TEAMSIZE = 1;
-#endif
+template <typename V>
+auto createHostMirrorCopy(const V &View)
+    -> Kokkos::View<typename V::data_type, HostMemLayout, HostMemSpace> {
+   return Kokkos::create_mirror_view_and_copy(HostExecSpace(), View);
+}
+
+template <typename V>
+auto createDeviceMirrorCopy(const V &View)
+    -> Kokkos::View<typename V::data_type, MemLayout, MemSpace> {
+   return Kokkos::create_mirror_view_and_copy(ExecSpace(), View);
+}
+
+// function alias to follow Camel Naming Convention
+template <typename D, typename S> void deepCopy(D &&Dst, S &&Src) {
+   Kokkos::deep_copy(std::forward<D>(Dst), std::forward<S>(Src));
+}
+
+template <typename E, typename D, typename S>
+void deepCopy(E &Space, D &Dst, const S &Src) {
+   Kokkos::deep_copy(Space, Dst, Src);
+}
+
+// Check if two arrays are identical: both are copied to the host and their
+// underlying data is compared element by element. OMEGA_REQUIRE checks
+// that both arrays are contiguous and have the same number of elements.
+template <class ArrayTypeA, class ArrayTypeB>
+bool arraysEqual(const ArrayTypeA &A, const ArrayTypeB &B) {
+   OMEGA_REQUIRE(A.span_is_contiguous() && B.span_is_contiguous(),
+                 "arraysEqual works only for contiguous arrays");
+   OMEGA_REQUIRE(A.size() == B.size(),
+                 "arraysEqual can only compare arrays of equal size");
+
+   // This is a debug utility and not performance critical
+   // so just copy to the host and compare there
+   const auto AH = createHostMirrorCopy(A);
+   const auto BH = createHostMirrorCopy(B);
+
+   for (size_t I = 0; I < AH.size(); I++) {
+      if (AH.data()[I] != BH.data()[I]) {
+         return false;
+      }
+   }
+   return true;
+}
 
 // Takes a functor that uses multidimensional indexing
 // and converts it into one that also accepts linear index
@@ -169,293 +199,13 @@ template <class F, int Rank> struct LinearIdxWrapper : F {
 #endif
 };
 
-template <typename V>
-auto createHostMirrorCopy(const V &View)
-    -> Kokkos::View<typename V::data_type, HostMemLayout, HostMemSpace> {
-   return Kokkos::create_mirror_view_and_copy(HostExecSpace(), View);
-}
-
-template <typename V>
-auto createDeviceMirrorCopy(const V &View)
-    -> Kokkos::View<typename V::data_type, MemLayout, MemSpace> {
-   return Kokkos::create_mirror_view_and_copy(ExecSpace(), View);
-}
-
-// function alias to follow Camel Naming Convention
-template <typename D, typename S> void deepCopy(D &&Dst, S &&Src) {
-   Kokkos::deep_copy(std::forward<D>(Dst), std::forward<S>(Src));
-}
-
-template <typename E, typename D, typename S>
-void deepCopy(E &Space, D &Dst, const S &Src) {
-   Kokkos::deep_copy(Space, Dst, Src);
-}
-
-// Check if two arrays are identical
-template <class ArrayTypeA, class ArrayTypeB>
-bool arraysEqual(const ArrayTypeA &A, const ArrayTypeB &B) {
-   OMEGA_REQUIRE(A.span_is_contiguous() && B.span_is_contiguous(),
-                 "arraysEqual works only for contiguous arrays");
-   OMEGA_REQUIRE(A.size() == B.size(),
-                 "arrayEqual can only compare arrays of equal size");
-
-   // This is a debug utility and not performance critical
-   // so just copy to the host and compare there
-   const auto AH = createHostMirrorCopy(A);
-   const auto BH = createHostMirrorCopy(B);
-
-   bool Equal = true;
-   for (size_t I = 0; I < AH.size(); I++) {
-      if (AH.data()[I] != BH.data()[I]) {
-         Equal = false;
-         break;
-      }
-   }
-   return Equal;
-}
-
-using Bounds1D = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int>>;
-
-#if OMEGA_LAYOUT_RIGHT
-
-template <int N>
-using Bounds = Kokkos::MDRangePolicy<
-    ExecSpace, Kokkos::Rank<N, Kokkos::Iterate::Right, Kokkos::Iterate::Right>,
-    Kokkos::IndexType<int>>;
-
-#elif OMEGA_LAYOUT_LEFT
-
-template <int N>
-using Bounds = Kokkos::MDRangePolicy<
-    ExecSpace, Kokkos::Rank<N, Kokkos::Iterate::Left, Kokkos::Iterate::Left>,
-    Kokkos::IndexType<int>>;
-
-#else
-
-#error "OMEGA Memory Layout is not defined."
-
-#endif
-
-// parallelFor: with label
-template <int N, class F>
-inline void parallelFor(const std::string &Label, const int (&UpperBounds)[N],
-                        F &&Functor) {
-   if constexpr (N == 1) {
-      const auto Policy = Bounds1D(0, UpperBounds[0]);
-      Kokkos::parallel_for(Label, Policy, std::forward<F>(Functor));
-
-   } else {
-#ifdef OMEGA_TARGET_DEVICE
-      // On device convert the functor to use one dimensional indexing and use
-      // 1D RangePolicy
-      auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
-      int LinBound    = 1;
-      for (int Rank = 0; Rank < N; ++Rank) {
-         LinBound *= UpperBounds[Rank];
-      }
-      const auto Policy = Bounds1D(0, LinBound);
-      Kokkos::parallel_for(Label, Policy, std::move(LinFunctor));
-#else
-      // On host use MDRangePolicy
-      const int LowerBounds[N] = {0};
-      const auto Policy        = Bounds<N>(LowerBounds, UpperBounds);
-      Kokkos::parallel_for(Label, Policy, std::forward<F>(Functor));
-#endif
-   }
-}
-
-// parallelFor: without label
-template <int N, class F>
-inline void parallelFor(const int (&UpperBounds)[N], F &&Functor) {
-   parallelFor("", UpperBounds, std::forward<F>(Functor));
-}
-
-// parallelReduce: with label
-template <int N, class F, class... R>
-inline void parallelReduce(const std::string &Label,
-                           const int (&UpperBounds)[N], F &&Functor,
-                           R &&...Reducers) {
-   if constexpr (N == 1) {
-      const auto Policy = Bounds1D(0, UpperBounds[0]);
-      Kokkos::parallel_reduce(Label, Policy, std::forward<F>(Functor),
-                              std::forward<R>(Reducers)...);
-
-   } else {
-
-#ifdef OMEGA_TARGET_DEVICE
-      // On device convert the functor to use one dimensional indexing and use
-      // 1D RangePolicy
-      auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
-      int LinBound    = 1;
-      for (int Rank = 0; Rank < N; ++Rank) {
-         LinBound *= UpperBounds[Rank];
-      }
-      const auto Policy = Bounds1D(0, LinBound);
-      Kokkos::parallel_reduce(Label, Policy, std::move(LinFunctor),
-                              std::forward<R>(Reducers)...);
-#else
-      // On host use MDRangePolicy
-      const int LowerBounds[N] = {0};
-      const auto Policy        = Bounds<N>(LowerBounds, UpperBounds);
-      Kokkos::parallel_reduce(Label, Policy, std::forward<F>(Functor),
-                              std::forward<R>(Reducers)...);
-#endif
-   }
-}
-
-// parallelReduce: without label
-template <int N, class F, class... R>
-inline void parallelReduce(const int (&UpperBounds)[N], F &&Functor,
-                           R &&...Reducers) {
-   parallelReduce("", UpperBounds, std::forward<F>(Functor),
-                  std::forward<R>(Reducers)...);
-}
-
-/// Hierarchical parallelism wrappers
-
-#define INNER_LAMBDA [=]
-// #define INNER_LAMBDA [&]
-
-KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) {
-   Team.team_barrier();
-}
-
-// parallelForOuter: with label
-template <int N, class F>
-inline void parallelForOuter(const std::string &Label,
-                             const int (&UpperBounds)[N], F &&Functor,
-                             int ScratchValsPerTeam = 0) {
-
-   auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
-   int LinBound    = 1;
-   for (int Rank = 0; Rank < N; ++Rank) {
-      LinBound *= UpperBounds[Rank];
-   }
-
-   auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE);
-
-   if (ScratchValsPerTeam > 0) {
-      Policy.set_scratch_size(
-          0, Kokkos::PerTeam(ScratchValsPerTeam * sizeof(Real)));
-   }
-
-   Kokkos::parallel_for(
-       Label, Policy, KOKKOS_LAMBDA(const TeamMember &Team) {
-          const int TeamId = Team.league_rank();
-          LinFunctor(TeamId, Team);
-       });
-}
-
-// parallelForOuter: without label
-template <int N, class F>
-inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor,
-                             int ScratchValsPerTeam = 0) {
-   parallelForOuter("", UpperBounds, std::forward<F>(Functor),
-                    ScratchValsPerTeam);
-}
-
-// parallelForInner
-
-template <class F>
-KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int MinIndex,
-                                      int MaxIndex, F &&Functor) {
-   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
-   Kokkos::parallel_for(Policy, std::forward<F>(Functor));
-}
-
-template <class F>
-KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound,
-                                      F &&Functor) {
-   parallelForInner(Team, 0, UpperBound - 1, std::forward<F>(Functor));
-}
-
-// This struct is used to get the right accumulator type to be used in
-// the outer parallel lambda based on the final reduction variable type.
-// The final reduction variable can be either a reference to
-// an arithmetic type (int&, Real&) or a Kokkos reducer (Kokkos::Max<Real>).
-// We need to know this type because nvcc does not allow generic lambdas.
-template <class T, class Enable = void> struct AccumTypeHelper;
-
-template <class T>
-struct AccumTypeHelper<T, std::enable_if_t<std::is_arithmetic_v<T>>> {
-   using Type = T;
-};
-
-template <class T>
-struct AccumTypeHelper<T, std::enable_if_t<Kokkos::is_reducer_v<T>>> {
-   using Type = typename T::value_type;
-};
-
-template <class T> using AccumType = typename AccumTypeHelper<T>::Type;
-
-// parallelReduceOuter: with label
-template <int N, class F, class... R>
-inline void parallelReduceOuter(const std::string &Label,
-                                const int (&UpperBounds)[N], F &&Functor,
-                                R &&...Reducers) {
-
-   auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
-   int LinBound    = 1;
-   for (int Rank = 0; Rank < N; ++Rank) {
-      LinBound *= UpperBounds[Rank];
-   }
-
-   auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE);
-   Kokkos::parallel_reduce(
-       Label, Policy,
-       KOKKOS_LAMBDA(const TeamMember &Team,
-                     AccumType<std::remove_reference_t<R>> &...Accums) {
-          const int TeamId = Team.league_rank();
-          LinFunctor(TeamId, Team, Accums...);
-       },
-       std::forward<R>(Reducers)...);
-}
-
-// parallelReduceOuter: without label
-template <int N, class F, class... R>
-inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor,
-                                R &&...Reducers) {
-   parallelReduceOuter("", UpperBounds, std::forward<F>(Functor),
-                       std::forward<R>(Reducers)...);
-}
-
-// parallelReduceInner
-
-template <class F, class... R>
-KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex,
-                                         int MaxIndex, F &&Functor,
-                                         R &&...Reducers) {
-   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
-   Kokkos::parallel_reduce(Policy, std::forward<F>(Functor),
-                           std::forward<R>(Reducers)...);
-}
-
-template <class F, class... R>
-KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound,
-                                         F &&Functor, R &&...Reducers) {
-   parallelReduceInner(Team, 0, UpperBound - 1, std::forward<F>(Functor),
-                       std::forward<R>(Reducers)...);
-}
-
-// parallelScanInner
+} // end namespace OMEGA
 
-template <class F, class... R>
-KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex,
-                                       int MaxIndex, F &&Functor,
-                                       R &&...Reducers) {
-   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
-   Kokkos::parallel_scan(Policy, std::forward<F>(Functor),
-                         std::forward<R>(Reducers)...);
-}
+// Flat parallelism wrappers
+#include "OmegaKokkosFlatPar.h"
 
-template <class F, class... R>
-KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound,
-                                       F &&Functor, R &&...Reducers) {
-   parallelScanInner(Team, 0, UpperBound - 1, std::forward<F>(Functor),
-                     std::forward<R>(Reducers)...);
-}
-
-} // end namespace OMEGA
+// Hierarchical parallelism wrappers
+#include "OmegaKokkosHiPar.h"
 
 //===----------------------------------------------------------------------===//
 #endif
diff --git a/components/omega/src/infra/OmegaKokkosFlatPar.h b/components/omega/src/infra/OmegaKokkosFlatPar.h
new file mode 100644
index 000000000000..4077cbda25b3
--- /dev/null
+++ b/components/omega/src/infra/OmegaKokkosFlatPar.h
@@ -0,0 +1,115 @@
+#ifndef OMEGA_KOKKOS_FLATPAR_H
+#define OMEGA_KOKKOS_FLATPAR_H
+//===-- infra/OmegaKokkosFlatPar.h - Flat parallelism wrappers --*- C++ -*-===//
+//
+/// \file
+/// \brief Omega flat parallelism wrappers
+///
+/// INTERNAL HEADER NOT MEANT TO BE INCLUDED DIRECTLY.
+/// Include OmegaKokkos.h instead; it includes this header.
+//
+//===----------------------------------------------------------------------===//
+
+namespace OMEGA {
+
+using Bounds1D = Kokkos::RangePolicy<ExecSpace, Kokkos::IndexType<int>>;
+
+#if OMEGA_LAYOUT_RIGHT
+
+template <int N>
+using Bounds = Kokkos::MDRangePolicy<
+    ExecSpace, Kokkos::Rank<N, Kokkos::Iterate::Right, Kokkos::Iterate::Right>,
+    Kokkos::IndexType<int>>;
+
+#elif OMEGA_LAYOUT_LEFT
+
+template <int N>
+using Bounds = Kokkos::MDRangePolicy<
+    ExecSpace, Kokkos::Rank<N, Kokkos::Iterate::Left, Kokkos::Iterate::Left>,
+    Kokkos::IndexType<int>>;
+
+#else
+
+#error "OMEGA Memory Layout is not defined."
+
+#endif
+
+// parallelFor: with label
+template <int N, class F>
+inline void parallelFor(const std::string &Label, const int (&UpperBounds)[N],
+                        F &&Functor) {
+   if constexpr (N == 1) {
+      const auto Policy = Bounds1D(0, UpperBounds[0]);
+      Kokkos::parallel_for(Label, Policy, std::forward<F>(Functor));
+
+   } else {
+#ifdef OMEGA_TARGET_DEVICE
+      // On device convert the functor to use one dimensional indexing and use
+      // 1D RangePolicy
+      auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
+      int LinBound    = 1;
+      for (int Rank = 0; Rank < N; ++Rank) {
+         LinBound *= UpperBounds[Rank];
+      }
+      const auto Policy = Bounds1D(0, LinBound);
+      Kokkos::parallel_for(Label, Policy, std::move(LinFunctor));
+#else
+      // On host use MDRangePolicy
+      const int LowerBounds[N] = {0};
+      const auto Policy        = Bounds<N>(LowerBounds, UpperBounds);
+      Kokkos::parallel_for(Label, Policy, std::forward<F>(Functor));
+#endif
+   }
+}
+
+// parallelFor: without label
+template <int N, class F>
+inline void parallelFor(const int (&UpperBounds)[N], F &&Functor) {
+   parallelFor("", UpperBounds, std::forward<F>(Functor));
+}
+
+// parallelReduce: with label
+template <int N, class F, class... R>
+inline void parallelReduce(const std::string &Label,
+                           const int (&UpperBounds)[N], F &&Functor,
+                           R &&...Reducers) {
+   if constexpr (N == 1) {
+      const auto Policy = Bounds1D(0, UpperBounds[0]);
+      Kokkos::parallel_reduce(Label, Policy, std::forward<F>(Functor),
+                              std::forward<R>(Reducers)...);
+
+   } else {
+
+#ifdef OMEGA_TARGET_DEVICE
+      // On device convert the functor to use one dimensional indexing and use
+      // 1D RangePolicy
+      auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
+      int LinBound    = 1;
+      for (int Rank = 0; Rank < N; ++Rank) {
+         LinBound *= UpperBounds[Rank];
+      }
+      const auto Policy = Bounds1D(0, LinBound);
+      Kokkos::parallel_reduce(Label, Policy, std::move(LinFunctor),
+                              std::forward<R>(Reducers)...);
+#else
+      // On host use MDRangePolicy
+      const int LowerBounds[N] = {0};
+      const auto Policy        = Bounds<N>(LowerBounds, UpperBounds);
+      Kokkos::parallel_reduce(Label, Policy, std::forward<F>(Functor),
+                              std::forward<R>(Reducers)...);
+#endif
+   }
+}
+
+// parallelReduce: without label
+template <int N, class F, class... R>
+inline void parallelReduce(const int (&UpperBounds)[N], F &&Functor,
+                           R &&...Reducers) {
+   parallelReduce("", UpperBounds, std::forward<F>(Functor),
+                  std::forward<R>(Reducers)...);
+}
+
+} // end namespace OMEGA
+
+//===----------------------------------------------------------------------===//
+#endif
diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
new file mode 100644
index 000000000000..a71e0926990d
--- /dev/null
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -0,0 +1,176 @@
+#ifndef OMEGA_KOKKOS_HIPAR_H
+#define OMEGA_KOKKOS_HIPAR_H
+//===-- infra/OmegaKokkosHiPar.h - Hierarchical parallelism -----*- C++ -*-===//
+//
+/// \file
+/// \brief Omega hierarchical parallelism wrappers
+///
+/// INTERNAL HEADER NOT MEANT TO BE INCLUDED DIRECTLY.
+/// Include OmegaKokkos.h instead; it includes this header.
+//
+//===----------------------------------------------------------------------===//
+
+namespace OMEGA {
+
+using TeamPolicy      = Kokkos::TeamPolicy<ExecSpace>;
+using TeamMember      = TeamPolicy::member_type;
+using ScratchMemSpace = ExecSpace::scratch_memory_space;
+using Kokkos::MemoryUnmanaged;
+using Kokkos::PerTeam;
+using Kokkos::TeamThreadRange;
+using RealScratchArray =
+    Kokkos::View<Real *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
+
+/// team_size for hierarchical parallelism
+#ifdef OMEGA_TARGET_DEVICE
+constexpr int OMEGA_TEAMSIZE = 64;
+#else
+constexpr int OMEGA_TEAMSIZE = 1;
+#endif
+
+#define INNER_LAMBDA [=]
+// #define INNER_LAMBDA [&]
+
+KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) {
+   Team.team_barrier();
+}
+
+// parallelForOuter: with label
+template <int N, class F>
+inline void parallelForOuter(const std::string &Label,
+                             const int (&UpperBounds)[N], F &&Functor,
+                             int ScratchValsPerTeam = 0) {
+   // Let the functor accept a linear index; LinBound is the total index count
+   auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
+   int LinBound    = 1;
+   for (int Rank = 0; Rank < N; ++Rank) {
+      LinBound *= UpperBounds[Rank];
+   }
+   // One team per linear index, OMEGA_TEAMSIZE threads requested per team
+   auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE);
+   // Request level-0 team scratch for ScratchValsPerTeam values of type Real
+   if (ScratchValsPerTeam > 0) {
+      Policy.set_scratch_size(
+          0, Kokkos::PerTeam(ScratchValsPerTeam * sizeof(Real)));
+   }
+   // Each team calls the functor with its league rank as the linear index
+   Kokkos::parallel_for(
+       Label, Policy, KOKKOS_LAMBDA(const TeamMember &Team) {
+          const int TeamId = Team.league_rank();
+          LinFunctor(TeamId, Team);
+       });
+}
+
+// parallelForOuter: without label
+template <int N, class F>
+inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor,
+                             int ScratchValsPerTeam = 0) {
+   parallelForOuter("", UpperBounds, std::forward<F>(Functor),
+                    ScratchValsPerTeam);
+}
+
+// This struct is used to get the right accumulator type to be used in
+// the outer parallel lambda based on the final reduction variable type.
+// The final reduction variable can be either a reference to
+// an arithmetic type (int&, Real&) or a Kokkos reducer (Kokkos::Max<Real>).
+// We need to know this type because nvcc does not allow generic lambdas.
+template <class T, class Enable = void> struct AccumTypeHelper;
+
+template <class T>
+struct AccumTypeHelper<T, std::enable_if_t<std::is_arithmetic_v<T>>> {
+   using Type = T;
+};
+
+template <class T>
+struct AccumTypeHelper<T, std::enable_if_t<Kokkos::is_reducer_v<T>>> {
+   using Type = typename T::value_type;
+};
+
+template <class T> using AccumType = typename AccumTypeHelper<T>::Type;
+
+// parallelReduceOuter: with label
+template <int N, class F, class... R>
+inline void parallelReduceOuter(const std::string &Label,
+                                const int (&UpperBounds)[N], F &&Functor,
+                                R &&...Reducers) {
+   // Let the functor accept a linear index; LinBound is the total index count
+   auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
+   int LinBound    = 1;
+   for (int Rank = 0; Rank < N; ++Rank) {
+      LinBound *= UpperBounds[Rank];
+   }
+   // One team per linear index; accumulator types are derived via AccumType
+   auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE);
+   Kokkos::parallel_reduce(
+       Label, Policy,
+       KOKKOS_LAMBDA(const TeamMember &Team,
+                     AccumType<std::remove_reference_t<R>> &...Accums) {
+          const int TeamId = Team.league_rank();
+          LinFunctor(TeamId, Team, Accums...);
+       },
+       std::forward<R>(Reducers)...);
+}
+
+// parallelReduceOuter: without label
+template <int N, class F, class... R>
+inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor,
+                                R &&...Reducers) {
+   parallelReduceOuter("", UpperBounds, std::forward<F>(Functor),
+                       std::forward<R>(Reducers)...);
+}
+
+// parallelForInner: team-level loop calling Functor once for each index
+// Index range form: covers MinIndex through MaxIndex, both ends included
+template <class F>
+KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int MinIndex,
+                                      int MaxIndex, F &&Functor) {
+   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
+   Kokkos::parallel_for(Policy, std::forward<F>(Functor));
+}
+// Upper bound form: covers 0 through UpperBound - 1 via the range form
+template <class F>
+KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound,
+                                      F &&Functor) {
+   parallelForInner(Team, 0, UpperBound - 1, std::forward<F>(Functor));
+}
+
+// parallelReduceInner: team-level reduction of Functor into Reducers
+// Index range form: covers MinIndex through MaxIndex, both ends included
+template <class F, class... R>
+KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex,
+                                         int MaxIndex, F &&Functor,
+                                         R &&...Reducers) {
+   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
+   Kokkos::parallel_reduce(Policy, std::forward<F>(Functor),
+                           std::forward<R>(Reducers)...);
+}
+// Upper bound form: covers 0 through UpperBound - 1 via the range form
+template <class F, class... R>
+KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound,
+                                         F &&Functor, R &&...Reducers) {
+   parallelReduceInner(Team, 0, UpperBound - 1, std::forward<F>(Functor),
+                       std::forward<R>(Reducers)...);
+}
+
+// parallelScanInner: team-level scan; arguments go to Kokkos::parallel_scan
+// Index range form: covers MinIndex through MaxIndex, both ends included
+template <class F, class... R>
+KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex,
+                                       int MaxIndex, F &&Functor,
+                                       R &&...Reducers) {
+   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
+   Kokkos::parallel_scan(Policy, std::forward<F>(Functor),
+                         std::forward<R>(Reducers)...);
+}
+// Upper bound form: covers 0 through UpperBound - 1 via the range form
+template <class F, class... R>
+KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound,
+                                       F &&Functor, R &&...Reducers) {
+   parallelScanInner(Team, 0, UpperBound - 1, std::forward<F>(Functor),
+                     std::forward<R>(Reducers)...);
+}
+
+} // end namespace OMEGA
+
+//===----------------------------------------------------------------------===//
+#endif

From 906c2cc11c610a773ce6113278b45dbb2838e1a6 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Tue, 17 Mar 2026 12:12:03 -0600
Subject: [PATCH 04/25] Rename RealScratchArray to ArrayScratch1DReal

---
 components/omega/src/infra/OmegaKokkosHiPar.h |  2 +-
 components/omega/src/ocn/VertAdv.cpp          | 18 +++++++++---------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index a71e0926990d..d9ec635d3a16 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -18,7 +18,7 @@ using ScratchMemSpace = ExecSpace::scratch_memory_space;
 using Kokkos::MemoryUnmanaged;
 using Kokkos::PerTeam;
 using Kokkos::TeamThreadRange;
-using RealScratchArray =
+using ArrayScratch1DReal =
     Kokkos::View<Real *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
 
 /// team_size for hierarchical parallelism
diff --git a/components/omega/src/ocn/VertAdv.cpp b/components/omega/src/ocn/VertAdv.cpp
index 343136b10d34..635a13e2c853 100644
--- a/components/omega/src/ocn/VertAdv.cpp
+++ b/components/omega/src/ocn/VertAdv.cpp
@@ -379,7 +379,7 @@ void VertAdv::computeVerticalVelocity(
    parallelForOuter(
        "computeVerticalVelocity", {NCellsOwned},
        KOKKOS_LAMBDA(int ICell, const TeamMember &Team) {
-          RealScratchArray DivHU(Team.team_scratch(0), LocNVertLayers);
+          ArrayScratch1DReal DivHU(Team.team_scratch(0), LocNVertLayers);
 
           const Real InvAreaCell = 1._Real / LocAreaCell(ICell);
 
@@ -520,7 +520,7 @@ void VertAdv::computeVelocityVAdvTend(
 
           // Allocate scratch space for W times Du/Dz at vertical interfaces
           // between edges
-          RealScratchArray WDuDzEdge(Team.team_scratch(0), LocNVertLayersP1);
+          ArrayScratch1DReal WDuDzEdge(Team.team_scratch(0), LocNVertLayersP1);
 
           // Flux is zero at top and bottom
           Kokkos::single(
@@ -829,13 +829,13 @@ void VertAdv::computeFCTVAdvTend(
           const I4 KMax = MaxLayerCell(ICell);
           I4 KRange     = vertRangeChunked(KMin, KMax);
 
-          RealScratchArray InvNewProvThick(Team.team_scratch(0),
-                                           LocNVertLayers);
-          RealScratchArray WorkTend(Team.team_scratch(0), LocNVertLayers);
-          RealScratchArray FlxIn(Team.team_scratch(0), LocNVertLayers);
-          RealScratchArray FlxOut(Team.team_scratch(0), LocNVertLayers);
-          RealScratchArray RescaledFlux(Team.team_scratch(0),
-                                        LocNVertLayers + 1);
+          ArrayScratch1DReal InvNewProvThick(Team.team_scratch(0),
+                                             LocNVertLayers);
+          ArrayScratch1DReal WorkTend(Team.team_scratch(0), LocNVertLayers);
+          ArrayScratch1DReal FlxIn(Team.team_scratch(0), LocNVertLayers);
+          ArrayScratch1DReal FlxOut(Team.team_scratch(0), LocNVertLayers);
+          ArrayScratch1DReal RescaledFlux(Team.team_scratch(0),
+                                          LocNVertLayers + 1);
 
           parallelForInner(
               Team, KRange, INNER_LAMBDA(int KChunk) {

From 6fedbe6c00bba232a7c4006cdca5c465b4337281 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Tue, 17 Mar 2026 12:14:47 -0600
Subject: [PATCH 05/25] Do not bring MemoryUnmanaged into OMEGA namespace

---
 components/omega/src/base/TriDiagSolvers.h    | 5 +++--
 components/omega/src/infra/OmegaKokkosHiPar.h | 1 -
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/components/omega/src/base/TriDiagSolvers.h b/components/omega/src/base/TriDiagSolvers.h
index f024a76aa2aa..5b3294ab5080 100644
--- a/components/omega/src/base/TriDiagSolvers.h
+++ b/components/omega/src/base/TriDiagSolvers.h
@@ -35,8 +35,9 @@ using TriDiagDiffSolver = ThomasDiffusionSolver;
 #endif
 
 // Type of real array of size (NRow, VecLength) in the scratch memory space
-using TriDiagScratchArray = Kokkos::View<Real *[VecLength], MemLayout,
-                                         ScratchMemSpace, MemoryUnmanaged>;
+using TriDiagScratchArray =
+    Kokkos::View<Real *[VecLength], MemLayout, ScratchMemSpace,
+                 Kokkos::MemoryUnmanaged>;
 
 // Scratch data for general tridiagonal solver
 struct TriDiagScratch {
diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index d9ec635d3a16..c269fb48d430 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -15,7 +15,6 @@ namespace OMEGA {
 using TeamPolicy      = Kokkos::TeamPolicy<ExecSpace>;
 using TeamMember      = TeamPolicy::member_type;
 using ScratchMemSpace = ExecSpace::scratch_memory_space;
-using Kokkos::MemoryUnmanaged;
 using Kokkos::PerTeam;
 using Kokkos::TeamThreadRange;
 using ArrayScratch1DReal =

From 009b1acbe0b2591069a3e6203c53c716c56248ba Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Tue, 17 Mar 2026 13:48:25 -0600
Subject: [PATCH 06/25] Introduce LaunchConfig and TeamScratch

---
 components/omega/src/infra/OmegaKokkos.h      | 19 +++--
 components/omega/src/infra/OmegaKokkosHiPar.h | 78 +++++++++++++++----
 components/omega/src/ocn/VertAdv.cpp          | 19 ++---
 3 files changed, 86 insertions(+), 30 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h
index c8aacd9d91ed..2ca20c49fd3f 100644
--- a/components/omega/src/infra/OmegaKokkos.h
+++ b/components/omega/src/infra/OmegaKokkos.h
@@ -124,16 +124,17 @@ template <class F, int Rank> struct LinearIdxWrapper : F {
    static_assert(Rank >= 1 && Rank <= 5, "LinearIdxWrapper supports ranks 1-5");
    using F::operator();
 
-   LinearIdxWrapper(F &&Functor, const int (&Bounds)[Rank])
-       : F(std::move(Functor)) {
-      computeStrides(Bounds);
+   template <class Array>
+   LinearIdxWrapper(F &&Functor, Array &&Bounds) : F(std::move(Functor)) {
+      computeStrides(std::forward<Array>(Bounds));
    }
 
-   LinearIdxWrapper(const F &Functor, const int (&Bounds)[Rank]) : F(Functor) {
-      computeStrides(Bounds);
+   template <class Array>
+   LinearIdxWrapper(const F &Functor, Array &&Bounds) : F(Functor) {
+      computeStrides(std::forward<Array>(Bounds));
    }
 
-   void computeStrides(const int (&Bounds)[Rank]) {
+   template <class Array> void computeStrides(Array &&Bounds) {
       if constexpr (Rank > 1) {
          Strides[Rank - 2] = Bounds[Rank - 1];
          for (int I = Rank - 3; I >= 0; --I) {
@@ -199,6 +200,12 @@ template <class F, int Rank> struct LinearIdxWrapper : F {
 #endif
 };
 
+// Deduction guides for deducing Rank
+template <class F, int Rank>
+LinearIdxWrapper(F, const int (&)[Rank]) -> LinearIdxWrapper<F, Rank>;
+template <class F, size_t Rank>
+LinearIdxWrapper(F, std::array<int, Rank>) -> LinearIdxWrapper<F, Rank>;
+
 } // end namespace OMEGA
 
 // Flat parallelism wrappers
diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index c269fb48d430..a57dd448eae2 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -30,27 +30,64 @@ constexpr int OMEGA_TEAMSIZE = 1;
 #define INNER_LAMBDA [=]
 // #define INNER_LAMBDA [&]
 
+template <class... T> struct TeamScratch {
+   size_t BytesPerTeam = 0;
+
+   TeamScratch() = default;
+
+   template <int N> TeamScratch(const int (&NVals)[N]) {
+      static_assert(N == sizeof...(T));
+      int I = 0;
+      ((BytesPerTeam += sizeof(T) * NVals[I++]), ...);
+   }
+
+   TeamScratch(int NVals) : TeamScratch({{NVals}}) {}
+};
+
+template <int N> struct LaunchConfig {
+   std::array<int, N> UpperBounds;
+   int TeamSize;
+   size_t ScratchBytesPerTeam;
+
+   template <class... T>
+   LaunchConfig(const int (&UpperBoundsIn)[N], int TeamSize,
+                const TeamScratch<T...> &Scratch)
+       : TeamSize(TeamSize), ScratchBytesPerTeam(Scratch.BytesPerTeam) {
+      std::copy(std::begin(UpperBoundsIn), std::end(UpperBoundsIn),
+                std::begin(UpperBounds));
+   }
+
+   template <class... T>
+   LaunchConfig(const int (&UpperBounds)[N], const TeamScratch<T...> &Scratch)
+       : LaunchConfig(UpperBounds, OMEGA_TEAMSIZE, Scratch) {}
+
+   LaunchConfig(const int (&UpperBounds)[N], int TeamSize)
+       : LaunchConfig(UpperBounds, TeamSize, TeamScratch<>{}) {}
+
+   LaunchConfig(const int (&UpperBounds)[N])
+       : LaunchConfig(UpperBounds, OMEGA_TEAMSIZE, TeamScratch<>{}) {}
+};
+
 KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) {
    Team.team_barrier();
 }
 
-// parallelForOuter: with label
+// parallelForOuter: with label and with launch config
 template <int N, class F>
 inline void parallelForOuter(const std::string &Label,
-                             const int (&UpperBounds)[N], F &&Functor,
-                             int ScratchValsPerTeam = 0) {
+                             const LaunchConfig<N> &Config, F &&Functor) {
 
-   auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
-   int LinBound    = 1;
+   auto LinFunctor =
+       LinearIdxWrapper{std::forward<F>(Functor), Config.UpperBounds};
+   int LinBound = 1;
    for (int Rank = 0; Rank < N; ++Rank) {
-      LinBound *= UpperBounds[Rank];
+      LinBound *= Config.UpperBounds[Rank];
    }
 
-   auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE);
+   auto Policy = TeamPolicy(LinBound, Config.TeamSize);
 
-   if (ScratchValsPerTeam > 0) {
-      Policy.set_scratch_size(
-          0, Kokkos::PerTeam(ScratchValsPerTeam * sizeof(Real)));
+   if (Config.ScratchBytesPerTeam > 0) {
+      Policy.set_scratch_size(0, Kokkos::PerTeam(Config.ScratchBytesPerTeam));
    }
 
    Kokkos::parallel_for(
@@ -60,12 +97,23 @@ inline void parallelForOuter(const std::string &Label,
        });
 }
 
-// parallelForOuter: without label
+// parallelForOuter: without label and with launch config
+template <int N, class F>
+inline void parallelForOuter(const LaunchConfig<N> &Config, F &&Functor) {
+   parallelForOuter("", Config, std::forward<F>(Functor));
+}
+
+// parallelForOuter: with label and with array bounds
+template <int N, class F>
+inline void parallelForOuter(const std::string &Label,
+                             const int (&UpperBounds)[N], F &&Functor) {
+   parallelForOuter(Label, LaunchConfig(UpperBounds), std::forward<F>(Functor));
+}
+
+// parallelForOuter: without label and with array bounds
 template <int N, class F>
-inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor,
-                             int ScratchValsPerTeam = 0) {
-   parallelForOuter("", UpperBounds, std::forward<F>(Functor),
-                    ScratchValsPerTeam);
+inline void parallelForOuter(const int (&UpperBounds)[N], F &&Functor) {
+   parallelForOuter("", LaunchConfig(UpperBounds), std::forward<F>(Functor));
 }
 
 // This struct is used to get the right accumulator type to be used in
diff --git a/components/omega/src/ocn/VertAdv.cpp b/components/omega/src/ocn/VertAdv.cpp
index 635a13e2c853..081f5a38d6bc 100644
--- a/components/omega/src/ocn/VertAdv.cpp
+++ b/components/omega/src/ocn/VertAdv.cpp
@@ -377,7 +377,8 @@ void VertAdv::computeVerticalVelocity(
 
    // Loop over all cells owned by the task
    parallelForOuter(
-       "computeVerticalVelocity", {NCellsOwned},
+       "computeVerticalVelocity",
+       LaunchConfig({NCellsOwned}, TeamScratch<Real>(NVertLayers)),
        KOKKOS_LAMBDA(int ICell, const TeamMember &Team) {
           ArrayScratch1DReal DivHU(Team.team_scratch(0), LocNVertLayers);
 
@@ -431,8 +432,7 @@ void VertAdv::computeVerticalVelocity(
                     LocVertVel(ICell, KRev) = Accum;
                  }
               });
-       },
-       NVertLayers);
+       });
 
    // TODO: currently assuming TotalVerticalVelocity = VerticalVelocity, i.e.
    //  purely from divergence of horizontal velocity. Need to add optional
@@ -510,7 +510,8 @@ void VertAdv::computeVelocityVAdvTend(
 
    // Loop over every owned edge
    parallelForOuter(
-       "computeVelocityVAdvTend", {NEdgesOwned},
+       "computeVelocityVAdvTend",
+       LaunchConfig({NEdgesOwned}, TeamScratch<Real>(NVertLayersP1)),
        KOKKOS_LAMBDA(int IEdge, const TeamMember &Team) {
           const I4 Cell1 = LocCOnE(IEdge, 0);
           const I4 Cell2 = LocCOnE(IEdge, 1);
@@ -565,8 +566,7 @@ void VertAdv::computeVelocityVAdvTend(
                                          (WDuDzEdge(K) + WDuDzEdge(K + 1));
                  }
               });
-       },
-       NVertLayersP1);
+       });
 
 } // end computeVelocityVAdvTend
 
@@ -823,7 +823,9 @@ void VertAdv::computeFCTVAdvTend(
    OMEGA_SCOPE(LocEps, Eps);
 
    parallelForOuter(
-       "computeFCTVAdvTend", {NTracers, NCellsOwned},
+       "computeFCTVAdvTend",
+       LaunchConfig({NTracers, NCellsOwned},
+                    TeamScratch<Real>(5 * NVertLayers + 1)),
        KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) {
           const I4 KMin = MinLayerCell(ICell);
           const I4 KMax = MaxLayerCell(ICell);
@@ -951,8 +953,7 @@ void VertAdv::computeFCTVAdvTend(
                  }
               });
           // TODO: Monotonicity and diagnostic checks
-       },
-       5 * NVertLayers + 1);
+       });
 
 } // end computeFTCVAdvTend
 

From 5148fa4bc18872782c036aec140cb768b5217b03 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Tue, 17 Mar 2026 15:46:38 -0600
Subject: [PATCH 07/25] Use LaunchConfig to convert tridiag solver to use
 wrappers

---
 components/omega/src/base/TriDiagSolvers.h    |  68 +++----
 components/omega/src/infra/OmegaKokkosHiPar.h |   1 -
 .../omega/test/base/TriDiagSolversTest.cpp    | 185 +++++++++---------
 3 files changed, 119 insertions(+), 135 deletions(-)

diff --git a/components/omega/src/base/TriDiagSolvers.h b/components/omega/src/base/TriDiagSolvers.h
index 5b3294ab5080..0b9a947667b6 100644
--- a/components/omega/src/base/TriDiagSolvers.h
+++ b/components/omega/src/base/TriDiagSolvers.h
@@ -57,11 +57,10 @@ struct ThomasSolver {
 
    // Create a Kokkos team policy for solving NBatch systems of size NRow
    // and set scratch size
-   static TeamPolicy makeTeamPolicy(int NBatch, int NRow) {
-      TeamPolicy Policy((NBatch + VecLength - 1) / VecLength, 1, 1);
-      Policy.set_scratch_size(
-          0, Kokkos::PerTeam(4 * NRow * VecLength * sizeof(Real)));
-      return Policy;
+   static LaunchConfig<1> makeLaunchConfig(int NBatch, int NRow) {
+      const int NTeams   = (NBatch + VecLength - 1) / VecLength;
+      const int NScratch = 4 * NRow * VecLength;
+      return LaunchConfig({NTeams}, 1, TeamScratch<Real>(NScratch));
    }
 
    // Solve the system defined in the scratch data argument `Scratch`
@@ -101,11 +100,11 @@ struct ThomasSolver {
       const int NBatch = X.extent_int(0);
       const int NRow   = X.extent_int(1);
 
-      TeamPolicy Policy = makeTeamPolicy(NBatch, NRow);
+      auto LConfig = makeLaunchConfig(NBatch, NRow);
 
-      Kokkos::parallel_for(
-          Policy, KOKKOS_LAMBDA(const TeamMember &Member) {
-             const int IStart = Member.league_rank() * VecLength;
+      parallelForOuter(
+          LConfig, KOKKOS_LAMBDA(const int IChunk, const TeamMember &Member) {
+             const int IStart = IChunk * VecLength;
 
              TriDiagScratch Scratch(Member, NRow);
 
@@ -140,11 +139,9 @@ struct PCRSolver {
 
    // Create a Kokkos team policy for solving NBatch systems of size NRow
    // and set scratch size
-   static TeamPolicy makeTeamPolicy(int NBatch, int NRow) {
-      TeamPolicy Policy(NBatch, NRow, 1);
-      Policy.set_scratch_size(
-          0, Kokkos::PerTeam(4 * NRow * VecLength * sizeof(Real)));
-      return Policy;
+   static LaunchConfig<1> makeLaunchConfig(int NBatch, int NRow) {
+      const int NScratch = 4 * NRow * VecLength;
+      return LaunchConfig({NBatch}, NRow, TeamScratch<Real>(NScratch));
    }
 
    // Solve the system defined in the scratch data argument `Scratch`
@@ -218,13 +215,12 @@ struct PCRSolver {
    static void solve(const Array2DReal &DL, const Array2DReal &D,
                      const Array2DReal &DU, const Array2DReal &X) {
 
-      const int NBatch  = X.extent_int(0);
-      const int NRow    = X.extent_int(1);
-      TeamPolicy Policy = makeTeamPolicy(NBatch, NRow);
+      const int NBatch = X.extent_int(0);
+      const int NRow   = X.extent_int(1);
+      auto LConfig     = makeLaunchConfig(NBatch, NRow);
 
-      Kokkos::parallel_for(
-          Policy, KOKKOS_LAMBDA(const TeamMember &Member) {
-             const int I = Member.league_rank();
+      parallelForOuter(
+          LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Member) {
              const int K = Member.team_rank();
 
              TriDiagScratch Scratch(Member, NRow);
@@ -264,11 +260,10 @@ struct ThomasDiffusionSolver {
 
    // Create a Kokkos team policy for solving NBatch systems of size NRow
    // and set scratch size
-   static TeamPolicy makeTeamPolicy(int NBatch, int NRow) {
-      TeamPolicy Policy((NBatch + VecLength - 1) / VecLength, 1, 1);
-      Policy.set_scratch_size(
-          0, Kokkos::PerTeam(4 * NRow * VecLength * sizeof(Real)));
-      return Policy;
+   static LaunchConfig<1> makeLaunchConfig(int NBatch, int NRow) {
+      const int NTeams   = (NBatch + VecLength - 1) / VecLength;
+      const int NScratch = 4 * NRow * VecLength;
+      return LaunchConfig({NTeams}, 1, TeamScratch<Real>(NScratch));
    }
 
    // Solve the system defined in the scratch data argument `Scratch`
@@ -327,11 +322,11 @@ struct ThomasDiffusionSolver {
       const int NBatch = X.extent_int(0);
       const int NRow   = X.extent_int(1);
 
-      TeamPolicy Policy = makeTeamPolicy(NBatch, NRow);
+      auto LConfig = makeLaunchConfig(NBatch, NRow);
 
-      Kokkos::parallel_for(
-          Policy, KOKKOS_LAMBDA(const TeamMember &Member) {
-             const int IStart = Member.league_rank() * VecLength;
+      parallelForOuter(
+          LConfig, KOKKOS_LAMBDA(int IChunk, const TeamMember &Member) {
+             const int IStart = IChunk * VecLength;
 
              TriDiagDiffScratch Scratch(Member, NRow);
 
@@ -365,11 +360,9 @@ struct PCRDiffusionSolver {
 
    // Create a Kokkos team policy for solving NBatch systems of size NRow
    // and set scratch size
-   static TeamPolicy makeTeamPolicy(int NBatch, int NRow) {
-      TeamPolicy Policy(NBatch, NRow, 1);
-      Policy.set_scratch_size(
-          0, Kokkos::PerTeam(4 * NRow * VecLength * sizeof(Real)));
-      return Policy;
+   static LaunchConfig<1> makeLaunchConfig(int NBatch, int NRow) {
+      const int NScratch = 4 * NRow * VecLength;
+      return LaunchConfig({NBatch}, NRow, TeamScratch<Real>(NScratch));
    }
 
    // Solve the system defined in the scratch data argument `Scratch`
@@ -461,10 +454,9 @@ struct PCRDiffusionSolver {
       const int NBatch = X.extent_int(0);
       const int NRow   = X.extent_int(1);
 
-      TeamPolicy Policy = makeTeamPolicy(NBatch, NRow);
-      Kokkos::parallel_for(
-          Policy, KOKKOS_LAMBDA(const TeamMember &Member) {
-             const int I = Member.league_rank();
+      auto LConfig = makeLaunchConfig(NBatch, NRow);
+      parallelForOuter(
+          LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Member) {
              const int K = Member.team_rank();
 
              TriDiagDiffScratch Scratch(Member, NRow);
diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index a57dd448eae2..b18c6e3ac97a 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -16,7 +16,6 @@ using TeamPolicy      = Kokkos::TeamPolicy<ExecSpace>;
 using TeamMember      = TeamPolicy::member_type;
 using ScratchMemSpace = ExecSpace::scratch_memory_space;
 using Kokkos::PerTeam;
-using Kokkos::TeamThreadRange;
 using ArrayScratch1DReal =
     Kokkos::View<Real *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
 
diff --git a/components/omega/test/base/TriDiagSolversTest.cpp b/components/omega/test/base/TriDiagSolversTest.cpp
index f6b08f986c9e..99236b40fe93 100644
--- a/components/omega/test/base/TriDiagSolversTest.cpp
+++ b/components/omega/test/base/TriDiagSolversTest.cpp
@@ -165,46 +165,44 @@ Real runDiffManufactured(int NCells) {
           U(ICell)          = manufacturedSolution(XCell(ICell), 0);
        });
 
-   TeamPolicy Policy = TriDiagDiffSolver::makeTeamPolicy(1, NCells);
+   auto LConfig = TriDiagDiffSolver::makeLaunchConfig(1, NCells);
 
    // Integrate in time with backward Euler
    for (int Step = 0; Step < NSteps; ++Step) {
       const Real Time     = Step * TimeStep;
       const Real TimeNext = (Step + 1) * TimeStep;
 
-      Kokkos::parallel_for(
-          Policy, KOKKOS_LAMBDA(const TeamMember &Member) {
+      parallelForOuter(
+          LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) {
              TriDiagDiffScratch Scratch(Member, NCells);
 
              // Setup the system to be solved
-             Kokkos::parallel_for(
-                 TeamThreadRange(Member, NCells), [=](int ICell) {
-                    for (int IVec = 0; IVec < VecLength; ++IVec) {
-
-                       // Forcing term from the manufactured solution
-                       const Real F =
-                           manufacturedForcing(XCell(ICell), TimeNext);
-
-                       Scratch.H(ICell, IVec) = LayerThick(ICell);
-
-                       if (ICell == NCells - 1) {
-                          // Boundary condition
-                          const Real XBnd = XVertex(ICell + 1);
-                          const Real BoundaryCoeff =
-                              -(2 + Kokkos::sin(XBnd)) * Kokkos::tan(XBnd);
-                          Scratch.H(ICell, IVec) -= TimeStep * BoundaryCoeff;
-                          Scratch.G(ICell, IVec) = 0;
-                       } else {
-                          const Real AvgLayerThick =
-                              (LayerThick(ICell + 1) + LayerThick(ICell)) / 2;
-                          Scratch.G(ICell, IVec) =
-                              Diffusivity(ICell + 1) * TimeStep / AvgLayerThick;
-                       }
-                       // RHS
-                       Scratch.X(ICell, IVec) =
-                           LayerThick(ICell) * (U(ICell) + TimeStep * F);
-                    }
-                 });
+             parallelForInner(Member, NCells, [=](int ICell) {
+                for (int IVec = 0; IVec < VecLength; ++IVec) {
+
+                   // Forcing term from the manufactured solution
+                   const Real F = manufacturedForcing(XCell(ICell), TimeNext);
+
+                   Scratch.H(ICell, IVec) = LayerThick(ICell);
+
+                   if (ICell == NCells - 1) {
+                      // Boundary condition
+                      const Real XBnd = XVertex(ICell + 1);
+                      const Real BoundaryCoeff =
+                          -(2 + Kokkos::sin(XBnd)) * Kokkos::tan(XBnd);
+                      Scratch.H(ICell, IVec) -= TimeStep * BoundaryCoeff;
+                      Scratch.G(ICell, IVec) = 0;
+                   } else {
+                      const Real AvgLayerThick =
+                          (LayerThick(ICell + 1) + LayerThick(ICell)) / 2;
+                      Scratch.G(ICell, IVec) =
+                          Diffusivity(ICell + 1) * TimeStep / AvgLayerThick;
+                   }
+                   // RHS
+                   Scratch.X(ICell, IVec) =
+                       LayerThick(ICell) * (U(ICell) + TimeStep * F);
+                }
+             });
 
              // Solve the system
              Member.team_barrier();
@@ -212,9 +210,9 @@ Real runDiffManufactured(int NCells) {
              Member.team_barrier();
 
              // Store the solution
-             Kokkos::parallel_for(
-                 TeamThreadRange(Member, NCells),
-                 [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); });
+             parallelForInner(Member, NCells, [=](int ICell) {
+                U(ICell) = Scratch.X(ICell, 0);
+             });
           });
    }
 
@@ -318,45 +316,42 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) {
    for (int Step = 0; Step < NSteps; ++Step) {
 
       if (UseGeneralSolver) {
-         TeamPolicy Policy = TriDiagSolver::makeTeamPolicy(1, NCells);
+         auto LConfig = TriDiagSolver::makeLaunchConfig(1, NCells);
 
-         Kokkos::parallel_for(
-             Policy, KOKKOS_LAMBDA(const TeamMember &Member) {
+         parallelForOuter(
+             LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) {
                 TriDiagScratch Scratch(Member, NCells);
 
                 // Setup the system to be solved in the form expected by the
                 // general tridiagonal solver
-                Kokkos::parallel_for(
-                    TeamThreadRange(Member, NCells), [=](int ICell) {
-                       for (int IVec = 0; IVec < VecLength; ++IVec) {
-
-                          if (ICell < NCells - 1) {
-                             const Real AvgLayerThick =
-                                 (LayerThick(ICell + 1) + LayerThick(ICell)) /
-                                 2;
-                             Scratch.DU(ICell, IVec) = -Diffusivity(ICell + 1) *
-                                                       TimeStep / AvgLayerThick;
-                          } else {
-                             Scratch.DU(ICell, IVec) = 0;
-                          }
-
-                          if (ICell > 0) {
-                             const Real AvgLayerThick =
-                                 (LayerThick(ICell) + LayerThick(ICell - 1)) /
-                                 2;
-                             Scratch.DL(ICell, IVec) =
-                                 -Diffusivity(ICell) * TimeStep / AvgLayerThick;
-                          } else {
-                             Scratch.DL(ICell, IVec) = 0;
-                          }
-
-                          Scratch.D(ICell, IVec) = LayerThick(ICell) -
-                                                   Scratch.DU(ICell, IVec) -
-                                                   Scratch.DL(ICell, IVec);
-
-                          Scratch.X(ICell, IVec) = LayerThick(ICell) * U(ICell);
-                       }
-                    });
+                parallelForInner(Member, NCells, [=](int ICell) {
+                   for (int IVec = 0; IVec < VecLength; ++IVec) {
+
+                      if (ICell < NCells - 1) {
+                         const Real AvgLayerThick =
+                             (LayerThick(ICell + 1) + LayerThick(ICell)) / 2;
+                         Scratch.DU(ICell, IVec) =
+                             -Diffusivity(ICell + 1) * TimeStep / AvgLayerThick;
+                      } else {
+                         Scratch.DU(ICell, IVec) = 0;
+                      }
+
+                      if (ICell > 0) {
+                         const Real AvgLayerThick =
+                             (LayerThick(ICell) + LayerThick(ICell - 1)) / 2;
+                         Scratch.DL(ICell, IVec) =
+                             -Diffusivity(ICell) * TimeStep / AvgLayerThick;
+                      } else {
+                         Scratch.DL(ICell, IVec) = 0;
+                      }
+
+                      Scratch.D(ICell, IVec) = LayerThick(ICell) -
+                                               Scratch.DU(ICell, IVec) -
+                                               Scratch.DL(ICell, IVec);
+
+                      Scratch.X(ICell, IVec) = LayerThick(ICell) * U(ICell);
+                   }
+                });
 
                 // Solve the system
                 Member.team_barrier();
@@ -364,38 +359,36 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) {
                 Member.team_barrier();
 
                 // Save the solution
-                Kokkos::parallel_for(
-                    TeamThreadRange(Member, NCells),
-                    [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); });
+                parallelForInner(Member, NCells, [=](int ICell) {
+                   U(ICell) = Scratch.X(ICell, 0);
+                });
              });
       } else {
-         TeamPolicy Policy = TriDiagDiffSolver::makeTeamPolicy(1, NCells);
+         auto LConfig = TriDiagDiffSolver::makeLaunchConfig(1, NCells);
 
-         Kokkos::parallel_for(
-             Policy, KOKKOS_LAMBDA(const TeamMember &Member) {
+         parallelForOuter(
+             LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) {
                 TriDiagDiffScratch Scratch(Member, NCells);
 
                 // Setup the system to be solved in the form expected by the
                 // specialized diffusion tridiagonal solver
-                Kokkos::parallel_for(
-                    TeamThreadRange(Member, NCells), [=](int ICell) {
-                       for (int IVec = 0; IVec < VecLength; ++IVec) {
-
-                          Scratch.H(ICell, IVec) = LayerThick(ICell);
-
-                          if (ICell < NCells - 1) {
-                             const Real AvgLayerThick =
-                                 (LayerThick(ICell + 1) + LayerThick(ICell)) /
-                                 2;
-                             Scratch.G(ICell, IVec) = Diffusivity(ICell + 1) *
-                                                      TimeStep / AvgLayerThick;
-                          } else {
-                             Scratch.G(ICell, IVec) = 0;
-                          }
-
-                          Scratch.X(ICell, IVec) = LayerThick(ICell) * U(ICell);
-                       }
-                    });
+                parallelForInner(Member, NCells, [=](int ICell) {
+                   for (int IVec = 0; IVec < VecLength; ++IVec) {
+
+                      Scratch.H(ICell, IVec) = LayerThick(ICell);
+
+                      if (ICell < NCells - 1) {
+                         const Real AvgLayerThick =
+                             (LayerThick(ICell + 1) + LayerThick(ICell)) / 2;
+                         Scratch.G(ICell, IVec) =
+                             Diffusivity(ICell + 1) * TimeStep / AvgLayerThick;
+                      } else {
+                         Scratch.G(ICell, IVec) = 0;
+                      }
+
+                      Scratch.X(ICell, IVec) = LayerThick(ICell) * U(ICell);
+                   }
+                });
 
                 // Solve the system
                 Member.team_barrier();
@@ -403,9 +396,9 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) {
                 Member.team_barrier();
 
                 // Store the solution
-                Kokkos::parallel_for(
-                    TeamThreadRange(Member, NCells),
-                    [=](int ICell) { U(ICell) = Scratch.X(ICell, 0); });
+                parallelForInner(Member, NCells, [=](int ICell) {
+                   U(ICell) = Scratch.X(ICell, 0);
+                });
              });
       }
    }

From d966d8a752f5c68f8d2ab153ce339ed1a8b711a2 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Fri, 20 Mar 2026 10:51:04 -0600
Subject: [PATCH 08/25] Add Range struct for inner loop ranges

---
 components/omega/src/infra/OmegaKokkosHiPar.h | 32 +++++++++++--------
 .../omega/test/infra/OmegaKokkosHiParTest.cpp | 17 +++++-----
 2 files changed, 27 insertions(+), 22 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index b18c6e3ac97a..e2d6a88adb7f 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -165,28 +165,33 @@ inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor,
                        std::forward<R>(Reducers)...);
 }
 
+// Inclusive range of indices
+struct Range {
+   int First;
+   int Last;
+};
+
 // parallelForInner
 
 template <class F>
-KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int MinIndex,
-                                      int MaxIndex, F &&Functor) {
-   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
+KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, Range Rng,
+                                      F &&Functor) {
+   const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1);
    Kokkos::parallel_for(Policy, std::forward<F>(Functor));
 }
 
 template <class F>
 KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound,
                                       F &&Functor) {
-   parallelForInner(Team, 0, UpperBound - 1, std::forward<F>(Functor));
+   parallelForInner(Team, Range{0, UpperBound - 1}, std::forward<F>(Functor));
 }
 
 // parallelReduceInner
 
 template <class F, class... R>
-KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex,
-                                         int MaxIndex, F &&Functor,
-                                         R &&...Reducers) {
-   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
+KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, Range Rng,
+                                         F &&Functor, R &&...Reducers) {
+   const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1);
    Kokkos::parallel_reduce(Policy, std::forward<F>(Functor),
                            std::forward<R>(Reducers)...);
 }
@@ -194,17 +199,16 @@ KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int MinIndex,
 template <class F, class... R>
 KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound,
                                          F &&Functor, R &&...Reducers) {
-   parallelReduceInner(Team, 0, UpperBound - 1, std::forward<F>(Functor),
+   parallelReduceInner(Team, Range{0, UpperBound - 1}, std::forward<F>(Functor),
                        std::forward<R>(Reducers)...);
 }
 
 // parallelScanInner
 
 template <class F, class... R>
-KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex,
-                                       int MaxIndex, F &&Functor,
-                                       R &&...Reducers) {
-   const auto Policy = TeamThreadRange(Team, MinIndex, MaxIndex + 1);
+KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, Range Rng,
+                                       F &&Functor, R &&...Reducers) {
+   const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1);
    Kokkos::parallel_scan(Policy, std::forward<F>(Functor),
                          std::forward<R>(Reducers)...);
 }
@@ -212,7 +216,7 @@ KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int MinIndex,
 template <class F, class... R>
 KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound,
                                        F &&Functor, R &&...Reducers) {
-   parallelScanInner(Team, 0, UpperBound - 1, std::forward<F>(Functor),
+   parallelScanInner(Team, Range{0, UpperBound - 1}, std::forward<F>(Functor),
                      std::forward<R>(Reducers)...);
 }
 
diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index bdb605c70f36..8bb7f833658d 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -365,7 +365,7 @@ Error testHiparFor2DFor1D(int N1, int N2) {
    parallelForOuter(
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
           parallelForInner(
-              Team, J1, J1 + J2, INNER_LAMBDA(int J3) {
+              Team, Range{J1, J1 + J2}, INNER_LAMBDA(int J3) {
                  A(J1, J2, J3) = f3(J1, J2, J3, N1, N2, N3);
               });
        });
@@ -404,7 +404,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) {
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
           I4 Sum;
           parallelReduceInner(
-              Team, J1, J1 + J2,
+              Team, Range{J1, J1 + J2},
               INNER_LAMBDA(int J3, I4 &Accum) {
                  Accum += f3(J1, J2, J3, N1, N2, N3);
               },
@@ -413,7 +413,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) {
 
           I4 Max;
           parallelReduceInner(
-              Team, J1, J1 + J2,
+              Team, Range{J1, J1 + J2},
               INNER_LAMBDA(int J3, I4 &Accum) {
                  Accum = Kokkos::max(Accum, f3(J1, J2, J3, N1, N2, N3));
               },
@@ -437,7 +437,7 @@ Error testHiparFor2DReduce1D(int N1, int N2) {
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
           I4 Sum, Max;
           parallelReduceInner(
-              Team, J1, J1 + J2,
+              Team, Range{J1, J1 + J2},
               INNER_LAMBDA(int J3, I4 &AccumSum, I4 &AccumMax) {
                  AccumSum += f3(J1, J2, J3, N1, N2, N3);
                  AccumMax = Kokkos::max(AccumMax, f3(J1, J2, J3, N1, N2, N3));
@@ -475,7 +475,8 @@ Error testHiparFor2DScan1D(int N1, int N2) {
    parallelForOuter(
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
           parallelScanInner(
-              Team, J1, J1 + J2, INNER_LAMBDA(int J3, I4 &Accum, bool IsFinal) {
+              Team, Range{J1, J1 + J2},
+              INNER_LAMBDA(int J3, I4 &Accum, bool IsFinal) {
                  if (IsFinal) {
                     RSum(J1, J2, J3) = Accum;
                  }
@@ -513,7 +514,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) {
        KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team, I4 &AccumOuter) {
           I4 SumInner;
           parallelReduceInner(
-              Team, J1, J1 + J2,
+              Team, Range{J1, J1 + J2},
               INNER_LAMBDA(int J3, I4 &AccumInner) {
                  AccumInner += f3(J1, J2, J3, N1, N2, N3);
               },
@@ -534,7 +535,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) {
        KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team, I4 &AccumOuter) {
           I4 MaxInner;
           parallelReduceInner(
-              Team, J1, J1 + J2,
+              Team, Range{J1, J1 + J2},
               INNER_LAMBDA(int J3, I4 &AccumInner) {
                  AccumInner =
                      Kokkos::max(AccumInner, f3(J1, J2, J3, N1, N2, N3));
@@ -556,7 +557,7 @@ Error testHiparReduce2DReduce1D(int N1, int N2) {
                      I4 &AccumMaxOuter) {
           I4 SumInner, MaxInner;
           parallelReduceInner(
-              Team, J1, J1 + J2,
+              Team, Range{J1, J1 + J2},
               INNER_LAMBDA(int J3, I4 &AccumSumInner, I4 &AccumMaxInner) {
                  AccumSumInner += f3(J1, J2, J3, N1, N2, N3);
                  AccumMaxInner =

From 4330b6ca301c6a71edbdd2f4938eb67ba2ac2910 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Fri, 20 Mar 2026 12:20:23 -0600
Subject: [PATCH 09/25] Allow LaunchConfig in parallelReduceOuter

---
 components/omega/src/infra/OmegaKokkosHiPar.h | 54 +++++++++++++------
 1 file changed, 38 insertions(+), 16 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index e2d6a88adb7f..bc148678c5b0 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -74,19 +74,19 @@ KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) {
 // parallelForOuter: with label and with launch config
 template <int N, class F>
 inline void parallelForOuter(const std::string &Label,
-                             const LaunchConfig<N> &Config, F &&Functor) {
+                             const LaunchConfig<N> &LConfig, F &&Functor) {
 
    auto LinFunctor =
-       LinearIdxWrapper{std::forward<F>(Functor), Config.UpperBounds};
+       LinearIdxWrapper{std::forward<F>(Functor), LConfig.UpperBounds};
    int LinBound = 1;
    for (int Rank = 0; Rank < N; ++Rank) {
-      LinBound *= Config.UpperBounds[Rank];
+      LinBound *= LConfig.UpperBounds[Rank];
    }
 
-   auto Policy = TeamPolicy(LinBound, Config.TeamSize);
+   auto Policy = TeamPolicy(LinBound, LConfig.TeamSize);
 
-   if (Config.ScratchBytesPerTeam > 0) {
-      Policy.set_scratch_size(0, Kokkos::PerTeam(Config.ScratchBytesPerTeam));
+   if (LConfig.ScratchBytesPerTeam > 0) {
+      Policy.set_scratch_size(0, Kokkos::PerTeam(LConfig.ScratchBytesPerTeam));
    }
 
    Kokkos::parallel_for(
@@ -98,8 +98,8 @@ inline void parallelForOuter(const std::string &Label,
 
 // parallelForOuter: without label and with launch config
 template <int N, class F>
-inline void parallelForOuter(const LaunchConfig<N> &Config, F &&Functor) {
-   parallelForOuter("", Config, std::forward<F>(Functor));
+inline void parallelForOuter(const LaunchConfig<N> &LConfig, F &&Functor) {
+   parallelForOuter("", LConfig, std::forward<F>(Functor));
 }
 
 // parallelForOuter: with label and with array bounds
@@ -134,19 +134,24 @@ struct AccumTypeHelper<T, std::enable_if_t<Kokkos::is_reducer_v<T>>> {
 
 template <class T> using AccumType = typename AccumTypeHelper<T>::Type;
 
-// parallelReduceOuter: with label
+// parallelReduceOuter: with label and with launch config
 template <int N, class F, class... R>
 inline void parallelReduceOuter(const std::string &Label,
-                                const int (&UpperBounds)[N], F &&Functor,
+                                const LaunchConfig<N> &LConfig, F &&Functor,
                                 R &&...Reducers) {
 
-   auto LinFunctor = LinearIdxWrapper{std::forward<F>(Functor), UpperBounds};
-   int LinBound    = 1;
+   auto LinFunctor =
+       LinearIdxWrapper{std::forward<F>(Functor), LConfig.UpperBounds};
+   int LinBound = 1;
    for (int Rank = 0; Rank < N; ++Rank) {
-      LinBound *= UpperBounds[Rank];
+      LinBound *= LConfig.UpperBounds[Rank];
+   }
+
+   auto Policy = TeamPolicy(LinBound, LConfig.TeamSize);
+   if (LConfig.ScratchBytesPerTeam > 0) {
+      Policy.set_scratch_size(0, Kokkos::PerTeam(LConfig.ScratchBytesPerTeam));
    }
 
-   auto Policy = TeamPolicy(LinBound, OMEGA_TEAMSIZE);
    Kokkos::parallel_reduce(
        Label, Policy,
        KOKKOS_LAMBDA(const TeamMember &Team,
@@ -157,11 +162,28 @@ inline void parallelReduceOuter(const std::string &Label,
        std::forward<R>(Reducers)...);
 }
 
-// parallelReduceOuter: without label
+// parallelReduceOuter: without label and with launch config
+template <int N, class F, class... R>
+inline void parallelReduceOuter(const LaunchConfig<N> &LConfig, F &&Functor,
+                                R &&...Reducers) {
+   parallelReduceOuter("", LConfig, std::forward<F>(Functor),
+                       std::forward<R>(Reducers)...);
+}
+
+// parallelReduceOuter: with label and with array bounds
+template <int N, class F, class... R>
+inline void parallelReduceOuter(const std::string &Label,
+                                const int (&UpperBounds)[N], F &&Functor,
+                                R &&...Reducers) {
+   parallelReduceOuter(Label, LaunchConfig(UpperBounds),
+                       std::forward<F>(Functor), std::forward<R>(Reducers)...);
+}
+
+// parallelReduceOuter: without label and with array bounds
 template <int N, class F, class... R>
 inline void parallelReduceOuter(const int (&UpperBounds)[N], F &&Functor,
                                 R &&...Reducers) {
-   parallelReduceOuter("", UpperBounds, std::forward<F>(Functor),
+   parallelReduceOuter("", LaunchConfig(UpperBounds), std::forward<F>(Functor),
                        std::forward<R>(Reducers)...);
 }
 

From 9cfc61bda6a928f385ee81bd887ccbbb91d5a26f Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Fri, 20 Mar 2026 13:36:30 -0600
Subject: [PATCH 10/25] Add parallelSearchInner

---
 components/omega/src/infra/OmegaKokkosHiPar.h | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index bc148678c5b0..c627a2216624 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -242,6 +242,44 @@ KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound,
                      std::forward<R>(Reducers)...);
 }
 
+// parallelSearchInner
+// Given a functor taking an index and returning a bool this function
+// returns the first index in the range [0, UpperBound) for which the input
+// functor returns true. If no such index is found it returns -1
+template <class F>
+KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound,
+                                         F &&Functor, int &Idx) {
+   static_assert(std::is_same_v<std::invoke_result_t<F, int>, bool>,
+                 "parallelSearchInner requires a functor that takes an int and "
+                 "returns bool");
+
+   // There are different implementations for host and device since the
+   // parallel_reduce version doesn't return early leading to performance loss
+   // on CPUs
+#ifndef OMEGA_TARGET_DEVICE
+   Idx = -1;
+   for (int I = 0; I < UpperBound; ++I) {
+      if (Functor(I)) {
+         Idx = I;
+         break;
+      }
+   }
+#else
+   const auto Policy = TeamThreadRange(Team, UpperBound);
+   Kokkos::parallel_reduce(
+       Policy,
+       INNER_LAMBDA(int I, int &Accum) {
+          if (I <= Accum && Functor(I)) {
+             Accum = I;
+          }
+       },
+       Kokkos::Min<int>(Idx));
+   if (Idx == Kokkos::reduction_identity<int>::min()) {
+      Idx = -1;
+   }
+#endif
+}
+
 } // end namespace OMEGA
 
 //===----------------------------------------------------------------------===//

From bdd4c37b7ffe305ea33313b4ed431d11d39b15d4 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Thu, 5 Feb 2026 12:27:43 -0700
Subject: [PATCH 11/25] Add test for parallelSearchInner

---
 .../omega/test/infra/OmegaKokkosHiParTest.cpp | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index 8bb7f833658d..f4789333eb50 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -262,6 +262,110 @@ Error testHiparReduce1DReduce1D(int N1) {
    return Err;
 }
 
+Error testHiparFor1DSearch1D(int N2) {
+   Error Err;
+
+   const int Threshold = N2 / 2;
+   const int N1        = 3 * N2 + 3;
+
+   HostArray2DI4 DataH("DataH", N1, N2);
+
+   for (int J1 = 0; J1 < 3 * N2; ++J1) {
+      if (J1 < N2 + 10) {
+         for (int J2 = 0; J2 < N2; ++J2) {
+            DataH(J1, J2) = Threshold - (J1 - J2);
+         }
+      } else {
+         for (int J2 = 0; J2 < N2; ++J2) {
+            DataH(J1, J2) = Threshold - (J1 / 4 - J2);
+         }
+      }
+   }
+
+   // Ensure these patterns are in the input data
+   for (int J2 = 0; J2 < N2; ++J2) {
+      // Everything above threshold
+      DataH(3 * N2, J2) = Threshold + 1;
+      // Everything below threshold
+      DataH(3 * N2 + 1, J2) = Threshold - 1;
+      // Multiple non-consecutive values above threshold
+      DataH(3 * N2 + 2, J2) = Threshold - 3 + J2 % 4;
+   }
+
+   auto DataD = createDeviceMirrorCopy(DataH);
+
+   HostArray1DI4 RefIdxH("RefIdxH", N1);
+   Array1DI4 IdxD("IdxD", N1);
+
+   // test searching full range
+
+   for (int J1 = 0; J1 < N1; ++J1) {
+      int Idx = -1;
+      for (int J2 = 0; J2 < N2; ++J2) {
+         if (DataH(J1, J2) >= Threshold) {
+            Idx = J2;
+            break;
+         }
+      }
+      RefIdxH(J1) = Idx;
+   }
+
+   parallelForOuter(
+       {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
+          parallelSearchInner(
+              Team, N2,
+              INNER_LAMBDA(int J2) { return DataD(J1, J2) >= Threshold; },
+              IdxD(J1));
+       });
+
+   if (!arraysEqual(IdxD, RefIdxH)) {
+      Err += Error(ErrorCode::Fail,
+                   errorMsg("parallelFor1DSearch1D Full FAIL", N1));
+   }
+
+   deepCopy(RefIdxH, 0);
+   deepCopy(IdxD, 0);
+
+   // test searching limited range
+
+   if (N2 / 4 > 0) {
+
+      for (int J1 = 0; J1 < N1; ++J1) {
+         int Idx         = -1;
+         const int Start = N2 / 4 - J1 % (N2 / 4);
+         const int End   = 3 * N2 / 4 + J1 % (N2 / 4);
+         for (int J2 = Start; J2 < End; ++J2) {
+            if (DataH(J1, J2) >= Threshold) {
+               Idx = J2;
+               break;
+            }
+         }
+         RefIdxH(J1) = Idx;
+      }
+
+      parallelForOuter(
+          {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
+             const int Start = N2 / 4 - J1 % (N2 / 4);
+             const int End   = 3 * N2 / 4 + J1 % (N2 / 4);
+             int SearchIdx;
+             parallelSearchInner(
+                 Team, End - Start,
+                 INNER_LAMBDA(int J2) {
+                    return DataD(J1, J2 + Start) >= Threshold;
+                 },
+                 SearchIdx);
+             IdxD(J1) = SearchIdx == -1 ? SearchIdx : SearchIdx + Start;
+          });
+
+      if (!arraysEqual(IdxD, RefIdxH)) {
+         Err += Error(ErrorCode::Fail,
+                      errorMsg("parallelFor1DSearch1D Limited FAIL", N1));
+      }
+   }
+
+   return Err;
+}
+
 Error testHiparFor1DMultiple1D(int N1, int N2) {
    Error Err;
 
@@ -688,6 +792,7 @@ int main(int argc, char **argv) {
 #if !defined(KOKKOS_ENABLE_SYCL) || KOKKOS_VERSION_GREATER_EQUAL(4, 7, 1)
             Err += testHiparReduce1DReduce1D(N1);
 #endif
+            Err += testHiparFor1DSearch1D(N1);
 
             Err += testHiparFor1DMultiple1D(1, N1);
             Err += testHiparFor1DMultiple1D((N1 + 1) / 2, N1);

From 8a68967a0075cf6c670563613e086deddcad47f6 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Thu, 5 Feb 2026 16:04:28 -0700
Subject: [PATCH 12/25] Add docs for parallelSearchInner

---
 .../omega/doc/devGuide/ParallelLoops.md       | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md
index 2ada95cfdb71..9a8251d4228c 100644
--- a/components/omega/doc/devGuide/ParallelLoops.md
+++ b/components/omega/doc/devGuide/ParallelLoops.md
@@ -115,6 +115,7 @@ The following inner iteration patterns are supported in Omega:
 - `parallelForInner`
 - `parallelReduceInner`
 - `parallelScanInner`
+- `parallelSearchInner`
 
 To provide even more flexibility, the outer loops support iterating over a multi-dimensional range.
 Currently, the inner loops are limited to one dimension.
@@ -277,3 +278,25 @@ Moreover, this example illustrates that the final scan value can be obtained by
 an additional argument `FinalScanValue`. Labels are not supported by `parallelScanInner`
 and only one-dimensional index range can be used. In contrast to `parallelReduceInner`,
 `parallelScanInner` supports only sum-based scans and only one scan variable.
+
+### parallelSearchInner
+To search an index range in parallel for the first index where a given condition occurs Omega
+provides the `parallelSearchInner` function.
+For example, the following code finds, for each row of a matrix, the first column index where
+the matrix element is above a certain threshold. If no element matches the condition then
+`parallelSearchInner` returns `-1`.
+```c++
+   Array2DReal M("M", N1, N2);
+   Array1DI3 ThresholdIdx("ThresholdIdx", N1);
+   parallelForOuter(
+       {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
+
+       int Idx;
+       parallelSearchInner(Team, N2, INNER_LAMBDA(Int J2) {
+            return M(J1, J2) > Threshold;
+       }, Idx);
+
+       ThresholdIdx(J1) = Idx;
+   });
+```
+Labels are not supported by `parallelSearchInner` and only one-dimensional index range can be used.

From e1835b44f2591b09ff4b3e2224c15a2f24689665 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Wed, 25 Feb 2026 16:40:53 -0700
Subject: [PATCH 13/25] Incorporate copilot suggestions

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 .../omega/doc/devGuide/ParallelLoops.md       | 25 ++++++++++---------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md
index 9a8251d4228c..7d4a5ae190ab 100644
--- a/components/omega/doc/devGuide/ParallelLoops.md
+++ b/components/omega/doc/devGuide/ParallelLoops.md
@@ -192,7 +192,7 @@ a 3D array in parallel using hierarchical parallelism.
    Array3DReal A("A", N1, N2, N3);
    parallelForOuter(
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
-        parallelForInner(Team, N3, INNER_LAMBDA(Int J3) {
+        parallelForInner(Team, N3, INNER_LAMBDA(int J3) {
           A(J1, J2, J3) = J1 + J2 + J3;
         });
     });
@@ -204,7 +204,7 @@ diagonal of a square matrix one can do:
    Array2DReal M("M", N, N);
    parallelForOuter(
        {N}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
-        parallelForInner(Team, J1, INNER_LAMBDA(Int J2) {
+        parallelForInner(Team, J1, INNER_LAMBDA(int J2) {
           M(J1, J2) = J1 + J2;
         });
     });
@@ -220,7 +220,7 @@ in a 2D array might be done as follows.
    parallelForOuter(
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
         Real SumD3;
-        parallelReduceInner(Team, N3, INNER_LAMBDA(Int J3, Real &Accum) {
+        parallelReduceInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum) {
             Accum += A(J1, J2, J3);
         }, SumD3);
         B(J1, J2) = SumD3;
@@ -234,10 +234,10 @@ For example, to additionally compute and store maxima along the third dimension
    parallelForOuter(
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
         Real SumD3, MaxD3;
-        parallelReduceInner(Team, N3, INNER_LAMBDA(Int J3, Real &AccumSum, Real &AccumMax) {
+        parallelReduceInner(Team, N3, INNER_LAMBDA(int J3, Real &AccumSum, Real &AccumMax) {
             AccumSum += A(J1, J2, J3);
-            AccumMax = Kokkos::Max(AccumMax, A(J1, J2, J3));
-        }, SumN3, MaxN3);
+            AccumMax = Kokkos::max(AccumMax, A(J1, J2, J3));
+        }, SumD3, Kokkos::Max<Real>(MaxD3));
         B(J1, J2) = SumD3;
         C(J1, J2) = MaxD3;
     });
@@ -254,7 +254,7 @@ be done as follows.
    Array3DReal D("D", N1, N2, N3);
    parallelForOuter(
        {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
-       parallelScanInner(Team, N1, INNER_LAMBDA(Int J3, Real &Accum, bool IsFinal) {
+       parallelScanInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum, bool IsFinal) {
             Accum += A(J1, J2, J3);
             if (IsFinal) {
               D(J1, J2, J3) = Accum;
@@ -267,7 +267,7 @@ before the `if` statement. That is, it performs an inclusive scan. To compute an
 simply move the addition after the `if` statement.
 ```c++
   Real FinalScanValue;
-  parallelScanInner(Team, N1, INNER_LAMBDA(Int J3, Real &Accum, bool IsFinal) {
+  parallelScanInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum, bool IsFinal) {
        if (IsFinal) {
          D(J1, J2, J3) = Accum;
        }
@@ -280,19 +280,20 @@ and only one-dimensional index range can be used. In contrast to `parallelReduce
 `parallelScanInner` supports only sum-based scans and only one scan variable.
 
 ### parallelSearchInner
-To search an index range in parallel for the first index where a given condition occurs Omega
-provides the `parallelSearchInner` function.
+To search an index range in parallel for the first index at which a given condition occurs,
+Omega provides the `parallelSearchInner` function.
 For example, the following code finds, for each row of a matrix, the first column index where
 the matrix element is above a certain threshold. If no element matches the condition then
 `parallelSearchInner` returns `-1`.
 ```c++
    Array2DReal M("M", N1, N2);
-   Array1DI3 ThresholdIdx("ThresholdIdx", N1);
+   Array1DI4 ThresholdIdx("ThresholdIdx", N1);
+   const Real Threshold = 0.5;
    parallelForOuter(
        {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
 
        int Idx;
-       parallelSearchInner(Team, N2, INNER_LAMBDA(Int J2) {
+       parallelSearchInner(Team, N2, INNER_LAMBDA(int J2) {
             return M(J1, J2) > Threshold;
        }, Idx);
 

From 2eeeb91b2009b8d2b73c1d02dd2d683bfaa79b40 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Fri, 20 Mar 2026 16:11:13 -0600
Subject: [PATCH 14/25] Add a test using LaunchConfig and TeamScratch

---
 components/omega/src/infra/OmegaKokkosHiPar.h |  2 +
 .../omega/test/infra/OmegaKokkosHiParTest.cpp | 71 +++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index c627a2216624..232bc1100182 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -18,6 +18,8 @@ using ScratchMemSpace = ExecSpace::scratch_memory_space;
 using Kokkos::PerTeam;
 using ArrayScratch1DReal =
     Kokkos::View<Real *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
+using ArrayScratch1DI4 =
+    Kokkos::View<I4 *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
 
 /// team_size for hierarchical parallelism
 #ifdef OMEGA_TARGET_DEVICE
diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index f4789333eb50..2e74f01b9dbb 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -366,6 +366,75 @@ Error testHiparFor1DSearch1D(int N2) {
    return Err;
 }
 
+Error testHiparLaunchConfig1D(int N1, int N2) {
+   Error Err;
+
+   HostArray2DReal RefOutH("RefOutH", N1, N2 - 3);
+
+   for (int J1 = 0; J1 < N1; ++J1) {
+      HostArray1DI4 ScratchAH("ScratchAH", N2);
+
+      for (int J2 = 0; J2 < N2; ++J2) {
+         ScratchAH(J2) = f2(J1, J2, N1, N2) * f2(J1, J2, N1, N2);
+      }
+
+      HostArray1DReal ScratchBH("ScratchBH", N2 - 2);
+
+      for (int J2 = 1; J2 < N2 - 1; ++J2) {
+         ScratchBH(J2 - 1) =
+             1._Real / (1._Real + ScratchAH(J2 + 1) - ScratchAH(J2 - 1));
+      }
+
+      for (int J2 = 0; J2 < N2 - 3; ++J2) {
+         RefOutH(J1, J2) = ScratchBH(J2) / ScratchBH(J2 + 1);
+      }
+   }
+
+   Array2DReal OutD("OutD", N1, N2 - 3);
+
+#ifdef OMEGA_TARGET_DEVICE
+   const int TeamSize = 32;
+#else
+   const int TeamSize = 1;
+#endif
+
+   auto LConfig =
+       LaunchConfig({N1}, TeamSize, TeamScratch<Real, I4>({N2, N2 - 2}));
+   parallelForOuter(
+       LConfig, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
+          ArrayScratch1DI4 ScratchA(Team.team_scratch(0), N2);
+
+          parallelForInner(
+              Team, N2, INNER_LAMBDA(int J2) {
+                 ScratchA(J2) = f2(J1, J2, N1, N2) * f2(J1, J2, N1, N2);
+              });
+
+          teamBarrier(Team);
+
+          ArrayScratch1DReal ScratchB(Team.team_scratch(0), N2 - 2);
+
+          parallelForInner(
+              Team, Range{1, N2 - 2}, INNER_LAMBDA(int J2) {
+                 ScratchB(J2 - 1) =
+                     1._Real / (1._Real + ScratchA(J2 + 1) - ScratchA(J2 - 1));
+              });
+
+          teamBarrier(Team);
+
+          parallelForInner(
+              Team, N2 - 3, INNER_LAMBDA(int J2) {
+                 OutD(J1, J2) = ScratchB(J2) / ScratchB(J2 + 1);
+              });
+       });
+
+   if (!arraysEqual(OutD, RefOutH)) {
+      Err += Error(ErrorCode::Fail,
+                   errorMsg("parallelForLaunchConfig1D FAIL", N1, N2));
+   }
+
+   return Err;
+}
+
 Error testHiparFor1DMultiple1D(int N1, int N2) {
    Error Err;
 
@@ -794,6 +863,8 @@ int main(int argc, char **argv) {
 #endif
             Err += testHiparFor1DSearch1D(N1);
 
+            Err += testHiparLaunchConfig1D(2 * N1, N1 + 3);
+
             Err += testHiparFor1DMultiple1D(1, N1);
             Err += testHiparFor1DMultiple1D((N1 + 1) / 2, N1);
             Err += testHiparFor1DMultiple1D(2 * N1, N1);

From e8ef30081c0971da1f0835673fe52ea89409d939 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Fri, 20 Mar 2026 16:20:52 -0600
Subject: [PATCH 15/25] Simplify and add comment to TeamScratch

---
 components/omega/src/infra/OmegaKokkosHiPar.h        | 12 ++++++------
 components/omega/test/infra/OmegaKokkosHiParTest.cpp |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index 232bc1100182..0caa6be49d1f 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -31,18 +31,18 @@ constexpr int OMEGA_TEAMSIZE = 1;
 #define INNER_LAMBDA [=]
 // #define INNER_LAMBDA [&]
 
+// Helper struct for providing information about scratch memory requirements
+// TeamScratch<Real, I4>(4, 8) stores the number of bytes needed for
+// 4 values of type Real and 8 values of type I4
 template <class... T> struct TeamScratch {
    size_t BytesPerTeam = 0;
 
    TeamScratch() = default;
 
-   template <int N> TeamScratch(const int (&NVals)[N]) {
-      static_assert(N == sizeof...(T));
-      int I = 0;
-      ((BytesPerTeam += sizeof(T) * NVals[I++]), ...);
+   template <class... ArgT> TeamScratch(ArgT... Args) {
+      static_assert(sizeof...(ArgT) == sizeof...(T));
+      ((BytesPerTeam += sizeof(T) * Args), ...);
    }
-
-   TeamScratch(int NVals) : TeamScratch({{NVals}}) {}
 };
 
 template <int N> struct LaunchConfig {
diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index 2e74f01b9dbb..f243e3d6b6c8 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -399,7 +399,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) {
 #endif
 
    auto LConfig =
-       LaunchConfig({N1}, TeamSize, TeamScratch<Real, I4>({N2, N2 - 2}));
+       LaunchConfig({N1}, TeamSize, TeamScratch<Real, I4>(N2, N2 - 2));
    parallelForOuter(
        LConfig, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
           ArrayScratch1DI4 ScratchA(Team.team_scratch(0), N2);

From db67c73d9eb376dfc6fcfe6310a86d2b7ab53430 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Thu, 26 Mar 2026 15:41:56 -0600
Subject: [PATCH 16/25] Support Range in parallelSearchInner

---
 components/omega/src/infra/OmegaKokkosHiPar.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index 0caa6be49d1f..720b52086bf4 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -249,7 +249,7 @@ KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound,
 // returns the first index in the range [0, UpperBound) for which the input
 // functor returns true. If no such index is found it returns -1
 template <class F>
-KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound,
+KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, Range Rng,
                                          F &&Functor, int &Idx) {
    static_assert(std::is_same_v<std::invoke_result_t<F, int>, bool>,
                  "parallelSearchInner requires a functor that takes an int and "
@@ -260,14 +260,14 @@ KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound,
    // on CPUs
 #ifndef OMEGA_TARGET_DEVICE
    Idx = -1;
-   for (int I = 0; I < UpperBound; ++I) {
+   for (int I = Rng.First; I <= Rng.Last; ++I) {
       if (Functor(I)) {
          Idx = I;
          break;
       }
    }
 #else
-   const auto Policy = TeamThreadRange(Team, UpperBound);
+   const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1);
    Kokkos::parallel_reduce(
        Policy,
        INNER_LAMBDA(int I, int &Accum) {
@@ -282,6 +282,13 @@ KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound,
 #endif
 }
 
+template <class F>
+KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound,
+                                         F &&Functor, int &Idx) {
+   parallelSearchInner(Team, Range{0, UpperBound - 1}, std::forward<F>(Functor),
+                       Idx);
+}
+
 } // end namespace OMEGA
 
 //===----------------------------------------------------------------------===//

From 41efca9eff84b4ac8f16b8ed17f212bafded7f6e Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Fri, 27 Mar 2026 16:33:21 -0600
Subject: [PATCH 17/25] Add teamScratch

---
 components/omega/src/infra/OmegaKokkosHiPar.h        | 4 ++++
 components/omega/test/infra/OmegaKokkosHiParTest.cpp | 4 ++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index 720b52086bf4..0f3266e5dc5a 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -73,6 +73,10 @@ KOKKOS_INLINE_FUNCTION void teamBarrier(const TeamMember &Team) {
    Team.team_barrier();
 }
 
+KOKKOS_INLINE_FUNCTION decltype(auto) teamScratch(const TeamMember &Team) {
+   return Team.team_scratch(0);
+}
+
 // parallelForOuter: with label and with launch config
 template <int N, class F>
 inline void parallelForOuter(const std::string &Label,
diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index f243e3d6b6c8..7f3a09da6ed8 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -402,7 +402,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) {
        LaunchConfig({N1}, TeamSize, TeamScratch<Real, I4>(N2, N2 - 2));
    parallelForOuter(
        LConfig, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
-          ArrayScratch1DI4 ScratchA(Team.team_scratch(0), N2);
+          ArrayScratch1DI4 ScratchA(teamScratch(Team), N2);
 
           parallelForInner(
               Team, N2, INNER_LAMBDA(int J2) {
@@ -411,7 +411,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) {
 
           teamBarrier(Team);
 
-          ArrayScratch1DReal ScratchB(Team.team_scratch(0), N2 - 2);
+          ArrayScratch1DReal ScratchB(teamScratch(Team), N2 - 2);
 
           parallelForInner(
               Team, Range{1, N2 - 2}, INNER_LAMBDA(int J2) {

From 41bf4a9e116bc064cf3612c95ee46330fe6074bb Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Fri, 27 Mar 2026 17:03:49 -0600
Subject: [PATCH 18/25] Add namespace to TeamThreadRange

---
 components/omega/src/infra/OmegaKokkosHiPar.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index 0f3266e5dc5a..6a2093e4994b 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -204,7 +204,7 @@ struct Range {
 template <class F>
 KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, Range Rng,
                                       F &&Functor) {
-   const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1);
+   const auto Policy = Kokkos::TeamThreadRange(Team, Rng.First, Rng.Last + 1);
    Kokkos::parallel_for(Policy, std::forward<F>(Functor));
 }
 
@@ -219,7 +219,7 @@ KOKKOS_FUNCTION void parallelForInner(const TeamMember &Team, int UpperBound,
 template <class F, class... R>
 KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, Range Rng,
                                          F &&Functor, R &&...Reducers) {
-   const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1);
+   const auto Policy = Kokkos::TeamThreadRange(Team, Rng.First, Rng.Last + 1);
    Kokkos::parallel_reduce(Policy, std::forward<F>(Functor),
                            std::forward<R>(Reducers)...);
 }
@@ -236,7 +236,7 @@ KOKKOS_FUNCTION void parallelReduceInner(const TeamMember &Team, int UpperBound,
 template <class F, class... R>
 KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, Range Rng,
                                        F &&Functor, R &&...Reducers) {
-   const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1);
+   const auto Policy = Kokkos::TeamThreadRange(Team, Rng.First, Rng.Last + 1);
    Kokkos::parallel_scan(Policy, std::forward<F>(Functor),
                          std::forward<R>(Reducers)...);
 }
@@ -271,7 +271,7 @@ KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, Range Rng,
       }
    }
 #else
-   const auto Policy = TeamThreadRange(Team, Rng.First, Rng.Last + 1);
+   const auto Policy = Kokkos::TeamThreadRange(Team, Rng.First, Rng.Last + 1);
    Kokkos::parallel_reduce(
        Policy,
        INNER_LAMBDA(int I, int &Accum) {

From 5145145906b350a7cd1722bf99992de66239de44 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Wed, 1 Apr 2026 11:06:47 -0600
Subject: [PATCH 19/25] Fix tests

---
 components/omega/test/infra/OmegaKokkosHiParTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index 7f3a09da6ed8..01587e7ed558 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -399,7 +399,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) {
 #endif
 
    auto LConfig =
-       LaunchConfig({N1}, TeamSize, TeamScratch<Real, I4>(N2, N2 - 2));
+       LaunchConfig({N1}, TeamSize, TeamScratch<Real, I4>(N2 - 2, N2));
    parallelForOuter(
        LConfig, KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
           ArrayScratch1DI4 ScratchA(teamScratch(Team), N2);

From dbbb9089ff1b44f9d8a2c73da88d9448b05ab314 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Thu, 19 Mar 2026 15:25:09 -0600
Subject: [PATCH 20/25] Update parallel loops docs

---
 .../omega/doc/devGuide/ParallelLoops.md       | 97 +++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md
index 7d4a5ae190ab..a9c0d0e0e22c 100644
--- a/components/omega/doc/devGuide/ParallelLoops.md
+++ b/components/omega/doc/devGuide/ParallelLoops.md
@@ -150,6 +150,26 @@ To do that Kokkos provides the `single` function. To execute a statement once pe
   });
 ```
 
+### Inner Iteration Ranges
+
+There are two ways of specifying the iteration range of an inner loop.
+The first takes the total number of iterations `N` as the second argument
+```c++
+   parallelForInner(Team, N, INNER_LAMBDA (int K) {
+   });
+```
+and the loop index `K` takes values from `0` up to and including `N - 1`.
+The second way uses a helper struct `Range` to provide a range of valid indices
+```c++
+   parallelForInner(Team, Range{N1, N2}, INNER_LAMBDA (int K) {
+   });
+```
+Note that this range is inclusive, i.e. the loop index `K` takes values from `N1` up to and including `N2`.
+This means that `Range{0, N}` specifies a diffrent range than the first example.
+For simplicity, most examples in this document use the first way of specyfying the range,
+but a `Range` argument can be passed to all inner iteration patters.
+
+
 ### parallelForOuter
 To start outer iterations over a multidimensional index range the `parallelForOuter` wrapper is available.
 A call to `parallelForOuter` might look as follows.
@@ -301,3 +321,80 @@ the matrix element is above a certain threshold. If no element matches the condi
    });
 ```
 Labels are not supported by `parallelSearchInner` and only one-dimensional index range can be used.
+
+### Launch Config
+
+While specyfing loop bounds is enough to start an outer parallel loop, sometimes more control over the underlaying
+Kokkos `TeamPolicy` is desired. The most common use case is utilizing scratch memory, a concept discussed more
+thoroughly in the next sub-section. To enable more control, outer loops can be launched by providing
+a `LaunchConfig` struct as the first argument, which is composed of three parts:
+- loop bounds,
+- team size,
+- amount of scratch memory.
+
+For example, the following snippet launches a loop iterating over a two-dimensional index range
+with team size of 32 and enough scratch memory for 8 `Real` values and 4 `I4` values per team.
+```c++
+   auto LConfig = LaunchConfig({N1, N2}, 32, TeamScratch<Real, I4>(8, 4));
+   parallelForOuter(LConfig,
+       KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) {
+   });
+```
+It is not necessary to provide all three arguments to `LaunchConfig`. If you want the default team size,
+or you don't need any scratch memory, you can use the follwing constructors.
+```c++
+   auto LConfig1 = LaunchConfig({N1, N2}, TeamScratch<Real, I4>(8, 4));
+   auto LConfig2 = LaunchConfig({N1, N2}, 32);
+```
+For simplicity, most examples in this document use the simple form of launching outer loops with just the bounds,
+but `LaunchConfig` can be used for all types of outer parallel loops.
+Inner parallel loops cannot use `LaunchConfig`.
+
+### Team Scratch Memory
+
+In hierarchical code, it is often useful to have some amount of scratch memory private to each team.
+Scratch memory enables reuse of expensive to compute data in inner loops.
+To enable scratch memory, the outer loops needs to be launched with the `LaunchConfig` parameter described above,
+configured with the requested number of scratch values.
+Inside the outer loop, unmanaged scratch arrays can be created from a pool of memory accesible
+by calling the `teamScratch(Team)` function.
+Scratch arrays have a different type than normal Omega arrays, for example `ArrayScratch1DReal` is the
+type of a 1D scratch array of Reals. They also cannot have labels.
+
+As an example, the following code uses scratch memory to compute an expensive function on elements of a 2D array `A`.
+It then computes finite differences along the second dimension of the scratch array, and stores them in `A`.
+By using scratch memory, the expensive function is only computed once for every element, and there is no need for global memory allocation.
+```c++
+   Array2DReal A("A", N1, N2);
+   parallelForOuter(
+       LaunchConfig({N1}, TeamScratch<Real>(N2)),
+       KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
+
+        ArrayScratch1DReal SA(teamScratch(Team), N2);
+
+        parallelForInner(Team, N2, INNER_LAMBDA (int J2) {
+            SA(J2) = expensiveFunc(A(J1, J2));
+        });
+
+        teamBarrier(Team);
+
+        parallelForInner(Team, N2, INNER_LAMBDA (int J2) {
+
+            const int J2M1 = Kokkos::max(J2 - 1, 0);
+            const int J2P1 = Kokkos::min(J2 + 1, N2 - 1);
+
+            A(J1, J2) = SA(J2P1) - SA(J2M1);
+        });
+   });
+```
+You can create multiple scratch arrays of different types, as in the following code.
+```c++
+   parallelForOuter(
+       LaunchConfig({N1}, TeamScratch<Real, I4>(4, 8)),
+       KOKKOS_LAMBDA(int J1, const TeamMember &Team) {
+        ArrayScratch1DI4 ScratchI4(teamScratch(Team), 8);
+        ArrayScratch1DReal ScratchReal(teamScratch(Team), 4);
+   });
+```
+As the above example illustrates, the order in which the arrays are created inside the outer region
+doesn't need to match the order of arguments to `TeamScratch`.

From 12b39ae36f911ec024247f657d4b0aa424b86ce6 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Wed, 1 Apr 2026 14:31:17 -0600
Subject: [PATCH 21/25] Simplify par search tests using Range

---
 components/omega/test/infra/OmegaKokkosHiParTest.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index 01587e7ed558..b5b08de34908 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -349,12 +349,10 @@ Error testHiparFor1DSearch1D(int N2) {
              const int End   = 3 * N2 / 4 + J1 % (N2 / 4);
              int SearchIdx;
              parallelSearchInner(
-                 Team, End - Start,
-                 INNER_LAMBDA(int J2) {
-                    return DataD(J1, J2 + Start) >= Threshold;
-                 },
+                 Team, Range{Start, End - 1},
+                 INNER_LAMBDA(int J2) { return DataD(J1, J2) >= Threshold; },
                  SearchIdx);
-             IdxD(J1) = SearchIdx == -1 ? SearchIdx : SearchIdx + Start;
+             IdxD(J1) = SearchIdx;
           });
 
       if (!arraysEqual(IdxD, RefIdxH)) {

From 8eb573adbf95fcff736db6b139691dc8699c0fe1 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Thu, 2 Apr 2026 13:47:22 -0600
Subject: [PATCH 22/25] Fix scratch alignment issue

---
 components/omega/src/infra/OmegaKokkosHiPar.h | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index 6a2093e4994b..c810d6f880d7 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -16,10 +16,13 @@ using TeamPolicy      = Kokkos::TeamPolicy<ExecSpace>;
 using TeamMember      = TeamPolicy::member_type;
 using ScratchMemSpace = ExecSpace::scratch_memory_space;
 using Kokkos::PerTeam;
-using ArrayScratch1DReal =
-    Kokkos::View<Real *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
-using ArrayScratch1DI4 =
-    Kokkos::View<I4 *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
+
+template <class T>
+using ArrayScratch1D =
+    Kokkos::View<T *, ScratchMemSpace, Kokkos::MemoryUnmanaged>;
+
+using ArrayScratch1DReal = ArrayScratch1D<Real>;
+using ArrayScratch1DI4   = ArrayScratch1D<I4>;
 
 /// team_size for hierarchical parallelism
 #ifdef OMEGA_TARGET_DEVICE
@@ -41,7 +44,7 @@ template <class... T> struct TeamScratch {
 
    template <class... ArgT> TeamScratch(ArgT... Args) {
       static_assert(sizeof...(ArgT) == sizeof...(T));
-      ((BytesPerTeam += sizeof(T) * Args), ...);
+      ((BytesPerTeam += ArrayScratch1D<T>::shmem_size(Args)), ...);
    }
 };
 

From 6753a0b7e10f7fcf5a7b99f4361dfdeb6e1103d3 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Thu, 2 Apr 2026 15:02:23 -0600
Subject: [PATCH 23/25] Add copilot fixes

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 components/omega/doc/devGuide/ParallelLoops.md       | 12 ++++++------
 components/omega/src/infra/OmegaKokkos.h             |  4 +++-
 components/omega/src/infra/OmegaKokkosHiPar.h        |  2 +-
 components/omega/test/infra/OmegaKokkosHiParTest.cpp |  2 +-
 4 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md
index a9c0d0e0e22c..804f04ad49b7 100644
--- a/components/omega/doc/devGuide/ParallelLoops.md
+++ b/components/omega/doc/devGuide/ParallelLoops.md
@@ -165,9 +165,9 @@ The second way uses a helper struct `Range` to provide a range of valid indices
    });
 ```
 Note that this range is inclusive, i.e. the loop index `K` takes values from `N1` up to and including `N2`.
-This means that `Range{0, N}` specifies a diffrent range than the first example.
-For simplicity, most examples in this document use the first way of specyfying the range,
-but a `Range` argument can be passed to all inner iteration patters.
+This means that `Range{0, N}` specifies a different range than the first example.
+For simplicity, most examples in this document use the first way of specifying the range,
+but a `Range` argument can be passed to all inner iteration patterns.
 
 
 ### parallelForOuter
@@ -324,7 +324,7 @@ Labels are not supported by `parallelSearchInner` and only one-dimensional index
 
 ### Launch Config
 
-While specyfing loop bounds is enough to start an outer parallel loop, sometimes more control over the underlaying
+While specifying loop bounds is enough to start an outer parallel loop, sometimes more control over the underlying
 Kokkos `TeamPolicy` is desired. The most common use case is utilizing scratch memory, a concept discussed more
 thoroughly in the next sub-section. To enable more control, outer loops can be launched by providing
 a `LaunchConfig` struct as the first argument, which is composed of three parts:
@@ -341,7 +341,7 @@ with team size of 32 and enough scratch memory for 8 `Real` values and 4 `I4` va
    });
 ```
 It is not necessary to provide all three arguments to `LaunchConfig`. If you want the default team size,
-or you don't need any scratch memory, you can use the follwing constructors.
+or you don't need any scratch memory, you can use the following constructors.
 ```c++
    auto LConfig1 = LaunchConfig({N1, N2}, TeamScratch<Real, I4>(8, 4));
    auto LConfig2 = LaunchConfig({N1, N2}, 32);
@@ -356,7 +356,7 @@ In hierarchical code, it is often useful to have some amount of scratch memory p
 Scratch memory enables reuse of expensive to compute data in inner loops.
 To enable scratch memory, the outer loops needs to be launched with the `LaunchConfig` parameter described above,
 configured with the requested number of scratch values.
-Inside the outer loop, unmanaged scratch arrays can be created from a pool of memory accesible
+Inside the outer loop, unmanaged scratch arrays can be created from a pool of memory accessible
 by calling the `teamScratch(Team)` function.
 Scratch arrays have a different type than normal Omega arrays, for example `ArrayScratch1DReal` is the
 type of a 1D scratch array of Reals. They also cannot have labels.
diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h
index 2ca20c49fd3f..9c5466574e19 100644
--- a/components/omega/src/infra/OmegaKokkos.h
+++ b/components/omega/src/infra/OmegaKokkos.h
@@ -11,6 +11,8 @@
 
 #include "DataTypes.h"
 #include "Error.h"
+#include <algorithm>
+#include <array>
 #include <functional>
 #include <type_traits>
 #include <utility>
@@ -101,7 +103,7 @@ bool arraysEqual(const ArrayTypeA &A, const ArrayTypeB &B) {
    OMEGA_REQUIRE(A.span_is_contiguous() && B.span_is_contiguous(),
                  "arraysEqual works only for contiguous arrays");
    OMEGA_REQUIRE(A.size() == B.size(),
-                 "arrayEqual can only compare arrays of equal size");
+                 "arraysEqual can only compare arrays of equal size");
 
    // This is a debug utility and not performance critical
    // so just copy to the host and compare there
diff --git a/components/omega/src/infra/OmegaKokkosHiPar.h b/components/omega/src/infra/OmegaKokkosHiPar.h
index c810d6f880d7..f100cfdb5025 100644
--- a/components/omega/src/infra/OmegaKokkosHiPar.h
+++ b/components/omega/src/infra/OmegaKokkosHiPar.h
@@ -181,7 +181,7 @@ inline void parallelReduceOuter(const LaunchConfig<N> &LConfig, F &&Functor,
 
 // parallelReduceOuter: with label and with array bounds
 template <int N, class F, class... R>
-inline void parallelReduceOuter(const std::string Label,
+inline void parallelReduceOuter(const std::string &Label,
                                 const int (&UpperBounds)[N], F &&Functor,
                                 R &&...Reducers) {
    parallelReduceOuter(Label, LaunchConfig(UpperBounds),
diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
index b5b08de34908..dbffd9617f07 100644
--- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp
+++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp
@@ -390,7 +390,7 @@ Error testHiparLaunchConfig1D(int N1, int N2) {
 
    Array2DReal OutD("OutD", N1, N2 - 3);
 
-#ifdef OMEGA_DEVICE
+#ifdef OMEGA_TARGET_DEVICE
    const int TeamSize = 32;
 #else
    const int TeamSize = 1;

From 23b78fc7e5dfec8b06583227484e9d9a03c5002e Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Thu, 2 Apr 2026 17:10:59 -0600
Subject: [PATCH 24/25] Use Range in inner loops where possible

Co-authored-by: mwarusz <5665111+mwarusz@users.noreply.github.com>
---
 components/omega/src/ocn/Tendencies.cpp       | 33 +++++--------
 components/omega/src/ocn/VertCoord.cpp        | 49 +++++++------------
 .../omega/src/timeStepping/TimeStepper.cpp    | 48 +++++++-----------
 3 files changed, 47 insertions(+), 83 deletions(-)

diff --git a/components/omega/src/ocn/Tendencies.cpp b/components/omega/src/ocn/Tendencies.cpp
index 3c3aacb8dde5..680ad0f381e8 100644
--- a/components/omega/src/ocn/Tendencies.cpp
+++ b/components/omega/src/ocn/Tendencies.cpp
@@ -379,15 +379,12 @@ void Tendencies::computeThicknessTendenciesOnly(
 
    parallelForOuter(
        {Mesh->NCellsAll}, KOKKOS_LAMBDA(int ICell, const TeamMember &Team) {
-          const int KMin   = MinLayerCell(ICell);
-          const int KMax   = MaxLayerCell(ICell);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerCell(ICell);
+          const int KMax = MaxLayerCell(ICell);
 
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K                     = KMin + KChunk;
-                 LocLayerThicknessTend(ICell, K) = 0;
-              });
+              Team, Range{KMin, KMax},
+              INNER_LAMBDA(int K) { LocLayerThicknessTend(ICell, K) = 0; });
        });
 
    // Compute thickness flux divergence
@@ -454,15 +451,12 @@ void Tendencies::computeVelocityTendenciesOnly(
 
    parallelForOuter(
        {Mesh->NEdgesAll}, KOKKOS_LAMBDA(int IEdge, const TeamMember &Team) {
-          const int KMin   = MinLayerEdgeBot(IEdge);
-          const int KMax   = MaxLayerEdgeTop(IEdge);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerEdgeBot(IEdge);
+          const int KMax = MaxLayerEdgeTop(IEdge);
 
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K                     = KMin + KChunk;
-                 LocNormalVelocityTend(IEdge, K) = 0;
-              });
+              Team, Range{KMin, KMax},
+              INNER_LAMBDA(int K) { LocNormalVelocityTend(IEdge, K) = 0; });
        });
 
    // Compute potential vorticity horizontal advection
@@ -663,14 +657,11 @@ void Tendencies::computeTracerTendenciesOnly(
    parallelForOuter(
        {NTracers, Mesh->NCellsAll},
        KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) {
-          const int KMin   = MinLayerCell(ICell);
-          const int KMax   = MaxLayerCell(ICell);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerCell(ICell);
+          const int KMax = MaxLayerCell(ICell);
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K                = KMin + KChunk;
-                 LocTracerTend(L, ICell, K) = 0;
-              });
+              Team, Range{KMin, KMax},
+              INNER_LAMBDA(int K) { LocTracerTend(L, ICell, K) = 0; });
        });
 
    // compute tracer horizotal advection
diff --git a/components/omega/src/ocn/VertCoord.cpp b/components/omega/src/ocn/VertCoord.cpp
index b481439202b8..b6a2adc30b49 100644
--- a/components/omega/src/ocn/VertCoord.cpp
+++ b/components/omega/src/ocn/VertCoord.cpp
@@ -768,11 +768,8 @@ void VertCoord::setMasks() {
           const I4 KMax = LocMaxLyrEdgeTop(IEdge);
 
           parallelForInner(
-              Team, KMax - KMin + 1, INNER_LAMBDA(int K) {
-                 I4 KLyr = KMin + K;
-
-                 LocEdgeMask(IEdge, KLyr) = 1._Real;
-              });
+              Team, Range{KMin, KMax},
+              INNER_LAMBDA(int K) { LocEdgeMask(IEdge, K) = 1._Real; });
        });
 
    EdgeMaskH = createHostMirrorCopy(EdgeMask);
@@ -791,11 +788,8 @@ void VertCoord::setMasks() {
           const I4 KMax = LocMaxLyrCell(ICell);
 
           parallelForInner(
-              Team, KMax - KMin + 1, INNER_LAMBDA(int K) {
-                 I4 KLyr = KMin + K;
-
-                 LocCellMask(ICell, KLyr) = 1._Real;
-              });
+              Team, Range{KMin, KMax},
+              INNER_LAMBDA(int K) { LocCellMask(ICell, K) = 1._Real; });
        });
 
    CellMaskH = createHostMirrorCopy(CellMask);
@@ -815,11 +809,8 @@ void VertCoord::setMasks() {
           const I4 KMax = LocMaxLyrVrtxBot(IVertex);
 
           parallelForInner(
-              Team, KMax - KMin + 1, INNER_LAMBDA(int K) {
-                 I4 KLyr = KMin + K;
-
-                 LocVrtxMask(IVertex, KLyr) = 1._Real;
-              });
+              Team, Range{KMin, KMax},
+              INNER_LAMBDA(int K) { LocVrtxMask(IVertex, K) = 1._Real; });
        });
 
    VertexMaskH = createHostMirrorCopy(VertexMask);
@@ -868,21 +859,18 @@ void VertCoord::computePressure(
    parallelForOuter(
        "computePressure", {NCellsAll},
        KOKKOS_LAMBDA(int ICell, const TeamMember &Team) {
-          const I4 KMin   = LocMinLayerCell(ICell);
-          const I4 KMax   = LocMaxLayerCell(ICell);
-          const I4 KRange = vertRange(KMin, KMax);
-
+          const I4 KMin               = LocMinLayerCell(ICell);
+          const I4 KMax               = LocMaxLayerCell(ICell);
           LocPressInterf(ICell, KMin) = SurfacePressure(ICell);
           parallelScanInner(
-              Team, KRange, INNER_LAMBDA(int K, Real &Accum, bool IsFinal) {
-                 const I4 KLyr  = K + KMin;
-                 Real Increment = Gravity * RhoSw * LayerThickness(ICell, KLyr);
+              Team, Range{KMin, KMax}, INNER_LAMBDA(int K, Real &Accum, bool IsFinal) {
+                 Real Increment = Gravity * RhoSw * LayerThickness(ICell, K);
                  Accum += Increment;
 
                  if (IsFinal) {
-                    LocPressInterf(ICell, KLyr + 1) =
+                    LocPressInterf(ICell, K + 1) =
                         SurfacePressure(ICell) + Accum;
-                    LocPressMid(ICell, KLyr) =
+                    LocPressMid(ICell, K) =
                         SurfacePressure(ICell) + Accum - 0.5 * Increment;
                  }
               });
@@ -982,10 +970,8 @@ void VertCoord::computeTargetThickness() {
    parallelForOuter(
        "computeTargetThickness", {NCellsAll},
        KOKKOS_LAMBDA(int ICell, const TeamMember &Team) {
-          const I4 KMin   = LocMinLayerCell(ICell);
-          const I4 KMax   = LocMaxLayerCell(ICell);
-          const I4 KRange = vertRange(KMin, KMax);
-
+          const I4 KMin = LocMinLayerCell(ICell);
+          const I4 KMax = LocMaxLayerCell(ICell);
           Real Coeff =
               (LocPressInterf(ICell, KMax + 1) - LocPressInterf(ICell, KMin)) /
               (Gravity * RhoSw);
@@ -993,11 +979,10 @@ void VertCoord::computeTargetThickness() {
           Real SumWh   = 0;
           Real SumRefH = 0;
           parallelReduceInner(
-              Team, KRange,
+              Team, Range{KMin, KMax},
               INNER_LAMBDA(const int K, Real &LocalWh, Real &LocalSum) {
-                 const I4 KLyr            = K + KMin;
-                 const Real RefLayerThick = LocRefLayerThick(ICell, KLyr);
-                 LocalWh += LocVertCoordMvmtWgts(KLyr) * RefLayerThick;
+                 const Real RefLayerThick = LocRefLayerThick(ICell, K);
+                 LocalWh += LocVertCoordMvmtWgts(K) * RefLayerThick;
                  LocalSum += RefLayerThick;
               },
               SumWh, SumRefH);
diff --git a/components/omega/src/timeStepping/TimeStepper.cpp b/components/omega/src/timeStepping/TimeStepper.cpp
index 7ee026c384b7..f52a8696547e 100644
--- a/components/omega/src/timeStepping/TimeStepper.cpp
+++ b/components/omega/src/timeStepping/TimeStepper.cpp
@@ -402,13 +402,11 @@ void TimeStepper::updateThicknessByTend(OceanState *State1, int TimeLevel1,
    parallelForOuter(
        "updateThickByTend", {Mesh->NCellsAll},
        KOKKOS_LAMBDA(int ICell, const TeamMember &Team) {
-          const int KMin   = MinLayerCell(ICell);
-          const int KMax   = MaxLayerCell(ICell);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerCell(ICell);
+          const int KMax = MaxLayerCell(ICell);
 
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K = KMin + KChunk;
+              Team, Range{KMin, KMax}, INNER_LAMBDA(int K) {
                  LayerThick1(ICell, K) =
                      LayerThick2(ICell, K) +
                      CoeffSeconds * LayerThickTend(ICell, K);
@@ -437,13 +435,11 @@ void TimeStepper::updateVelocityByTend(OceanState *State1, int TimeLevel1,
    parallelForOuter(
        "updateVelByTend", {Mesh->NEdgesAll},
        KOKKOS_LAMBDA(int IEdge, const TeamMember &Team) {
-          const int KMin   = MinLayerEdgeBot(IEdge);
-          const int KMax   = MaxLayerEdgeTop(IEdge);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerEdgeBot(IEdge);
+          const int KMax = MaxLayerEdgeTop(IEdge);
 
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K          = KMin + KChunk;
+              Team, Range{KMin, KMax}, INNER_LAMBDA(int K) {
                  NormalVel1(IEdge, K) = NormalVel2(IEdge, K) +
                                         CoeffSeconds * NormalVelTend(IEdge, K);
               });
@@ -484,12 +480,10 @@ void TimeStepper::updateTracersByTend(const Array3DReal &NextTracers,
    parallelForOuter(
        "updateTracersByTend", {NTracers, Mesh->NCellsAll},
        KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) {
-          const int KMin   = MinLayerCell(ICell);
-          const int KMax   = MaxLayerCell(ICell);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerCell(ICell);
+          const int KMax = MaxLayerCell(ICell);
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K = KMin + KChunk;
+              Team, Range{KMin, KMax}, INNER_LAMBDA(int K) {
                  NextTracers(L, ICell, K) =
                      (CurTracers(L, ICell, K) * LayerThick2(ICell, K) +
                       CoeffSeconds * TracerTend(L, ICell, K)) /
@@ -512,12 +506,10 @@ void TimeStepper::weightTracers(const Array3DReal &NextTracers,
    parallelForOuter(
        "weightTracers", {NTracers, Mesh->NCellsAll},
        KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) {
-          const int KMin   = MinLayerCell(ICell);
-          const int KMax   = MaxLayerCell(ICell);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerCell(ICell);
+          const int KMax = MaxLayerCell(ICell);
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K = KMin + KChunk;
+              Team, Range{KMin, KMax}, INNER_LAMBDA(int K) {
                  NextTracers(L, ICell, K) =
                      CurTracers(L, ICell, K) * CurThickness(ICell, K);
               });
@@ -541,12 +533,10 @@ void TimeStepper::accumulateTracersUpdate(const Array3DReal &AccumTracer,
    parallelForOuter(
        "accumulateTracersUpdate", {NTracers, Mesh->NCellsAll},
        KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) {
-          const int KMin   = MinLayerCell(ICell);
-          const int KMax   = MaxLayerCell(ICell);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerCell(ICell);
+          const int KMax = MaxLayerCell(ICell);
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K = KMin + KChunk;
+              Team, Range{KMin, KMax}, INNER_LAMBDA(int K) {
                  AccumTracer(L, ICell, K) +=
                      CoeffSeconds * TracerTend(L, ICell, K);
               });
@@ -567,12 +557,10 @@ void TimeStepper::finalizeTracersUpdate(const Array3DReal &NextTracers,
    parallelForOuter(
        "finalizeTracersUpdate", {NTracers, Mesh->NCellsAll},
        KOKKOS_LAMBDA(int L, int ICell, const TeamMember &Team) {
-          const int KMin   = MinLayerCell(ICell);
-          const int KMax   = MaxLayerCell(ICell);
-          const int KRange = vertRange(KMin, KMax);
+          const int KMin = MinLayerCell(ICell);
+          const int KMax = MaxLayerCell(ICell);
           parallelForInner(
-              Team, KRange, INNER_LAMBDA(int KChunk) {
-                 const int K = KMin + KChunk;
+              Team, Range{KMin, KMax}, INNER_LAMBDA(int K) {
                  NextTracers(L, ICell, K) /= NextThick(ICell, K);
               });
        });

From 62496e83c575355ebdf04a540f404b925df3b2f1 Mon Sep 17 00:00:00 2001
From: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
Date: Thu, 2 Apr 2026 17:09:00 -0600
Subject: [PATCH 25/25] Use Team, teamBarrier, and teamScratch everywhere

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 components/omega/src/base/TriDiagSolvers.h    | 68 +++++++++----------
 components/omega/src/ocn/VertAdv.cpp          | 23 +++----
 .../omega/test/base/TriDiagSolversTest.cpp    | 42 ++++++------
 3 files changed, 66 insertions(+), 67 deletions(-)

diff --git a/components/omega/src/base/TriDiagSolvers.h b/components/omega/src/base/TriDiagSolvers.h
index 0b9a947667b6..095c53d47d2c 100644
--- a/components/omega/src/base/TriDiagSolvers.h
+++ b/components/omega/src/base/TriDiagSolvers.h
@@ -47,9 +47,9 @@ struct TriDiagScratch {
    TriDiagScratchArray X; // rhs on input, contains solution after calling solve
 
    // Constructor takes team member and system size
-   KOKKOS_FUNCTION TriDiagScratch(const TeamMember &Member, int NRow)
-       : DL(Member.team_scratch(0), NRow), D(Member.team_scratch(0), NRow),
-         DU(Member.team_scratch(0), NRow), X(Member.team_scratch(0), NRow) {}
+   KOKKOS_FUNCTION TriDiagScratch(const TeamMember &Team, int NRow)
+       : DL(teamScratch(Team), NRow), D(teamScratch(Team), NRow),
+         DU(teamScratch(Team), NRow), X(teamScratch(Team), NRow) {}
 };
 
 // Thomas algorithm solver for general tridiagonal systems
@@ -66,7 +66,7 @@ struct ThomasSolver {
    // Solve the system defined in the scratch data argument `Scratch`
    // This a team-level function that needs to be called inside a
    // parallel loop using TeamPolicy, hence it has a team member argument
-   static void KOKKOS_FUNCTION solve(const TeamMember &Member,
+   static void KOKKOS_FUNCTION solve(const TeamMember &Team,
                                      const TriDiagScratch &Scratch) {
       const int NRow = Scratch.X.extent_int(0);
 
@@ -103,10 +103,10 @@ struct ThomasSolver {
       auto LConfig = makeLaunchConfig(NBatch, NRow);
 
       parallelForOuter(
-          LConfig, KOKKOS_LAMBDA(const int IChunk, const TeamMember &Member) {
+          LConfig, KOKKOS_LAMBDA(const int IChunk, const TeamMember &Team) {
              const int IStart = IChunk * VecLength;
 
-             TriDiagScratch Scratch(Member, NRow);
+             TriDiagScratch Scratch(Team, NRow);
 
              for (int K = 0; K < NRow; ++K) {
                 for (int IVec = 0; IVec < VecLength; ++IVec) {
@@ -120,7 +120,7 @@ struct ThomasSolver {
                 }
              }
 
-             solve(Member, Scratch);
+             solve(Team, Scratch);
 
              for (int IVec = 0; IVec < VecLength; ++IVec) {
                 for (int K = 0; K < NRow; ++K) {
@@ -147,12 +147,12 @@ struct PCRSolver {
    // Solve the system defined in the scratch data argument `Scratch`
    // This a team-level function that needs to be called inside a
    // parallel loop using TeamPolicy, hence it has a team member argument
-   static void KOKKOS_FUNCTION solve(const TeamMember &Member,
+   static void KOKKOS_FUNCTION solve(const TeamMember &Team,
                                      const TriDiagScratch &Scratch) {
       const int NRow = Scratch.X.extent_int(0);
 
       // Row index = Thread index
-      const int K = Member.team_rank();
+      const int K = Team.team_rank();
 
       // Number of reduction levels
       const int NLevels = Kokkos::ceil(Kokkos::log2(NRow));
@@ -178,7 +178,7 @@ struct PCRSolver {
          const Real NewDL = alpha * Scratch.DL(Kmh, 0);
          const Real NewDU = gamma * Scratch.DU(Kph, 0);
 
-         Member.team_barrier();
+         teamBarrier(Team);
 
          // Store new system coefficients
          Scratch.D(K, 0)  = NewD;
@@ -186,7 +186,7 @@ struct PCRSolver {
          Scratch.DL(K, 0) = NewDL;
          Scratch.DU(K, 0) = NewDU;
 
-         Member.team_barrier();
+         teamBarrier(Team);
       }
 
       const int Stride = 1 << (NLevels - 1);
@@ -220,21 +220,21 @@ struct PCRSolver {
       auto LConfig     = makeLaunchConfig(NBatch, NRow);
 
       parallelForOuter(
-          LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Member) {
-             const int K = Member.team_rank();
+          LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Team) {
+             const int K = Team.team_rank();
 
-             TriDiagScratch Scratch(Member, NRow);
+             TriDiagScratch Scratch(Team, NRow);
 
              Scratch.DL(K, 0) = DL(I, K);
              Scratch.D(K, 0)  = D(I, K);
              Scratch.DU(K, 0) = DU(I, K);
              Scratch.X(K, 0)  = X(I, K);
 
-             Member.team_barrier();
+             teamBarrier(Team);
 
-             solve(Member, Scratch);
+             solve(Team, Scratch);
 
-             Member.team_barrier();
+             teamBarrier(Team);
 
              X(I, K) = Scratch.X(K, 0);
           });
@@ -250,9 +250,9 @@ struct TriDiagDiffScratch {
    TriDiagScratchArray X; // rhs on input, contains solution after calling solve
    TriDiagScratchArray Alpha; // internal workspace
 
-   KOKKOS_FUNCTION TriDiagDiffScratch(const TeamMember &Member, int NRow)
-       : G(Member.team_scratch(0), NRow), H(Member.team_scratch(0), NRow),
-         X(Member.team_scratch(0), NRow), Alpha(Member.team_scratch(0), NRow) {}
+   KOKKOS_FUNCTION TriDiagDiffScratch(const TeamMember &Team, int NRow)
+       : G(teamScratch(Team), NRow), H(teamScratch(Team), NRow),
+         X(teamScratch(Team), NRow), Alpha(teamScratch(Team), NRow) {}
 };
 
 // Thomas algorithm solver for diffusion-type tridiagonal systems
@@ -269,7 +269,7 @@ struct ThomasDiffusionSolver {
    // Solve the system defined in the scratch data argument `Scratch`
    // This a team-level function that needs to be called inside a
    // parallel loop using TeamPolicy, hence it has a team member argument
-   static void KOKKOS_FUNCTION solve(const TeamMember &Member,
+   static void KOKKOS_FUNCTION solve(const TeamMember &Team,
                                      const TriDiagDiffScratch &Scratch) {
       const int NRow = Scratch.X.extent_int(0);
 
@@ -325,10 +325,10 @@ struct ThomasDiffusionSolver {
       auto LConfig = makeLaunchConfig(NBatch, NRow);
 
       parallelForOuter(
-          LConfig, KOKKOS_LAMBDA(int IChunk, const TeamMember &Member) {
+          LConfig, KOKKOS_LAMBDA(int IChunk, const TeamMember &Team) {
              const int IStart = IChunk * VecLength;
 
-             TriDiagDiffScratch Scratch(Member, NRow);
+             TriDiagDiffScratch Scratch(Team, NRow);
 
              for (int K = 0; K < NRow; ++K) {
                 for (int IVec = 0; IVec < VecLength; ++IVec) {
@@ -341,7 +341,7 @@ struct ThomasDiffusionSolver {
                 }
              }
 
-             solve(Member, Scratch);
+             solve(Team, Scratch);
 
              for (int IVec = 0; IVec < VecLength; ++IVec) {
                 for (int K = 0; K < NRow; ++K) {
@@ -368,12 +368,12 @@ struct PCRDiffusionSolver {
    // Solve the system defined in the scratch data argument `Scratch`
    // This a team-level function that needs to be called inside a
    // parallel loop using TeamPolicy, hence it has a team member argument
-   static void KOKKOS_FUNCTION solve(const TeamMember &Member,
+   static void KOKKOS_FUNCTION solve(const TeamMember &Team,
                                      const TriDiagDiffScratch &Scratch) {
       const int NRow = Scratch.X.extent_int(0);
 
       // Row index = Thread index
-      const int K = Member.team_rank();
+      const int K = Team.team_rank();
 
       // Number of reduction levels
       const int NLevels = Kokkos::ceil(Kokkos::log2(NRow));
@@ -406,14 +406,14 @@ struct PCRDiffusionSolver {
          const Real NewH = Scratch.H(K, 0) + Alpha * Scratch.H(Kmh, 0) +
                            Beta * Scratch.H(Kph, 0);
 
-         Member.team_barrier();
+         teamBarrier(Team);
 
          // Store new system coefficients
          Scratch.H(K, 0) = NewH;
          Scratch.G(K, 0) = NewG;
          Scratch.X(K, 0) = NewX;
 
-         Member.team_barrier();
+         teamBarrier(Team);
       }
 
       const int Stride = 1 << (NLevels - 1);
@@ -456,20 +456,20 @@ struct PCRDiffusionSolver {
 
       auto LConfig = makeLaunchConfig(NBatch, NRow);
       parallelForOuter(
-          LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Member) {
-             const int K = Member.team_rank();
+          LConfig, KOKKOS_LAMBDA(int I, const TeamMember &Team) {
+             const int K = Team.team_rank();
 
-             TriDiagDiffScratch Scratch(Member, NRow);
+             TriDiagDiffScratch Scratch(Team, NRow);
 
              Scratch.G(K, 0) = G(I, K);
              Scratch.H(K, 0) = H(I, K);
              Scratch.X(K, 0) = X(I, K);
 
-             Member.team_barrier();
+             teamBarrier(Team);
 
-             solve(Member, Scratch);
+             solve(Team, Scratch);
 
-             Member.team_barrier();
+             teamBarrier(Team);
 
              X(I, K) = Scratch.X(K, 0);
           });
diff --git a/components/omega/src/ocn/VertAdv.cpp b/components/omega/src/ocn/VertAdv.cpp
index 081f5a38d6bc..e951cbf02428 100644
--- a/components/omega/src/ocn/VertAdv.cpp
+++ b/components/omega/src/ocn/VertAdv.cpp
@@ -380,7 +380,7 @@ void VertAdv::computeVerticalVelocity(
        "computeVerticalVelocity",
        LaunchConfig({NCellsOwned}, TeamScratch<Real>(NVertLayers)),
        KOKKOS_LAMBDA(int ICell, const TeamMember &Team) {
-          ArrayScratch1DReal DivHU(Team.team_scratch(0), LocNVertLayers);
+          ArrayScratch1DReal DivHU(teamScratch(Team), LocNVertLayers);
 
           const Real InvAreaCell = 1._Real / LocAreaCell(ICell);
 
@@ -411,7 +411,7 @@ void VertAdv::computeVerticalVelocity(
                  }
               });
 
-          Team.team_barrier();
+          teamBarrier(Team);
 
           // Set velocity through top and bottom interfaces to zero
           Kokkos::single(
@@ -521,7 +521,7 @@ void VertAdv::computeVelocityVAdvTend(
 
           // Allocate scratch space for W times Du/Dz at vertical interfaces
           // between edges
-          ArrayScratch1DReal WDuDzEdge(Team.team_scratch(0), LocNVertLayersP1);
+          ArrayScratch1DReal WDuDzEdge(teamScratch(Team), LocNVertLayersP1);
 
           // Flux is zero at top and bottom
           Kokkos::single(
@@ -551,7 +551,7 @@ void VertAdv::computeVelocityVAdvTend(
                  }
               });
 
-          Team.team_barrier();
+          teamBarrier(Team);
 
           KRange = vertRangeChunked(KMin, KMax);
           // Average W*Du/Dz from interfaces to layer midpoints
@@ -831,12 +831,11 @@ void VertAdv::computeFCTVAdvTend(
           const I4 KMax = MaxLayerCell(ICell);
           I4 KRange     = vertRangeChunked(KMin, KMax);
 
-          ArrayScratch1DReal InvNewProvThick(Team.team_scratch(0),
-                                             LocNVertLayers);
-          ArrayScratch1DReal WorkTend(Team.team_scratch(0), LocNVertLayers);
-          ArrayScratch1DReal FlxIn(Team.team_scratch(0), LocNVertLayers);
-          ArrayScratch1DReal FlxOut(Team.team_scratch(0), LocNVertLayers);
-          ArrayScratch1DReal RescaledFlux(Team.team_scratch(0),
+          ArrayScratch1DReal InvNewProvThick(teamScratch(Team), LocNVertLayers);
+          ArrayScratch1DReal WorkTend(teamScratch(Team), LocNVertLayers);
+          ArrayScratch1DReal FlxIn(teamScratch(Team), LocNVertLayers);
+          ArrayScratch1DReal FlxOut(teamScratch(Team), LocNVertLayers);
+          ArrayScratch1DReal RescaledFlux(teamScratch(Team),
                                           LocNVertLayers + 1);
 
           parallelForInner(
@@ -914,7 +913,7 @@ void VertAdv::computeFCTVAdvTend(
                  }
               });
 
-          Team.team_barrier();
+          teamBarrier(Team);
 
           KRange = vertRangeChunked(KMin + 1, KMax);
 
@@ -937,7 +936,7 @@ void VertAdv::computeFCTVAdvTend(
                  }
               });
 
-          Team.team_barrier();
+          teamBarrier(Team);
 
           // Accumulate total FCT vertical advection tendency
           KRange = vertRangeChunked(KMin, KMax);
diff --git a/components/omega/test/base/TriDiagSolversTest.cpp b/components/omega/test/base/TriDiagSolversTest.cpp
index 99236b40fe93..97aac1107e20 100644
--- a/components/omega/test/base/TriDiagSolversTest.cpp
+++ b/components/omega/test/base/TriDiagSolversTest.cpp
@@ -173,11 +173,11 @@ Real runDiffManufactured(int NCells) {
       const Real TimeNext = (Step + 1) * TimeStep;
 
       parallelForOuter(
-          LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) {
-             TriDiagDiffScratch Scratch(Member, NCells);
+          LConfig, KOKKOS_LAMBDA(int, const TeamMember &Team) {
+             TriDiagDiffScratch Scratch(Team, NCells);
 
              // Setup the system to be solved
-             parallelForInner(Member, NCells, [=](int ICell) {
+             parallelForInner(Team, NCells, [=](int ICell) {
                 for (int IVec = 0; IVec < VecLength; ++IVec) {
 
                    // Forcing term from the manufactured solution
@@ -205,12 +205,12 @@ Real runDiffManufactured(int NCells) {
              });
 
              // Solve the system
-             Member.team_barrier();
-             TriDiagDiffSolver::solve(Member, Scratch);
-             Member.team_barrier();
+             teamBarrier(Team);
+             TriDiagDiffSolver::solve(Team, Scratch);
+             teamBarrier(Team);
 
              // Store the solution
-             parallelForInner(Member, NCells, [=](int ICell) {
+             parallelForInner(Team, NCells, [=](int ICell) {
                 U(ICell) = Scratch.X(ICell, 0);
              });
           });
@@ -319,12 +319,12 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) {
          auto LConfig = TriDiagSolver::makeLaunchConfig(1, NCells);
 
          parallelForOuter(
-             LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) {
-                TriDiagScratch Scratch(Member, NCells);
+             LConfig, KOKKOS_LAMBDA(int, const TeamMember &Team) {
+                TriDiagScratch Scratch(Team, NCells);
 
                 // Setup the system to be solved in the form expected by the
                 // general tridiagonal solver
-                parallelForInner(Member, NCells, [=](int ICell) {
+                parallelForInner(Team, NCells, [=](int ICell) {
                    for (int IVec = 0; IVec < VecLength; ++IVec) {
 
                       if (ICell < NCells - 1) {
@@ -354,12 +354,12 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) {
                 });
 
                 // Solve the system
-                Member.team_barrier();
-                TriDiagSolver::solve(Member, Scratch);
-                Member.team_barrier();
+                teamBarrier(Team);
+                TriDiagSolver::solve(Team, Scratch);
+                teamBarrier(Team);
 
                 // Save the solution
-                parallelForInner(Member, NCells, [=](int ICell) {
+                parallelForInner(Team, NCells, [=](int ICell) {
                    U(ICell) = Scratch.X(ICell, 0);
                 });
              });
@@ -367,12 +367,12 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) {
          auto LConfig = TriDiagDiffSolver::makeLaunchConfig(1, NCells);
 
          parallelForOuter(
-             LConfig, KOKKOS_LAMBDA(int, const TeamMember &Member) {
-                TriDiagDiffScratch Scratch(Member, NCells);
+             LConfig, KOKKOS_LAMBDA(int, const TeamMember &Team) {
+                TriDiagDiffScratch Scratch(Team, NCells);
 
                 // Setup the system to be solved in the form expected by the
                 // specialized diffusion tridiagonal solver
-                parallelForInner(Member, NCells, [=](int ICell) {
+                parallelForInner(Team, NCells, [=](int ICell) {
                    for (int IVec = 0; IVec < VecLength; ++IVec) {
 
                       Scratch.H(ICell, IVec) = LayerThick(ICell);
@@ -391,12 +391,12 @@ Real runDiffusionStability(bool UseGeneralSolver, Real DiffValue) {
                 });
 
                 // Solve the system
-                Member.team_barrier();
-                TriDiagDiffSolver::solve(Member, Scratch);
-                Member.team_barrier();
+                teamBarrier(Team);
+                TriDiagDiffSolver::solve(Team, Scratch);
+                teamBarrier(Team);
 
                 // Store the solution
-                parallelForInner(Member, NCells, [=](int ICell) {
+                parallelForInner(Team, NCells, [=](int ICell) {
                    U(ICell) = Scratch.X(ICell, 0);
                 });
              });