From 8120cdb4add0c0d96435934a9470c0f75f78d82c Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 5 Feb 2026 12:27:22 -0700 Subject: [PATCH 1/4] Add parallelSearchInner --- components/omega/src/infra/OmegaKokkos.h | 38 ++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h index 5a3eb0b82499..1c98189eefeb 100644 --- a/components/omega/src/infra/OmegaKokkos.h +++ b/components/omega/src/infra/OmegaKokkos.h @@ -419,6 +419,44 @@ KOKKOS_FUNCTION void parallelScanInner(const TeamMember &Team, int UpperBound, std::forward(Reducers)...); } +// parallelSearchInner +// Given a functor taking an index and returning a bool this function +// returns the first index in the range [0, UpperBound) for which the input +// functor returns true. If no such index is found it returns -1 +template +KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound, + F &&Functor, int &Idx) { + static_assert(std::is_same_v, bool>, + "paralleSearchInner requires a functor that takes an int and " + "returns bool"); + + // There are different implementations for host and device since the + // parallel_reduce version doesn't return early leading to performance loss + // on CPUs +#ifndef OMEGA_TARGET_DEVICE + Idx = -1; + for (int I = 0; I < UpperBound; ++I) { + if (Functor(I)) { + Idx = I; + break; + } + } +#else + const auto Policy = TeamThreadRange(Team, UpperBound); + Kokkos::parallel_reduce( + Policy, + INNER_LAMBDA(int I, int &Accum) { + if (I <= Accum && Functor(I)) { + Accum = I; + } + }, + Kokkos::Min(Idx)); + if (Idx == Kokkos::reduction_identity::min()) { + Idx = -1; + } +#endif +} + } // end namespace OMEGA //===----------------------------------------------------------------------===// From 86aff81bd47faf0e925da8014d36ad8a490145de Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 5 Feb 2026 12:27:43 -0700 Subject: [PATCH 2/4] Add test for parallelSearchInner --- .../omega/test/infra/OmegaKokkosHiParTest.cpp | 105 ++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/components/omega/test/infra/OmegaKokkosHiParTest.cpp b/components/omega/test/infra/OmegaKokkosHiParTest.cpp index c7793bd13cc7..a27e4297b6a9 100644 --- a/components/omega/test/infra/OmegaKokkosHiParTest.cpp +++ b/components/omega/test/infra/OmegaKokkosHiParTest.cpp @@ -262,6 +262,110 @@ Error testHiparReduce1DReduce1D(int N1) { return Err; } +Error testHiparFor1DSearch1D(int N2) { + Error Err; + + const int Threshold = N2 / 2; + const int N1 = 3 * N2 + 3; + + HostArray2DI4 DataH("DataH", N1, N2); + + for (int J1 = 0; J1 < 3 * N2; ++J1) { + if (J1 < N2 + 10) { + for (int J2 = 0; J2 < N2; ++J2) { + DataH(J1, J2) = Threshold - (J1 - J2); + } + } else { + for (int J2 = 0; J2 < N2; ++J2) { + DataH(J1, J2) = Threshold - (J1 / 4 - J2); + } + } + } + + // Ensure these patterns are in the input data + for (int J2 = 0; J2 < N2; ++J2) { + // Everything above threshold + DataH(3 * N2, J2) = Threshold + 1; + // Everything below threshold + DataH(3 * N2 + 1, J2) = Threshold - 1; + // Multiple non-consecutive values above threshold + DataH(3 * N2 + 2, J2) = Threshold - 3 + J2 % 4; + } + + auto DataD = createDeviceMirrorCopy(DataH); + + HostArray1DI4 RefIdxH("RefIdxH", N1); + Array1DI4 IdxD("IdxD", N1); + + // test searching full range + + for (int J1 = 0; J1 < N1; ++J1) { + int Idx = -1; + for (int J2 = 0; J2 < N2; ++J2) { + if (DataH(J1, J2) >= Threshold) { + Idx = J2; + break; + } + } + RefIdxH(J1) = Idx; + } + + parallelForOuter( + {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + parallelSearchInner( + Team, N2, + INNER_LAMBDA(int J2) { return DataD(J1, J2) >= Threshold; }, + IdxD(J1)); + }); + + if (!arraysEqual(IdxD, RefIdxH)) { + Err += Error(ErrorCode::Fail, + errorMsg("parallelFor1DSearch1D Full FAIL", N1)); + } + + deepCopy(RefIdxH, 0); + deepCopy(IdxD, 0); + + // test searching limited range + + if (N2 / 4 > 0) { + + for (int J1 = 0; J1 < N1; ++J1) { + int Idx = -1; + const int Start = N2 / 4 - J1 % (N2 / 4); + const int End = 3 * N2 / 4 + J1 % (N2 / 4); + for (int J2 = Start; J2 < End; ++J2) { + if (DataH(J1, J2) >= Threshold) { + Idx = J2; + break; + } + } + RefIdxH(J1) = Idx; + } + + parallelForOuter( + {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + const int Start = N2 / 4 - J1 % (N2 / 4); + const int End = 3 * N2 / 4 + J1 % (N2 / 4); + int SearchIdx; + parallelSearchInner( + Team, End - Start, + INNER_LAMBDA(int J2) { + return DataD(J1, J2 + Start) >= Threshold; + }, + SearchIdx); + IdxD(J1) = SearchIdx == -1 ? SearchIdx : SearchIdx + Start; + }); + + if (!arraysEqual(IdxD, RefIdxH)) { + Err += Error(ErrorCode::Fail, + errorMsg("parallelFor1DSearch1D Limited FAIL", N1)); + } + } + + return Err; +} + Error testHiparFor1DMultiple1D(int N1, int N2) { Error Err; @@ -687,6 +791,7 @@ int main(int argc, char **argv) { #if !defined(KOKKOS_ENABLE_SYCL) || KOKKOS_VERSION_GREATER_EQUAL(4, 7, 1) Err += testHiparReduce1DReduce1D(N1); #endif + Err += testHiparFor1DSearch1D(N1); Err += testHiparFor1DMultiple1D(1, N1); Err += testHiparFor1DMultiple1D((N1 + 1) / 2, N1); From 51bd06214c875ab0e5b01afbe3a4e36c65c291c4 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Thu, 5 Feb 2026 16:04:28 -0700 Subject: [PATCH 3/4] Add docs for parallelSearchInner --- .../omega/doc/devGuide/ParallelLoops.md | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md index 2ada95cfdb71..9a8251d4228c 100644 --- a/components/omega/doc/devGuide/ParallelLoops.md +++ b/components/omega/doc/devGuide/ParallelLoops.md @@ -115,6 +115,7 @@ The following inner iteration patterns are supported in Omega: - `parallelForInner` - `parallelReduceInner` - `parallelScanInner` +- `parallelSearchInner` To provide even more flexibility, the outer loops support iterating over a multi-dimensional range. Currently, the inner loops are limited to one dimension. @@ -277,3 +278,25 @@ Moreover, this example illustrates that the final scan value can be obtained by an additional argument `FinalScanValue`. Labels are not supported by `parallelScanInner` and only one-dimensional index range can be used. In contrast to `parallelReduceInner`, `parallelScanInner` supports only sum-based scans and only one scan variable. + +### parallelSearchInner +To search an index range in parallel for the first index where a given condition occurs Omega +provides the `parallelSearchInner` function. +For example, the following code finds, for each row of a matrix, the first column index where +the matrix element is above a certain threshold. If no element matches the condition then +`parallelSearchInner` returns `-1`. +```c++ + Array2DReal M("M", N1, N2); + Array1DI3 ThresholdIdx("ThresholdIdx", N1); + parallelForOuter( + {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { + + int Idx; + parallelSearchInner(Team, N2, INNER_LAMBDA(Int J2) { + return M(J1, J2) > Threshold; + }, Idx); + + ThresholdIdx(J1) = Idx; + }); +``` +Labels are not supported by `parallelSearchInner` and only one-dimensional index range can be used. From 3045ab0e920fd227b89de811f09974a34865dba1 Mon Sep 17 00:00:00 2001 From: Maciej Waruszewski Date: Wed, 25 Feb 2026 16:40:53 -0700 Subject: [PATCH 4/4] Incorporate copilot suggestions Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../omega/doc/devGuide/ParallelLoops.md | 25 ++++++++++--------- components/omega/src/infra/OmegaKokkos.h | 2 +- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/components/omega/doc/devGuide/ParallelLoops.md b/components/omega/doc/devGuide/ParallelLoops.md index 9a8251d4228c..7d4a5ae190ab 100644 --- a/components/omega/doc/devGuide/ParallelLoops.md +++ b/components/omega/doc/devGuide/ParallelLoops.md @@ -192,7 +192,7 @@ a 3D array in parallel using hierarchical parallelism. Array3DReal A("A", N1, N2, N3); parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { - parallelForInner(Team, N3, INNER_LAMBDA(Int J3) { + parallelForInner(Team, N3, INNER_LAMBDA(int J3) { A(J1, J2, J3) = J1 + J2 + J3; }); }); @@ -204,7 +204,7 @@ diagonal of a square matrix one can do: Array2DReal M("M", N, N); parallelForOuter( {N}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { - parallelForInner(Team, J1, INNER_LAMBDA(Int J2) { + parallelForInner(Team, J1, INNER_LAMBDA(int J2) { M(J1, J2) = J1 + J2; }); }); @@ -220,7 +220,7 @@ in a 2D array might be done as follows. parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { Real SumD3; - parallelReduceInner(Team, N3, INNER_LAMBDA(Int J3, Real &Accum) { + parallelReduceInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum) { Accum += A(J1, J2, J3); }, SumD3); B(J1, J2) = SumD3; @@ -234,10 +234,10 @@ For example, to additionally compute and store maxima along the third dimension parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { Real SumD3, MaxD3; - parallelReduceInner(Team, N3, INNER_LAMBDA(Int J3, Real &AccumSum, Real &AccumMax) { + parallelReduceInner(Team, N3, INNER_LAMBDA(int J3, Real &AccumSum, Real &AccumMax) { AccumSum += A(J1, J2, J3); - AccumMax = Kokkos::Max(AccumMax, A(J1, J2, J3)); - }, SumN3, MaxN3); + AccumMax = Kokkos::max(AccumMax, A(J1, J2, J3)); + }, SumD3, Kokkos::Max(MaxD3)); B(J1, J2) = SumD3; C(J1, J2) = MaxD3; }); @@ -254,7 +254,7 @@ be done as follows. Array3DReal D("D", N1, N2, N3); parallelForOuter( {N1, N2}, KOKKOS_LAMBDA(int J1, int J2, const TeamMember &Team) { - parallelScanInner(Team, N1, INNER_LAMBDA(Int J3, Real &Accum, bool IsFinal) { + parallelScanInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum, bool IsFinal) { Accum += A(J1, J2, J3); if (IsFinal) { D(J1, J2, J3) = Accum; @@ -267,7 +267,7 @@ before the `if` statement. That is, it performs an inclusive scan. To compute an simply move the addition after the `if` statement. ```c++ Real FinalScanValue; - parallelScanInner(Team, N1, INNER_LAMBDA(Int J3, Real &Accum, bool IsFinal) { + parallelScanInner(Team, N3, INNER_LAMBDA(int J3, Real &Accum, bool IsFinal) { if (IsFinal) { D(J1, J2, J3) = Accum; } @@ -280,19 +280,20 @@ and only one-dimensional index range can be used. In contrast to `parallelReduce `parallelScanInner` supports only sum-based scans and only one scan variable. ### parallelSearchInner -To search an index range in parallel for the first index where a given condition occurs Omega -provides the `parallelSearchInner` function. +To search an index range in parallel for the first index at which a given condition occurs, +Omega provides the `parallelSearchInner` function. For example, the following code finds, for each row of a matrix, the first column index where the matrix element is above a certain threshold. If no element matches the condition then `parallelSearchInner` returns `-1`. ```c++ Array2DReal M("M", N1, N2); - Array1DI3 ThresholdIdx("ThresholdIdx", N1); + Array1DI4 ThresholdIdx("ThresholdIdx", N1); + const Real Threshold = 0.5; parallelForOuter( {N1}, KOKKOS_LAMBDA(int J1, const TeamMember &Team) { int Idx; - parallelSearchInner(Team, N2, INNER_LAMBDA(Int J2) { + parallelSearchInner(Team, N2, INNER_LAMBDA(int J2) { return M(J1, J2) > Threshold; }, Idx); diff --git a/components/omega/src/infra/OmegaKokkos.h b/components/omega/src/infra/OmegaKokkos.h index 1c98189eefeb..10a186fc8bed 100644 --- a/components/omega/src/infra/OmegaKokkos.h +++ b/components/omega/src/infra/OmegaKokkos.h @@ -427,7 +427,7 @@ template KOKKOS_FUNCTION void parallelSearchInner(const TeamMember &Team, int UpperBound, F &&Functor, int &Idx) { static_assert(std::is_same_v, bool>, - "paralleSearchInner requires a functor that takes an int and " + "parallelSearchInner requires a functor that takes an int and " "returns bool"); // There are different implementations for host and device since the