From 85235826767dad956428dc9bdba339ffe44bc6f7 Mon Sep 17 00:00:00 2001 From: Stan Moore Date: Mon, 30 Mar 2026 09:58:54 -0600 Subject: [PATCH 1/2] Update Kokkos library in LAMMPS to v5.1.0 --- lib/kokkos/CHANGELOG.md | 65 + lib/kokkos/CMakeLists.txt | 8 +- lib/kokkos/COPYRIGHT.md | 1 - lib/kokkos/LICENSE | 2 +- lib/kokkos/README.md | 2 +- lib/kokkos/algorithms/CMakeLists.txt | 6 +- .../algorithms/perf_test/CMakeLists.txt | 62 - lib/kokkos/algorithms/src/Kokkos_Random.hpp | 41 +- .../src/sorting/Kokkos_BinOpsPublicAPI.hpp | 8 - .../src/sorting/Kokkos_BinSortPublicAPI.hpp | 4 - .../src/sorting/impl/Kokkos_SortByKeyImpl.hpp | 26 +- .../src/sorting/impl/Kokkos_SortImpl.hpp | 26 +- .../std_algorithms/Kokkos_ExclusiveScan.hpp | 40 +- .../src/std_algorithms/Kokkos_IsSorted.hpp | 6 - .../std_algorithms/Kokkos_IsSortedUntil.hpp | 7 - .../src/std_algorithms/Kokkos_IterSwap.hpp | 10 - .../src/std_algorithms/Kokkos_MaxElement.hpp | 8 - .../src/std_algorithms/Kokkos_MinElement.hpp | 8 - .../std_algorithms/Kokkos_MinMaxElement.hpp | 9 - .../Kokkos_TransformExclusiveScan.hpp | 6 - .../Kokkos_TransformInclusiveScan.hpp | 15 - .../impl/Kokkos_Constraints.hpp | 22 - .../src/std_algorithms/impl/Kokkos_CopyIf.hpp | 5 - .../impl/Kokkos_ExclusiveScan.hpp | 68 +- .../impl/Kokkos_InclusiveScan.hpp | 38 +- .../src/std_algorithms/impl/Kokkos_Reduce.hpp | 4 - .../impl/Kokkos_TransformReduce.hpp | 6 - .../std_algorithms/impl/Kokkos_UniqueCopy.hpp | 5 - .../algorithms/unit_tests/CMakeLists.txt | 64 +- .../algorithms/unit_tests/TestBinSortA.hpp | 4 - .../algorithms/unit_tests/TestBinSortB.hpp | 4 - .../algorithms/unit_tests/TestNestedSort.hpp | 10 - .../algorithms/unit_tests/TestRandom.hpp | 26 - lib/kokkos/algorithms/unit_tests/TestSort.hpp | 11 - .../unit_tests/TestSortCustomComp.hpp | 2 +- .../unit_tests/TestStdAlgorithmsCommon.hpp | 13 - .../TestStdAlgorithmsCompileOnly.cpp | 12 - .../TestStdAlgorithmsExclusiveScan.cpp | 2 - .../TestStdAlgorithmsInclusiveScan.cpp | 2 - 
.../unit_tests/TestStdAlgorithmsIsSorted.cpp | 2 - .../TestStdAlgorithmsIsSortedUntil.cpp | 2 - ...estStdAlgorithmsLexicographicalCompare.cpp | 3 - .../TestStdAlgorithmsMinMaxElementOps.cpp | 9 - .../unit_tests/TestStdAlgorithmsModOps.cpp | 7 + .../unit_tests/TestStdAlgorithmsNumerics.cpp | 4 - .../TestStdAlgorithmsTeamExclusiveScan.cpp | 9 - .../TestStdAlgorithmsTeamInclusiveScan.cpp | 4 - .../TestStdAlgorithmsTeamIsSorted.cpp | 9 +- .../TestStdAlgorithmsTeamIsSortedUntil.cpp | 17 +- .../TestStdAlgorithmsTeamMaxElement.cpp | 9 +- .../TestStdAlgorithmsTeamMinElement.cpp | 9 +- .../TestStdAlgorithmsTeamMinMaxElement.cpp | 9 +- .../TestStdAlgorithmsTeamReduce.cpp | 4 - ...tdAlgorithmsTeamTransformExclusiveScan.cpp | 4 - ...tdAlgorithmsTeamTransformInclusiveScan.cpp | 4 - .../TestStdAlgorithmsTeamTransformReduce.cpp | 4 - ...estStdAlgorithmsTransformExclusiveScan.cpp | 2 - ...estStdAlgorithmsTransformInclusiveScan.cpp | 2 - lib/kokkos/benchmarks/.clang-tidy | 4 + lib/kokkos/benchmarks/CMakeLists.txt | 8 +- .../benchmark_suite/scripts/build_code.bash | 2 - .../scripts/checkout_repos.bash | 3 - .../scripts/run_benchmark.bash | 2 +- .../policy_performance/script_sample_usage.sh | 6 +- lib/kokkos/bin/nvcc_wrapper | 42 +- lib/kokkos/cmake/KokkosConfigCommon.cmake.in | 15 + lib/kokkos/cmake/KokkosCore_config.h.in | 7 +- lib/kokkos/cmake/Modules/FindTPLROCM.cmake | 18 +- .../compile_tests/cuda_compute_capability.cc | 2 + lib/kokkos/cmake/kokkos_arch.cmake | 195 +- lib/kokkos/cmake/kokkos_compiler_id.cmake | 6 - lib/kokkos/cmake/kokkos_enable_devices.cmake | 17 - lib/kokkos/cmake/kokkos_enable_options.cmake | 36 +- lib/kokkos/cmake/kokkos_functions.cmake | 4 +- lib/kokkos/cmake/kokkos_pick_cxx_std.cmake | 16 - lib/kokkos/cmake/kokkos_tpls.cmake | 20 +- lib/kokkos/cmake/kokkos_tribits.cmake | 39 +- .../performance_tests/CMakeLists.txt | 2 - .../containers/performance_tests/TestCuda.cpp | 8 +- .../containers/performance_tests/TestHPX.cpp | 10 +- 
.../performance_tests/TestOpenMP.cpp | 10 +- lib/kokkos/containers/src/Kokkos_Bitset.hpp | 27 - lib/kokkos/containers/src/Kokkos_DualView.hpp | 25 +- .../containers/src/Kokkos_DynRankView.hpp | 27 - .../containers/src/Kokkos_DynamicView.hpp | 35 +- .../containers/src/Kokkos_OffsetView.hpp | 8 - .../containers/src/Kokkos_ScatterView.hpp | 163 +- .../containers/src/Kokkos_StaticCrsGraph.hpp | 469 ----- lib/kokkos/containers/src/Kokkos_Vector.hpp | 327 --- .../impl/Kokkos_StaticCrsGraph_factory.hpp | 176 -- .../containers/unit_tests/CMakeLists.txt | 6 - .../unit_tests/TestCreateMirror.cpp | 2 +- .../unit_tests/TestDynRankViewTypedefs.cpp | 2 +- .../containers/unit_tests/TestDynViewAPI.hpp | 8 +- .../containers/unit_tests/TestOffsetView.hpp | 10 - .../containers/unit_tests/TestScatterView.hpp | 11 +- .../unit_tests/TestStaticCrsGraph.hpp | 263 --- .../containers/unit_tests/TestVector.hpp | 245 --- .../unit_tests/TestWithoutInitializing.hpp | 30 - lib/kokkos/core/CMakeLists.txt | 10 +- lib/kokkos/core/perf_test/CMakeLists.txt | 72 +- lib/kokkos/core/perf_test/PerfTestDriver.hpp | 388 ---- lib/kokkos/core/perf_test/PerfTestMDRange.hpp | 585 ------ .../perf_test/PerfTestMDRange_Stencil.cpp | 498 +++++ .../core/perf_test/PerfTestMDRange_Stream.cpp | 97 + .../core/perf_test/PerfTestMDRange_Stream.hpp | 242 +++ .../perf_test/PerfTest_CustomReduction.cpp | 87 +- lib/kokkos/core/perf_test/PerfTest_Stream.cpp | 10 +- lib/kokkos/core/src/CMakeLists.txt | 16 - lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp | 90 +- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp | 42 +- lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp | 20 - .../core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp | 40 +- ...Kokkos_Cuda_Half_MathematicalFunctions.hpp | 16 +- .../core/src/Cuda/Kokkos_Cuda_Instance.cpp | 194 +- .../core/src/Cuda/Kokkos_Cuda_Instance.hpp | 96 +- .../src/Cuda/Kokkos_Cuda_KernelLaunch.hpp | 5 +- .../src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp | 57 +- .../src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp | 128 +- 
.../src/Cuda/Kokkos_Cuda_Parallel_Range.hpp | 4 +- .../src/Cuda/Kokkos_Cuda_Parallel_Team.hpp | 131 +- lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp | 2 +- .../core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp | 12 - lib/kokkos/core/src/HIP/Kokkos_HIP.cpp | 71 +- lib/kokkos/core/src/HIP/Kokkos_HIP.hpp | 41 +- .../core/src/HIP/Kokkos_HIP_Graph_Impl.hpp | 73 +- .../src/HIP/Kokkos_HIP_Half_Conversion.hpp | 115 ++ .../src/HIP/Kokkos_HIP_Half_Impl_Type.hpp | 15 +- .../Kokkos_HIP_Half_MathematicalFunctions.hpp | 156 ++ .../core/src/HIP/Kokkos_HIP_Instance.cpp | 53 +- .../core/src/HIP/Kokkos_HIP_Instance.hpp | 35 +- .../core/src/HIP/Kokkos_HIP_IsXnack.hpp | 5 +- .../core/src/HIP/Kokkos_HIP_KernelLaunch.hpp | 4 +- .../core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp | 51 +- .../HIP/Kokkos_HIP_ParallelFor_MDRange.hpp | 145 +- .../src/HIP/Kokkos_HIP_ParallelFor_Range.hpp | 37 +- .../src/HIP/Kokkos_HIP_ParallelFor_Team.hpp | 48 +- .../HIP/Kokkos_HIP_ParallelReduce_Team.hpp | 55 +- .../src/HIP/Kokkos_HIP_ParallelScan_Range.hpp | 12 +- lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp | 15 - lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp | 2 +- .../src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp | 40 +- .../core/src/HIP/Kokkos_HIP_UniqueToken.hpp | 12 - lib/kokkos/core/src/HPX/Kokkos_HPX.cpp | 47 +- lib/kokkos/core/src/HPX/Kokkos_HPX.hpp | 268 ++- .../core/src/KokkosExp_MDRangePolicy.hpp | 264 ++- lib/kokkos/core/src/Kokkos_Abort.hpp | 11 +- lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp | 7 +- lib/kokkos/core/src/Kokkos_Array.hpp | 195 +- lib/kokkos/core/src/Kokkos_Assert.hpp | 2 - .../core/src/Kokkos_Atomics_Desul_Wrapper.hpp | 30 +- lib/kokkos/core/src/Kokkos_Complex.hpp | 194 +- lib/kokkos/core/src/Kokkos_Concepts.hpp | 45 +- lib/kokkos/core/src/Kokkos_CopyViews.hpp | 42 +- lib/kokkos/core/src/Kokkos_Core.cppm | 60 +- lib/kokkos/core/src/Kokkos_Core_Impl.cppm | 2 +- lib/kokkos/core/src/Kokkos_Core_fwd.hpp | 11 +- lib/kokkos/core/src/Kokkos_Crs.hpp | 7 +- lib/kokkos/core/src/Kokkos_ExecPolicy.hpp | 151 
+- lib/kokkos/core/src/Kokkos_Graph.hpp | 32 +- lib/kokkos/core/src/Kokkos_GraphNode.hpp | 226 +- lib/kokkos/core/src/Kokkos_Graph_fwd.hpp | 4 +- lib/kokkos/core/src/Kokkos_HostSpace.hpp | 29 +- lib/kokkos/core/src/Kokkos_Layout.hpp | 26 - lib/kokkos/core/src/Kokkos_Macros.hpp | 136 +- .../core/src/Kokkos_MathematicalFunctions.hpp | 447 +++- lib/kokkos/core/src/Kokkos_MemoryPool.hpp | 5 - lib/kokkos/core/src/Kokkos_Pair.hpp | 114 - lib/kokkos/core/src/Kokkos_Parallel.hpp | 40 +- .../core/src/Kokkos_Parallel_Reduce.hpp | 134 +- .../core/src/Kokkos_ReductionIdentity.hpp | 2 +- lib/kokkos/core/src/Kokkos_ScratchSpace.hpp | 9 - lib/kokkos/core/src/Kokkos_Timer.hpp | 1 + lib/kokkos/core/src/Kokkos_Tuners.hpp | 12 +- lib/kokkos/core/src/Kokkos_UniqueToken.hpp | 5 + lib/kokkos/core/src/Kokkos_View.hpp | 27 +- .../core/src/OpenACC/Kokkos_OpenACC.cpp | 33 +- .../core/src/OpenACC/Kokkos_OpenACC.hpp | 17 +- .../src/OpenACC/Kokkos_OpenACC_Instance.cpp | 36 +- .../src/OpenACC/Kokkos_OpenACC_Instance.hpp | 22 +- .../Kokkos_OpenACC_ParallelFor_MDRange.hpp | 4 + lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp | 141 +- lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp | 36 +- .../src/OpenMP/Kokkos_OpenMP_Instance.cpp | 184 +- .../src/OpenMP/Kokkos_OpenMP_Instance.hpp | 32 +- .../src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp | 23 + .../OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp | 24 + .../core/src/OpenMP/Kokkos_OpenMP_Team.hpp | 9 +- .../src/OpenMPTarget/Kokkos_OpenMPTarget.hpp | 138 -- .../OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp | 110 - .../OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp | 133 -- .../Kokkos_OpenMPTarget_Abort.hpp | 22 - .../Kokkos_OpenMPTarget_DeepCopy.hpp | 88 - .../Kokkos_OpenMPTarget_Error.hpp | 32 - .../Kokkos_OpenMPTarget_FunctorAdapter.hpp | 35 - .../Kokkos_OpenMPTarget_Instance.cpp | 269 --- .../Kokkos_OpenMPTarget_Instance.hpp | 61 - .../Kokkos_OpenMPTarget_MDRangePolicy.hpp | 25 - .../Kokkos_OpenMPTarget_Macros.hpp | 27 - .../Kokkos_OpenMPTarget_Parallel.hpp | 741 ------- 
...okkos_OpenMPTarget_ParallelFor_MDRange.hpp | 325 --- .../Kokkos_OpenMPTarget_ParallelFor_Range.hpp | 52 - .../Kokkos_OpenMPTarget_ParallelFor_Team.hpp | 158 -- ...os_OpenMPTarget_ParallelReduce_MDRange.hpp | 606 ------ ...kkos_OpenMPTarget_ParallelReduce_Range.hpp | 106 - ...okkos_OpenMPTarget_ParallelReduce_Team.hpp | 407 ---- ...Kokkos_OpenMPTarget_ParallelScan_Range.hpp | 243 --- .../Kokkos_OpenMPTarget_ParallelScan_Team.hpp | 125 -- .../Kokkos_OpenMPTarget_Parallel_Common.hpp | 609 ------ .../Kokkos_OpenMPTarget_Reducer.hpp | 554 ----- .../Kokkos_OpenMPTarget_UniqueToken.hpp | 94 - lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp | 54 +- lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp | 31 +- .../core/src/SYCL/Kokkos_SYCL_Abort.hpp | 5 - .../core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp | 37 +- .../src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp | 5 - ...Kokkos_SYCL_Half_MathematicalFunctions.hpp | 18 + .../core/src/SYCL/Kokkos_SYCL_Instance.cpp | 145 +- .../core/src/SYCL/Kokkos_SYCL_Instance.hpp | 73 +- .../src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp | 56 +- .../SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp | 164 +- .../src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp | 41 +- .../Kokkos_SYCL_ParallelReduce_MDRange.hpp | 10 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp | 12 +- .../SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp | 50 +- .../SYCL/Kokkos_SYCL_ParallelScan_Range.hpp | 156 +- .../core/src/SYCL/Kokkos_SYCL_Space.cpp | 3 +- lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp | 4 +- .../core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp | 33 +- .../core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp | 12 - .../SYCL/Kokkos_SYCL_WorkgroupReduction.hpp | 4 +- lib/kokkos/core/src/Serial/Kokkos_Serial.cpp | 65 +- lib/kokkos/core/src/Serial/Kokkos_Serial.hpp | 66 +- .../Serial/Kokkos_Serial_Parallel_Team.hpp | 58 +- .../core/src/Threads/Kokkos_Threads.hpp | 25 +- .../src/Threads/Kokkos_Threads_Instance.cpp | 25 +- .../src/Threads/Kokkos_Threads_Instance.hpp | 14 +- .../Kokkos_Threads_ParallelFor_Team.hpp | 25 +- 
.../Kokkos_Threads_ParallelReduce_Team.hpp | 23 + .../core/src/Threads/Kokkos_Threads_Team.hpp | 61 +- lib/kokkos/core/src/View/Kokkos_BasicView.hpp | 3 +- lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp | 13 +- lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp | 5 + .../core/src/View/Kokkos_ViewDataAnalysis.hpp | 12 +- .../core/src/View/Kokkos_ViewLegacy.hpp | 27 +- .../core/src/View/Kokkos_ViewMapping.hpp | 146 +- .../core/src/View/Kokkos_ViewTracker.hpp | 11 + .../src/View/MDSpan/Kokkos_MDSpan_Header.hpp | 20 - .../core/src/decl/Kokkos_Declare_HIP.hpp | 1 + .../src/decl/Kokkos_Declare_OPENMPTARGET.hpp | 21 - .../core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp | 15 - .../src/impl/KokkosExp_Host_IterateTile.hpp | 1212 +---------- .../src/impl/KokkosExp_IterateTileGPU.hpp | 1333 +++--------- .../core/src/impl/Kokkos_CPUDiscovery.cpp | 2 + .../impl/Kokkos_CStyleMemoryManagement.hpp | 5 - .../core/src/{ => impl}/Kokkos_CheckUsage.hpp | 41 +- lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp | 7 - .../core/src/impl/Kokkos_Combined_Reducer.hpp | 59 +- lib/kokkos/core/src/impl/Kokkos_Core.cpp | 10 +- .../src/impl/Kokkos_Default_Graph_Impl.hpp | 21 +- .../core/src/impl/Kokkos_DeviceHandle.hpp | 48 + lib/kokkos/core/src/impl/Kokkos_EBO.hpp | 32 - .../core/src/impl/Kokkos_ExecPolicy.cpp | 6 +- .../core/src/impl/Kokkos_FunctorAnalysis.hpp | 1 + .../src/impl/Kokkos_GraphNodeCtorProps.hpp | 147 ++ .../core/src/impl/Kokkos_GraphNodeImpl.hpp | 59 +- .../impl/Kokkos_Half_FloatingPointWrapper.hpp | 33 +- .../Kokkos_Half_MathematicalFunctions.hpp | 179 +- .../src/impl/Kokkos_Half_NumericTraits.hpp | 30 +- .../core/src/impl/Kokkos_HostBarrier.hpp | 1 + lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp | 10 +- .../core/src/impl/Kokkos_HostThreadTeam.hpp | 7 - .../impl/Kokkos_InitializationSettings.hpp | 26 - .../impl/Kokkos_NvidiaGpuArchitectures.hpp | 6 +- .../src/impl/Kokkos_Profiling_C_Interface.h | 6 +- .../src/impl/Kokkos_Profiling_Interface.hpp | 4 +- .../src/impl/Kokkos_QuadPrecisionMath.hpp | 24 
+- .../core/src/impl/Kokkos_SharedAlloc.hpp | 26 +- .../src/impl/Kokkos_StringManipulation.hpp | 2 + .../core/src/impl/Kokkos_TeamMDPolicy.hpp | 6 +- .../core/src/impl/Kokkos_Tools_Generic.hpp | 52 +- lib/kokkos/core/src/impl/Kokkos_Utilities.hpp | 11 +- lib/kokkos/core/src/impl/Kokkos_hwloc.cpp | 2 + .../core/src/setup/Kokkos_Setup_SYCL.hpp | 22 - lib/kokkos/core/unit_test/CMakeLists.txt | 215 +- lib/kokkos/core/unit_test/TestAbort.hpp | 10 +- lib/kokkos/core/unit_test/TestArray.cpp | 23 + lib/kokkos/core/unit_test/TestArrayOps.hpp | 288 --- .../core/unit_test/TestAtomicOperations.hpp | 80 +- .../TestAtomicOperations_complexfloat.hpp | 4 - .../unit_test/TestAtomicOperations_double.hpp | 4 - .../unit_test/TestAtomicOperations_float.hpp | 4 - .../unit_test/TestAtomicOperations_int.hpp | 4 - .../unit_test/TestAtomicOperations_int16.hpp | 4 - .../unit_test/TestAtomicOperations_int8.hpp | 4 - .../TestAtomicOperations_longint.hpp | 4 - .../TestAtomicOperations_longlongint.hpp | 4 - .../unit_test/TestAtomicOperations_shared.hpp | 4 - .../TestAtomicOperations_unsignedint.hpp | 4 - .../TestAtomicOperations_unsignedlongint.hpp | 4 - ...stAtomicOperations_unsignedlonglongint.hpp | 4 - lib/kokkos/core/unit_test/TestAtomicViews.hpp | 3 - lib/kokkos/core/unit_test/TestAtomics.hpp | 137 +- .../core/unit_test/TestBitManipulation.cpp | 2 + .../core/unit_test/TestBlockSizeDeduction.hpp | 6 +- .../core/unit_test/TestCompilerMacros.cpp | 4 - lib/kokkos/core/unit_test/TestComplex.hpp | 108 +- lib/kokkos/core/unit_test/TestConcepts.hpp | 43 +- .../core/unit_test/TestCreateMirror.cpp | 2 +- .../TestCustomScalarParallelScan.hpp | 95 + .../core/unit_test/TestDetectionIdiom.cpp | 1 + ...onEnvironmentNonInitializedOrFinalized.cpp | 85 +- .../core/unit_test/TestExecutionSpace.hpp | 80 +- lib/kokkos/core/unit_test/TestGraph.hpp | 417 ++-- .../core/unit_test/TestGraphAtomicLocks.hpp | 22 +- .../core/unit_test/TestGraphNodeCtorProps.hpp | 185 ++ .../TestHostSharedPtrAccessOnDevice.hpp | 8 +- 
.../core/unit_test/TestIrregularLayout.hpp | 7 +- .../core/unit_test/TestLocalDeepCopy.hpp | 1197 +++++------ lib/kokkos/core/unit_test/TestMDRange.hpp | 9 - .../TestMDRangePolicyConstructors.hpp | 186 +- .../core/unit_test/TestMDRangeReduce.hpp | 5 - lib/kokkos/core/unit_test/TestMDRange_a.hpp | 3 - lib/kokkos/core/unit_test/TestMDRange_b.hpp | 3 - lib/kokkos/core/unit_test/TestMDRange_c.hpp | 5 - lib/kokkos/core/unit_test/TestMDRange_d.hpp | 5 - lib/kokkos/core/unit_test/TestMDRange_e.hpp | 3 - lib/kokkos/core/unit_test/TestMDRange_f.hpp | 3 - .../unit_test/TestMathematicalConstants.hpp | 8 +- .../unit_test/TestMathematicalFunctions.hpp | 1495 +++++++++++++- .../TestMathematicalSpecialFunctions.hpp | 4 - .../unit_test/TestNonTrivialScalarTypes.hpp | 36 - .../core/unit_test/TestNumericTraits.hpp | 28 +- .../unit_test/TestParallelScanRangePolicy.hpp | 54 +- .../core/unit_test/TestQuadPrecisionMath.hpp | 15 +- lib/kokkos/core/unit_test/TestRange.hpp | 12 +- .../unit_test/TestRangePolicyConstructors.hpp | 22 +- .../core/unit_test/TestRangePolicyRequire.hpp | 2 - lib/kokkos/core/unit_test/TestReduce.hpp | 170 +- .../unit_test/TestReduceCombinatorical.hpp | 23 +- lib/kokkos/core/unit_test/TestReducers.hpp | 86 +- lib/kokkos/core/unit_test/TestReducers_a.hpp | 8 +- lib/kokkos/core/unit_test/TestReducers_b.hpp | 4 - lib/kokkos/core/unit_test/TestReducers_c.hpp | 6 - lib/kokkos/core/unit_test/TestReducers_d.hpp | 11 +- lib/kokkos/core/unit_test/TestSharedAlloc.hpp | 3 - .../TestSpaceAwareAccessorAccessViolation.hpp | 6 - lib/kokkos/core/unit_test/TestSubView_c15.hpp | 2 +- lib/kokkos/core/unit_test/TestSubView_c16.hpp | 55 + lib/kokkos/core/unit_test/TestTeam.hpp | 172 +- lib/kokkos/core/unit_test/TestTeamBasic.hpp | 159 +- .../unit_test/TestTeamCombinedReducers.hpp | 23 +- lib/kokkos/core/unit_test/TestTeamMDRange.hpp | 418 +--- .../unit_test/TestTeamPolicyConstructors.hpp | 85 +- .../core/unit_test/TestTeamReductionScan.hpp | 31 - 
lib/kokkos/core/unit_test/TestTeamScan.hpp | 16 +- lib/kokkos/core/unit_test/TestTeamScratch.hpp | 5 - .../core/unit_test/TestTeamTeamSize.hpp | 26 +- lib/kokkos/core/unit_test/TestTeamVector.hpp | 25 +- .../core/unit_test/TestTeamVectorRange.hpp | 51 +- lib/kokkos/core/unit_test/TestTypeList.cpp | 38 + lib/kokkos/core/unit_test/TestUniqueToken.hpp | 9 +- lib/kokkos/core/unit_test/TestViewAPI.hpp | 22 +- lib/kokkos/core/unit_test/TestViewAPI_e.hpp | 2 - lib/kokkos/core/unit_test/TestViewCopy_c.hpp | 5 - .../core/unit_test/TestViewCtorDimMatch.hpp | 4 - .../TestViewLayoutStrideAssignment.hpp | 8 - .../core/unit_test/TestViewMapping_b.hpp | 19 +- .../TestViewMemoryAccessViolation.hpp | 6 - lib/kokkos/core/unit_test/TestViewMove.hpp | 24 + lib/kokkos/core/unit_test/TestViewOfClass.hpp | 5 + lib/kokkos/core/unit_test/TestViewOfViews.hpp | 3 + .../unit_test/TestViewOutOfBoundsAccess.hpp | 8 +- .../core/unit_test/TestViewTypedefs.cpp | 2 +- .../unit_test/TestWithoutInitializing.hpp | 37 +- .../unit_test/UnitTest_DeviceAndThreads.cpp | 4 - .../TestOpenMPTarget_Category.hpp | 15 - .../unit_test/cuda/TestCuda_InterOp_Graph.cpp | 10 +- .../cuda/TestCuda_InterOp_GraphMultiGPU.cpp | 8 +- .../cuda/TestCuda_InterOp_StreamsMultiGPU.cpp | 2 +- .../cuda/TestCuda_ReducerViewSizeLimit.cpp | 2 +- .../core/unit_test/cuda/TestCuda_Spaces.cpp | 6 - .../headers_self_contained/CMakeLists.txt | 5 - .../unit_test/hip/TestHIP_InterOp_Graph.cpp | 6 +- .../hip/TestHIP_InterOp_StreamsMultiGPU.cpp | 2 +- .../hip/TestHIP_SharedResourceLock.cpp | 1 + .../core/unit_test/hpx/TestHPX_InParallel.cpp | 183 -- ...estHPX_IndependentInstancesRefCounting.cpp | 2 + .../core/unit_test/incremental/README.md | 2 +- .../incremental/Test01_execspace.hpp | 11 - .../incremental/Test10_HierarchicalBasics.hpp | 13 +- .../incremental/Test12a_ThreadScratch.hpp | 9 - .../incremental/Test12b_TeamScratch.hpp | 9 - .../incremental/Test14_MDRangeReduce.hpp | 14 - .../unit_test/sycl/TestSYCL_InterOp_Graph.cpp | 6 +- 
.../unit_test/sycl/TestSYCL_InterOp_Init.cpp | 25 +- .../sycl/TestSYCL_InterOp_Streams.cpp | 88 +- .../core/unit_test/tools/TestCInterface.c | 5 + .../unit_test/tools/TestEventCorrectness.hpp | 47 - .../tools/TestWithoutInitializing.cpp | 13 - .../core/unit_test/view/TestBasicView.hpp | 48 +- .../view/TestBasicViewMDSpanConversion.cpp | 19 +- .../view/TestConversionFromPointer.cpp | 6 + .../unit_test/view/TestMemoryTraitTypes.cpp | 34 + .../unit_test/view/TestViewCtorDataHandle.hpp | 58 + .../view/TestViewEqualityOperator.hpp | 118 ++ lib/kokkos/docs/CODE_OF_CONDUCT.md | 3 + lib/kokkos/docs/CONTRIBUTING.md | 26 +- .../build_cmake_installed/CMakeLists.txt | 6 - .../example/build_cmake_installed/foo.f | 4 +- .../CMakeLists.txt | 4 +- .../05_NVIDIA_UVM/CMakeLists.txt | 7 - .../05_NVIDIA_UVM/uvm_example.cpp | 108 - .../tutorial/Advanced_Views/CMakeLists.txt | 4 - .../04_team_scan/team_scan.cpp | 11 +- .../example/virtual_functions/classes.hpp | 5 +- lib/kokkos/example/virtual_functions/main.cpp | 2 + lib/kokkos/simd/CMakeLists.txt | 1 + lib/kokkos/simd/perf_tests/.clang-tidy | 4 + lib/kokkos/simd/perf_tests/BenchmarkMain.cpp | 34 + lib/kokkos/simd/perf_tests/CMakeLists.txt | 7 + lib/kokkos/simd/perf_tests/include/Common.hpp | 135 ++ .../perf_tests/include/PerfTest_Device.hpp | 280 +++ .../simd/perf_tests/include/PerfTest_Host.hpp | 285 +++ .../perf_tests/include/PerfTest_Operators.hpp | 338 +++ lib/kokkos/simd/src/CMakeLists.txt | 10 + lib/kokkos/simd/src/Kokkos_SIMD.cppm | 7 + lib/kokkos/simd/src/Kokkos_SIMD.hpp | 164 +- lib/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp | 1143 ++++++++--- lib/kokkos/simd/src/Kokkos_SIMD_AVX512.hpp | 1339 +++++++++--- lib/kokkos/simd/src/Kokkos_SIMD_Common.hpp | 66 +- .../simd/src/Kokkos_SIMD_Common_Math.hpp | 126 +- lib/kokkos/simd/src/Kokkos_SIMD_Impl.cppm | 1 + lib/kokkos/simd/src/Kokkos_SIMD_NEON.hpp | 1826 ++++++++++++++--- lib/kokkos/simd/src/Kokkos_SIMD_SVE.hpp | 936 ++++++--- lib/kokkos/simd/src/Kokkos_SIMD_Scalar.hpp | 145 +- 
.../simd/src/impl/Kokkos_SIMD_Impl_Macros.hpp | 95 + lib/kokkos/simd/unit_tests/TestSIMD.cpp | 3 + .../unit_tests/include/SIMDTesting_Ops.hpp | 290 +-- .../include/SIMDTesting_Utilities.hpp | 66 +- .../include/TestSIMD_BitwiseOps.hpp | 577 ++++++ .../include/TestSIMD_ComparisonOps.hpp | 369 ++++ .../unit_tests/include/TestSIMD_Condition.hpp | 4 +- .../include/TestSIMD_Construction.hpp | 25 +- .../include/TestSIMD_Conversions.hpp | 4 +- .../include/TestSIMD_GeneratorCtors.hpp | 43 +- .../unit_tests/include/TestSIMD_LoadStore.hpp | 13 +- .../unit_tests/include/TestSIMD_MaskOps.hpp | 15 +- .../unit_tests/include/TestSIMD_MathOps.hpp | 121 +- .../include/TestSIMD_MemoryPermute.hpp | 355 ++++ .../include/TestSIMD_Reductions.hpp | 49 +- .../unit_tests/include/TestSIMD_ShiftOps.hpp | 71 +- lib/kokkos/tpls/desul-hash.txt | 2 +- .../desul/atomics/Compare_Exchange_GCC.hpp | 7 +- .../include/desul/atomics/Fetch_Op_GCC.hpp | 25 +- .../desul/atomics/Fetch_Op_Generic.hpp | 146 +- .../desul/include/desul/atomics/Generic.hpp | 310 +-- .../atomics/Lock_Based_Fetch_Op_CUDA.hpp | 6 +- .../desul/atomics/Lock_Based_Fetch_Op_HIP.hpp | 6 +- .../atomics/Lock_Based_Fetch_Op_Host.hpp | 5 +- .../atomics/Lock_Based_Fetch_Op_OpenACC.hpp | 5 +- .../atomics/Lock_Based_Fetch_Op_SYCL.hpp | 6 +- .../atomics/Operator_Function_Objects.hpp | 34 +- 463 files changed, 16935 insertions(+), 20045 deletions(-) create mode 100644 lib/kokkos/benchmarks/.clang-tidy delete mode 100644 lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp delete mode 100644 lib/kokkos/containers/src/Kokkos_Vector.hpp delete mode 100644 lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp delete mode 100644 lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp delete mode 100644 lib/kokkos/containers/unit_tests/TestVector.hpp delete mode 100644 lib/kokkos/core/perf_test/PerfTestDriver.hpp delete mode 100644 lib/kokkos/core/perf_test/PerfTestMDRange.hpp create mode 100644 
lib/kokkos/core/perf_test/PerfTestMDRange_Stencil.cpp create mode 100644 lib/kokkos/core/perf_test/PerfTestMDRange_Stream.cpp create mode 100644 lib/kokkos/core/perf_test/PerfTestMDRange_Stream.hpp create mode 100644 lib/kokkos/core/src/HIP/Kokkos_HIP_Half_MathematicalFunctions.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Abort.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp delete mode 100644 
lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp delete mode 100644 lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp delete mode 100644 lib/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp delete mode 100644 lib/kokkos/core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp rename lib/kokkos/core/src/{ => impl}/Kokkos_CheckUsage.hpp (75%) create mode 100644 lib/kokkos/core/src/impl/Kokkos_DeviceHandle.hpp create mode 100644 lib/kokkos/core/src/impl/Kokkos_GraphNodeCtorProps.hpp create mode 100644 lib/kokkos/core/unit_test/TestCustomScalarParallelScan.hpp create mode 100644 lib/kokkos/core/unit_test/TestGraphNodeCtorProps.hpp create mode 100644 lib/kokkos/core/unit_test/TestSubView_c16.hpp delete mode 100644 lib/kokkos/core/unit_test/category_files/TestOpenMPTarget_Category.hpp delete mode 100644 lib/kokkos/core/unit_test/hpx/TestHPX_InParallel.cpp create mode 100644 lib/kokkos/core/unit_test/view/TestMemoryTraitTypes.cpp create mode 100644 lib/kokkos/core/unit_test/view/TestViewCtorDataHandle.hpp create mode 100644 lib/kokkos/core/unit_test/view/TestViewEqualityOperator.hpp create mode 100644 lib/kokkos/docs/CODE_OF_CONDUCT.md delete mode 100644 lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/CMakeLists.txt delete mode 100644 lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/uvm_example.cpp create mode 100644 lib/kokkos/simd/perf_tests/.clang-tidy create mode 100644 lib/kokkos/simd/perf_tests/BenchmarkMain.cpp create mode 100644 lib/kokkos/simd/perf_tests/CMakeLists.txt create mode 100644 lib/kokkos/simd/perf_tests/include/Common.hpp create mode 100644 lib/kokkos/simd/perf_tests/include/PerfTest_Device.hpp create mode 100644 lib/kokkos/simd/perf_tests/include/PerfTest_Host.hpp create mode 100644 
lib/kokkos/simd/perf_tests/include/PerfTest_Operators.hpp create mode 100644 lib/kokkos/simd/src/impl/Kokkos_SIMD_Impl_Macros.hpp create mode 100644 lib/kokkos/simd/unit_tests/include/TestSIMD_BitwiseOps.hpp create mode 100644 lib/kokkos/simd/unit_tests/include/TestSIMD_ComparisonOps.hpp create mode 100644 lib/kokkos/simd/unit_tests/include/TestSIMD_MemoryPermute.hpp diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 3dc2ba12759..049732daf9d 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,4 +1,60 @@ # CHANGELOG +## 5.1.0 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/5.0.2...5.1.0) + +### Features: +* Export Kokkos type traits as C++20 concepts [\#8494](https://github.com/kokkos/kokkos/pull/8494) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Added `Kokkos_ARCH_BLACKWELL103` configure option for NVIDIA B300 GPUs [\#8791](https://github.com/kokkos/kokkos/pull/8791) +* Fix compiling with Clang+Cuda+OpenMP with Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE=ON [\#8810](https://github.com/kokkos/kokkos/pull/8810) +* `nvcc_wrapper`: Add support for `-Ofc` and `--fdevice-time-trace` flags [\#8865](https://github.com/kokkos/kokkos/pull/8865) + +#### HIP: +* Search the CMake variable `ROCM_PATH` for dependencies [\#8669](https://github.com/kokkos/kokkos/pull/8669) +* Added support for brain floating-point (`bhalf_t`) [\#8705](https://github.com/kokkos/kokkos/pull/8705) +* Implemented true reduced-precision mathematical functions (instead of falling back to `float`) [\#8705](https://github.com/kokkos/kokkos/pull/8705) +* Add support for AMD MI355 and MI350 (`AMD_GFX950`) [\#8839](https://github.com/kokkos/kokkos/pull/8839) +* Fix race conditions in HIP `parallel_scan` when running on MI300A [\#8648](https://github.com/kokkos/kokkos/pull/8648) + +### General Enhancements +* Enable ScatterView to contribute into a View that is an rvalue [\#8594](https://github.com/kokkos/kokkos/pull/8594) +* Add bitwise 
operators to simd vectors and simd masks [\#8565](https://github.com/kokkos/kokkos/pull/8565) +* Use Array::size_type for subscript operators [\#8692](https://github.com/kokkos/kokkos/pull/8692) +* Add missing numeric trait `denorm_min` for `Kokkos::Experimental::half_t` and `Kokkos::Experimental::bhalf_t` [\#8769](https://github.com/kokkos/kokkos/pull/8769) +* Use StaticBatchSize in ViewFill [\#8795](https://github.com/kokkos/kokkos/pull/8795) +* Enforce failure when exceeding team_size_max and scratch_size_max checks [\#7445](https://github.com/kokkos/kokkos/pull/7445) +* Enable MPI detection with PALS [\#8895](https://github.com/kokkos/kokkos/pull/8895) +* Add simd memory permute functions [\#8775](https://github.com/kokkos/kokkos/pull/8775) +* Performance improvements using `MDRangePolicy` with `CUDA`, `HIP` and `SYCL` [\#8638](https://github.com/kokkos/kokkos/pull/8638), [\#8731](https://github.com/kokkos/kokkos/pull/8731) +* Add `Kokkos::norm`for `Kokkos::complex`- similar to `std::norm` [\#8627](https://github.com/kokkos/kokkos/pull/8927) +* Use neon and sve SIMD instructions if `nvcc` supports them [\#8667](https://github.com/kokkos/kokkos/pull/8667) +* Expand math support: complete the implementation of all remaining math functions and increase half-type support [\#8595](https://github.com/kokkos/kokkos/pull/8789) [\#8858](https://github.com/kokkos/kokkos/pull/8858) [\#8873](https://github.com/kokkos/kokkos/pull/8873) [\#8712](https://github.com/kokkos/kokkos/pull/8712) [\#8827](https://github.com/kokkos/kokkos/pull/8827) [\#8819](https://github.com/kokkos/kokkos/pull/8819) [\#8719](https://github.com/kokkos/kokkos/pull/8719) [\#8863](https://github.com/kokkos/kokkos/pull/8863) [\#8862](https://github.com/kokkos/kokkos/pull/8862) [\#8778](https://github.com/kokkos/kokkos/pull/8778) [\#8891](https://github.com/kokkos/kokkos/pull/8891) +* Improve performance of `deep_copy` from scalar in view fill using StaticBatchSize 
[\#8795](https://github.com/kokkos/kokkos/pull/8795) [\#8829](https://github.com/kokkos/kokkos/pull/8829) + +### Build System Changes +* Warn about multiple device architectures enabled by `find_package(HIP)` [\#8938](https://github.com/kokkos/kokkos/pull/8938) + +### Incompatibilities (i.e. breaking changes) +* Execution spaces can only be constructed after `Kokkos::initialize()` has been called and must be destructed before `Kokkos::finalize()` [\#8546](https://github.com/kokkos/kokkos/pull/8546) [\#8677](https://github.com/kokkos/kokkos/pull/8677) +* ScatterValue isn't move constructible/assignable anymore [\#8761](https://github.com/kokkos/kokkos/pull/8761) +* Enforce TeamPolicy constructor preconditions (includes vector length must be a power of two) [\#8904](https://github.com/kokkos/kokkos/pull/8904) [\#8907](https://github.com/kokkos/kokkos/pull/8907) +* OpenMP: Warn on exec space instance created within omp region [\#8919](https://github.com/kokkos/kokkos/pull/8919) +* Remove the deprecated OpenMPTarget backend [\#8701](https://github.com/kokkos/kokkos/pull/8701) [\#8717](https://github.com/kokkos/kokkos/pull/8717) [\#8749](https://github.com/kokkos/kokkos/pull/8749) [\#8767](https://github.com/kokkos/kokkos/pull/8767) + +### Bug Fixes +* Fix reduction_identity for BAnd [\#8715](https://github.com/kokkos/kokkos/pull/8715) +* Restrict lock free host atomics to the actual sizes that are lock free [\#8809](https://github.com/kokkos/kokkos/pull/8809) +* Use intrinsics when calling min and max on simd vectors of integral types [\#8899](https://github.com/kokkos/kokkos/pull/8899) +* Adds missing `constexpr` specifiers on `conj()`, and for the `real()` and `imag()` non-member functions taking complex numbers [\#8928](https://github.com/kokkos/kokkos/pull/8928) +* Ensure that execution space instances fence on finalize [\#8626](https://github.com/kokkos/kokkos/pull/8626) +* Update `team_fan_{in|out}` member functions of `ThreadsExecTeamMember` not to call 
host-only functions on the device [\#8730](https://github.com/kokkos/kokkos/pull/8730) +* Make overloads of `isnormal` compliant with std [\#8857](https://github.com/kokkos/kokkos/pull/8857) +* Fix compiler macros to identify GCC and LLVM Clang on OSX [\#8592](https://github.com/kokkos/kokkos/pull/8592) [\#8952](https://github.com/kokkos/kokkos/pull/8952) + ## 5.0.2 [Full Changelog](https://github.com/kokkos/kokkos/compare/5.0.1...5.0.2) @@ -112,6 +168,15 @@ * Work around a performance regression related to index computation in the mdspan-based View [\#8476](https://github.com/kokkos/kokkos/pull/8476) * Fix a failure at configure time when SVE is enabled and the tests are disabled [\#8661](https://github.com/kokkos/kokkos/pull/8661) +## 4.7.02 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.7.01...4.7.02) + +### Bug Fixes +* Link kokkoscore directly with CMAKE_DL_LIBS [\#8456](https://github.com/kokkos/kokkos/pull/8456) +* mdspan fixes for cuda >= 12.9 [\#8562](https://github.com/kokkos/kokkos/pull/8562), [\#8615](https://github.com/kokkos/kokkos/pull/8615) +* Replace cudaMemAdvise_v2 with cudaMemAdvise when CUDART_VERSION >= 13000 [\#8726](https://github.com/kokkos/kokkos/pull/8726) + ## 4.7.01 [Full Changelog](https://github.com/kokkos/kokkos/compare/4.7.00...4.7.01) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 365e8ba4962..57ac49a2a90 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -140,8 +140,8 @@ elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) endif() set(Kokkos_VERSION_MAJOR 5) -set(Kokkos_VERSION_MINOR 0) -set(Kokkos_VERSION_PATCH 2) +set(Kokkos_VERSION_MINOR 1) +set(Kokkos_VERSION_PATCH 99) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") @@ -191,6 +191,10 @@ if(Kokkos_ENABLE_TESTS) 
find_package(GTest QUIET) endif() +if(Kokkos_ENABLE_BENCHMARKS) + find_package(benchmark QUIET 1.8.3) +endif() + # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS # These are functions like KOKKOS_INCLUDE_DIRECTORIES diff --git a/lib/kokkos/COPYRIGHT.md b/lib/kokkos/COPYRIGHT.md index 63184ffcb7c..c73756dbb64 100644 --- a/lib/kokkos/COPYRIGHT.md +++ b/lib/kokkos/COPYRIGHT.md @@ -203,4 +203,3 @@ Date Range: 2022-09-20 -- 2019-06-24 - Scott Kruger; OTHER; scott.e.kruger@gmail.com - Christoph Junghans; OTHER; junghans@votca.org - Daniel Holladay; OTHER; dholladay00@lanl.gov - diff --git a/lib/kokkos/LICENSE b/lib/kokkos/LICENSE index 4d9d69d7c44..7200d2f2adb 100644 --- a/lib/kokkos/LICENSE +++ b/lib/kokkos/LICENSE @@ -221,7 +221,7 @@ NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Questions? Contact: + Questions? Contact: Christian R. Trott (crtrott@sandia.gov) and Damien T. Lebrun-Grandie (lebrungrandt@ornl.gov) diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index a17b17834f1..78a15bb3d59 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -24,7 +24,7 @@ To start learning about Kokkos: - [Programming guide](https://kokkos.org/kokkos-core-wiki/programmingguide.html): contains in "narrative" form a technical description of the programming model, machine model, and the main building blocks like the Views and parallel dispatch. -- [API reference](https://kokkos.org/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.org/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.org/kokkos-core-wiki/API/algorithms-index.html) and [containers](https://kokkos.org/kokkos-core-wiki/API/containers-index.html) or, if you prefer, in [alphabetical order](https://kokkos.org/kokkos-core-wiki/API/alphabetical.html). 
+- [API reference](https://kokkos.org/kokkos-core-wiki/): organized by category, i.e., [core](https://kokkos.org/kokkos-core-wiki/API/core-index.html), [algorithms](https://kokkos.org/kokkos-core-wiki/API/algorithms-index.html), [containers](https://kokkos.org/kokkos-core-wiki/API/containers-index.html), and [simd](https://kokkos.org/kokkos-core-wiki/API/simd-index.html). - [Use cases and Examples](https://kokkos.org/kokkos-core-wiki/tutorials-and-examples/use-cases-and-examples.html): a serie of examples ranging from how to use Kokkos with MPI to Fortran interoperability. diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt index e257e4ccce0..26d9b1386c1 100644 --- a/lib/kokkos/algorithms/CMakeLists.txt +++ b/lib/kokkos/algorithms/CMakeLists.txt @@ -2,10 +2,8 @@ if(NOT Kokkos_INSTALL_TESTING) add_subdirectory(src) endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -if(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) +if(NOT KOKKOS_ENABLE_OPENACC) kokkos_add_test_directories(unit_tests) endif() -if(Kokkos_ENABLE_BENCHMARKS) - add_subdirectory(perf_test) -endif() +kokkos_add_benchmark_directories(perf_test) diff --git a/lib/kokkos/algorithms/perf_test/CMakeLists.txt b/lib/kokkos/algorithms/perf_test/CMakeLists.txt index a9dc04a7552..0b56debf3d0 100644 --- a/lib/kokkos/algorithms/perf_test/CMakeLists.txt +++ b/lib/kokkos/algorithms/perf_test/CMakeLists.txt @@ -1,64 +1,2 @@ -# FIXME: The following logic should be moved from here and also from `core/perf_test/CMakeLists.txt` to -# the root `CMakeLists.txt` in the form of a macro -# Find or download google/benchmark library -find_package(benchmark QUIET 1.8.3) -if(benchmark_FOUND) - message(STATUS "Using google benchmark found in ${benchmark_DIR}") -else() - message(STATUS "No installed google benchmark found, fetching from GitHub") - include(FetchContent) - set(BENCHMARK_ENABLE_TESTING OFF) - - list(APPEND 
CMAKE_MESSAGE_INDENT "[benchmark] ") - FetchContent_Declare( - googlebenchmark - DOWNLOAD_EXTRACT_TIMESTAMP FALSE - URL https://github.com/google/benchmark/archive/refs/tags/v1.8.3.tar.gz - URL_HASH MD5=7b93dd03670665684f1b2e9b70ad17fe - ) - FetchContent_MakeAvailable(googlebenchmark) - list(POP_BACK CMAKE_MESSAGE_INDENT) - - # Suppress clang-tidy diagnostics on code that we do not have control over - if(CMAKE_CXX_CLANG_TIDY) - set_target_properties(benchmark PROPERTIES CXX_CLANG_TIDY "") - endif() - - # FIXME: Check whether the following target_compile_options are needed. - # If so, clarify why. - target_compile_options(benchmark PRIVATE -w) - target_compile_options(benchmark_main PRIVATE -w) -endif() - -# FIXME: This function should be moved from here and also from `core/perf_test/CMakeLists.txt` to -# the root `CMakeLists.txt` -# FIXME: Could NAME be a one_value_keyword specified in cmake_parse_arguments? -function(KOKKOS_ADD_BENCHMARK NAME) - cmake_parse_arguments(BENCHMARK "" "" "SOURCES" ${ARGN}) - if(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - message(WARNING "Unexpected arguments when adding a benchmark: " ${BENCHMARK_UNPARSED_ARGUMENTS}) - endif() - - set(BENCHMARK_NAME Kokkos_${NAME}) - # FIXME: BenchmarkMain.cpp and Benchmark_Context.cpp should be moved to a common location from which - # they can be used by all performance tests. - list(APPEND BENCHMARK_SOURCES ../../core/perf_test/BenchmarkMain.cpp ../../core/perf_test/Benchmark_Context.cpp) - - add_executable(${BENCHMARK_NAME} ${BENCHMARK_SOURCES}) - target_link_libraries(${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version) - target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include) - - # FIXME: This alone will not work. It might need an architecture and standard which need to be defined on target level. - # It will potentially go away with #7582. 
- foreach(SOURCE_FILE ${BENCHMARK_SOURCES}) - set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - endforeach() - - string(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - set(BENCHMARK_ARGS --benchmark_counters_tabular=true --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json) - - add_test(NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS}) -endfunction() - kokkos_add_benchmark(PerformanceTest_InclusiveScan SOURCES test_inclusive_scan.cpp) kokkos_add_benchmark(PerformanceTest_Random SOURCES test_random.cpp) diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index b1b978e68d3..db108081b15 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -85,7 +85,7 @@ namespace Kokkos { Pool(const execution_space& exec, uint64_t seed); //Initializing constructor - //Initialize Pool with seed as a starting seed with a pool_size of num_states using the + //Initialize Pool with seed as a starting seed with a pool_size of num_states using the //specified execution space instance Pool(const execution_space& exec, uint64_t seed, uint64_t num_states); @@ -611,11 +611,6 @@ struct Random_XorShift1024_UseCArrayState : std::false_type {}; template <> struct Random_XorShift1024_UseCArrayState : std::false_type {}; #endif -#ifdef KOKKOS_ENABLE_OPENMPTARGET -template <> -struct Random_XorShift1024_UseCArrayState - : std::false_type {}; -#endif #ifdef KOKKOS_ENABLE_OPENACC template <> struct Random_XorShift1024_UseCArrayState @@ -716,28 +711,6 @@ struct Random_UniqueIndex> { }; #endif -#ifdef KOKKOS_ENABLE_OPENMPTARGET -template -struct Random_UniqueIndex< - Kokkos::Device> { - using locks_view_type = - View>; - KOKKOS_FUNCTION - static int get_state_idx(const locks_view_type& locks) { - const int team_size = omp_get_num_threads(); - int i = omp_get_team_num() * team_size + omp_get_thread_num(); - const int lock_size = 
locks.extent_int(0); - - i %= lock_size; - while (Kokkos::atomic_compare_exchange(&locks(i, 0), 0, 1)) { - i = (i + 1) % lock_size; - } - return i; - } -}; -#endif - #ifdef KOKKOS_ENABLE_OPENACC template struct Random_UniqueIndex< @@ -934,12 +907,6 @@ class Random_XorShift64_Pool { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 KOKKOS_DEFAULTED_FUNCTION Random_XorShift64_Pool() = default; - - KOKKOS_DEFAULTED_FUNCTION Random_XorShift64_Pool( - Random_XorShift64_Pool const&) = default; - - KOKKOS_DEFAULTED_FUNCTION Random_XorShift64_Pool& operator=( - Random_XorShift64_Pool const&) = default; #else Random_XorShift64_Pool() = default; #endif @@ -1214,12 +1181,6 @@ class Random_XorShift1024_Pool { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 KOKKOS_DEFAULTED_FUNCTION Random_XorShift1024_Pool() = default; - - KOKKOS_DEFAULTED_FUNCTION Random_XorShift1024_Pool( - Random_XorShift1024_Pool const&) = default; - - KOKKOS_DEFAULTED_FUNCTION Random_XorShift1024_Pool& operator=( - Random_XorShift1024_Pool const&) = default; #else Random_XorShift1024_Pool() = default; #endif diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp index 8bd4a11e581..ddc4e4bfdf4 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp @@ -15,11 +15,7 @@ struct BinOp1D { double mul_ = {}; double min_ = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED BinOp1D() = default; -#else BinOp1D() = delete; -#endif // Construct BinOp with number of bins, minimum value and maximum value BinOp1D(int max_bins, typename KeyViewType::const_value_type min, @@ -63,11 +59,7 @@ struct BinOp3D { double mul_[3] = {}; double min_[3] = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED BinOp3D() = default; -#else BinOp3D() = delete; -#endif BinOp3D(int max_bins[], typename KeyViewType::const_value_type min[], typename 
KeyViewType::const_value_type max[]) { diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp index 707c48fb74e..0c0747fbce0 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -136,11 +136,7 @@ class BinSort { bool sort_within_bins; public: -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED BinSort() = default; -#else BinSort() = delete; -#endif //---------------------------------------- // Constructor: takes the keys, the binning_operator and optionally whether to diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp index 4d8091377fe..29ac4cd63f7 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -26,38 +26,16 @@ import kokkos.core; #pragma GCC diagnostic ignored "-Wshadow" #pragma GCC diagnostic ignored "-Wsuggest-override" -#if defined(KOKKOS_COMPILER_CLANG) -// Some versions of Clang fail to compile Thrust, failing with errors like -// this: -// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: -// error: use of undeclared identifier 'va_printf' -// The exact combination of versions for Clang and Thrust (or CUDA) for this -// failure was not investigated, however even very recent version combination -// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. 
-// -// Defining _CubLog here locally allows us to avoid that code path, however -// disabling some debugging diagnostics -#pragma push_macro("_CubLog") -#ifdef _CubLog -#undef _CubLog -#endif -// NOLINTNEXTLINE(bugprone-reserved-identifier) -#define _CubLog -#include -#include -#pragma pop_macro("_CubLog") -#else #include #include -#endif #pragma GCC diagnostic pop -#endif +#elif defined(KOKKOS_ENABLE_ROCTHRUST) -#if defined(KOKKOS_ENABLE_ROCTHRUST) #include #include + #endif #ifdef KOKKOS_ENABLE_ONEDPL diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index ffbdf307ee3..71a6a75e360 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -31,38 +31,16 @@ import kokkos.core; #pragma GCC diagnostic ignored "-Wshadow" #pragma GCC diagnostic ignored "-Wsuggest-override" -#if defined(KOKKOS_COMPILER_CLANG) -// Some versions of Clang fail to compile Thrust, failing with errors like -// this: -// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: -// error: use of undeclared identifier 'va_printf' -// The exact combination of versions for Clang and Thrust (or CUDA) for this -// failure was not investigated, however even very recent version combination -// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. 
-// -// Defining _CubLog here locally allows us to avoid that code path, however -// disabling some debugging diagnostics -#pragma push_macro("_CubLog") -#ifdef _CubLog -#undef _CubLog -#endif -// NOLINTNEXTLINE(bugprone-reserved-identifier) -#define _CubLog -#include -#include -#pragma pop_macro("_CubLog") -#else #include #include -#endif #pragma GCC diagnostic pop -#endif +#elif defined(KOKKOS_ENABLE_ROCTHRUST) -#if defined(KOKKOS_ENABLE_ROCTHRUST) #include #include + #endif #if defined(KOKKOS_ENABLE_ONEDPL) diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp index 4b562ee1365..b942c9ac2b7 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_ExclusiveScan.hpp @@ -28,9 +28,11 @@ OutputIteratorType exclusive_scan(const ExecutionSpace& ex, ValueType init_value) { static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_default_op_exespace_impl( + using binary_op = Impl::StdExclusiveScanDefaultJoinFunctor; + + return Impl::exclusive_scan_exespace_impl( "Kokkos::exclusive_scan_default_functors_iterator_api", ex, first, last, - first_dest, std::move(init_value)); + first_dest, std::move(init_value), binary_op()); } template , "ValueType must be move constructible."); - return Impl::exclusive_scan_default_op_exespace_impl( - label, ex, first, last, first_dest, std::move(init_value)); + using binary_op = Impl::StdExclusiveScanDefaultJoinFunctor; + + return Impl::exclusive_scan_exespace_impl(label, ex, first, last, first_dest, + std::move(init_value), binary_op()); } template < @@ -63,11 +67,13 @@ auto exclusive_scan(const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); + using binary_op = 
Impl::StdExclusiveScanDefaultJoinFunctor; + namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_default_op_exespace_impl( + return Impl::exclusive_scan_exespace_impl( "Kokkos::exclusive_scan_default_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), - std::move(init_value)); + std::move(init_value), binary_op()); } template < @@ -82,10 +88,12 @@ auto exclusive_scan(const std::string& label, const ExecutionSpace& ex, Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); + using binary_op = Impl::StdExclusiveScanDefaultJoinFunctor; + namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_default_op_exespace_impl( + return Impl::exclusive_scan_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), - KE::begin(view_dest), std::move(init_value)); + KE::begin(view_dest), std::move(init_value), binary_op()); } // overload set 2 @@ -101,10 +109,9 @@ OutputIteratorType exclusive_scan(const ExecutionSpace& ex, InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) { - Impl::static_assert_is_not_openmptarget(ex); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_custom_op_exespace_impl( + return Impl::exclusive_scan_exespace_impl( "Kokkos::exclusive_scan_custom_functors_iterator_api", ex, first, last, first_dest, std::move(init_value), bop); } @@ -122,11 +129,10 @@ OutputIteratorType exclusive_scan(const std::string& label, InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) { - Impl::static_assert_is_not_openmptarget(ex); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); - return Impl::exclusive_scan_custom_op_exespace_impl( - label, ex, first, last, first_dest, std::move(init_value), bop); + return 
Impl::exclusive_scan_exespace_impl(label, ex, first, last, first_dest, + std::move(init_value), bop); } template < @@ -138,13 +144,12 @@ auto exclusive_scan(const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value, BinaryOpType bop) { - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_custom_op_exespace_impl( + return Impl::exclusive_scan_exespace_impl( "Kokkos::exclusive_scan_custom_functors_view_api", ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), std::move(init_value), bop); @@ -159,13 +164,12 @@ auto exclusive_scan(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value, BinaryOpType bop) { - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; - return Impl::exclusive_scan_custom_op_exespace_impl( + return Impl::exclusive_scan_exespace_impl( label, ex, KE::cbegin(view_from), KE::cend(view_from), KE::begin(view_dest), std::move(init_value), bop); } @@ -223,7 +227,6 @@ KOKKOS_FUNCTION OutputIteratorType exclusive_scan(const TeamHandleType& teamHandle, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) { - Impl::static_assert_is_not_openmptarget(teamHandle); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::exclusive_scan_custom_op_team_impl( @@ 
-239,7 +242,6 @@ KOKKOS_FUNCTION auto exclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value, BinaryOpType bop) { - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp index 7a17cea9675..9e7fb5ea5bb 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IsSorted.hpp @@ -59,7 +59,6 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_iterator_api_default", ex, first, last, std::move(comp)); } @@ -69,7 +68,6 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> bool is_sorted(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); return Impl::is_sorted_exespace_impl(label, ex, first, last, std::move(comp)); } @@ -81,7 +79,6 @@ bool is_sorted(const ExecutionSpace& ex, const ::Kokkos::View& view, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - Impl::static_assert_is_not_openmptarget(ex); namespace KE = ::Kokkos::Experimental; return Impl::is_sorted_exespace_impl("Kokkos::is_sorted_view_api_default", ex, @@ -97,7 +94,6 @@ bool is_sorted(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - 
Impl::static_assert_is_not_openmptarget(ex); namespace KE = ::Kokkos::Experimental; return Impl::is_sorted_exespace_impl(label, ex, KE::cbegin(view), @@ -134,7 +130,6 @@ template & view, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - Impl::static_assert_is_not_openmptarget(teamHandle); namespace KE = ::Kokkos::Experimental; return Impl::is_sorted_team_impl(teamHandle, KE::cbegin(view), KE::cend(view), diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp index d110da6620d..3e17e6f9dc8 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IsSortedUntil.hpp @@ -60,7 +60,6 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> IteratorType is_sorted_until(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); return Impl::is_sorted_until_exespace_impl( "Kokkos::is_sorted_until_iterator_api_default", ex, first, last, std::move(comp)); @@ -72,8 +71,6 @@ template < IteratorType is_sorted_until(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::is_sorted_until_exespace_impl(label, ex, first, last, std::move(comp)); } @@ -86,7 +83,6 @@ auto is_sorted_until(const ExecutionSpace& ex, const ::Kokkos::View& view, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - Impl::static_assert_is_not_openmptarget(ex); namespace KE = ::Kokkos::Experimental; return Impl::is_sorted_until_exespace_impl( @@ -102,7 +98,6 @@ auto is_sorted_until(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& view, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - 
Impl::static_assert_is_not_openmptarget(ex); namespace KE = ::Kokkos::Experimental; return Impl::is_sorted_until_exespace_impl(label, ex, KE::begin(view), @@ -139,7 +134,6 @@ KOKKOS_FUNCTION IteratorType is_sorted_until(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(teamHandle); return Impl::is_sorted_until_team_impl(teamHandle, first, last, std::move(comp)); } @@ -151,7 +145,6 @@ KOKKOS_FUNCTION auto is_sorted_until( const TeamHandleType& teamHandle, const ::Kokkos::View& view, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); - Impl::static_assert_is_not_openmptarget(teamHandle); namespace KE = ::Kokkos::Experimental; return Impl::is_sorted_until_team_impl(teamHandle, KE::begin(view), diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp index e1934dd8357..5e1bb130157 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_IterSwap.hpp @@ -50,16 +50,6 @@ void iter_swap(IteratorType1 a, IteratorType2 b) { Impl::iter_swap_impl(a, b); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::kokkos_swap instead!") -KOKKOS_FUNCTION - void swap(T& a, T& b) noexcept(::Kokkos::kokkos_swap(std::declval(), - std::declval())) { - ::Kokkos::kokkos_swap(a, b); -} -#endif - } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp index 2e128cd1718..1b79e66ea61 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MaxElement.hpp @@ -36,8 +36,6 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const ExecutionSpace& 
ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_exespace_impl( "Kokkos::max_element_iterator_api_default", ex, first, last, std::move(comp)); @@ -48,8 +46,6 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto max_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_exespace_impl( label, ex, first, last, std::move(comp)); } @@ -84,7 +80,6 @@ auto max_element(const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - Impl::static_assert_is_not_openmptarget(ex); return Impl::min_or_max_element_exespace_impl( "Kokkos::max_element_view_api_default", ex, begin(v), end(v), @@ -99,7 +94,6 @@ auto max_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - Impl::static_assert_is_not_openmptarget(ex); return Impl::min_or_max_element_exespace_impl( label, ex, begin(v), end(v), std::move(comp)); @@ -135,7 +129,6 @@ template ( teamHandle, first, last, std::move(comp)); } @@ -147,7 +140,6 @@ KOKKOS_FUNCTION auto max_element( const TeamHandleType& teamHandle, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - Impl::static_assert_is_not_openmptarget(teamHandle); return Impl::min_or_max_element_team_impl( teamHandle, begin(v), end(v), std::move(comp)); } diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp index 99376507976..7f7e7573c8b 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp +++ 
b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MinElement.hpp @@ -36,8 +36,6 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_exespace_impl( "Kokkos::min_element_iterator_api_default", ex, first, last, std::move(comp)); @@ -48,8 +46,6 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto min_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::min_or_max_element_exespace_impl( label, ex, first, last, std::move(comp)); } @@ -73,7 +69,6 @@ auto min_element(const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - Impl::static_assert_is_not_openmptarget(ex); return Impl::min_or_max_element_exespace_impl( "Kokkos::min_element_view_api_default", ex, begin(v), end(v), @@ -99,7 +94,6 @@ auto min_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - Impl::static_assert_is_not_openmptarget(ex); return Impl::min_or_max_element_exespace_impl( label, ex, begin(v), end(v), std::move(comp)); @@ -135,7 +129,6 @@ template ( teamHandle, first, last, std::move(comp)); } @@ -146,7 +139,6 @@ template & v, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); return Impl::min_or_max_element_team_impl( teamHandle, begin(v), end(v), std::move(comp)); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp index d5cc34c77d7..29a961df6b9 100644 --- 
a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_MinMaxElement.hpp @@ -36,8 +36,6 @@ template < std::enable_if_t<::Kokkos::is_execution_space_v, int> = 0> auto minmax_element(const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::minmax_element_exespace_impl( "Kokkos::minmax_element_iterator_api_default", ex, first, last, std::move(comp)); @@ -49,8 +47,6 @@ template < auto minmax_element(const std::string& label, const ExecutionSpace& ex, IteratorType first, IteratorType last, ComparatorType comp) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::minmax_element_exespace_impl( label, ex, first, last, std::move(comp)); } @@ -85,7 +81,6 @@ auto minmax_element(const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - Impl::static_assert_is_not_openmptarget(ex); return Impl::minmax_element_exespace_impl( "Kokkos::minmax_element_view_api_default", ex, begin(v), end(v), @@ -100,7 +95,6 @@ auto minmax_element(const std::string& label, const ExecutionSpace& ex, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - Impl::static_assert_is_not_openmptarget(ex); return Impl::minmax_element_exespace_impl( label, ex, begin(v), end(v), std::move(comp)); @@ -125,8 +119,6 @@ template ( teamHandle, first, last, std::move(comp)); } @@ -149,7 +141,6 @@ KOKKOS_FUNCTION auto minmax_element( const TeamHandleType& teamHandle, const ::Kokkos::View& v, ComparatorType comp) { Impl::static_assert_is_admissible_to_kokkos_std_algorithms(v); - Impl::static_assert_is_not_openmptarget(teamHandle); return Impl::minmax_element_team_impl( teamHandle, begin(v), end(v), std::move(comp)); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp 
b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp index 2c41f932657..d4510323857 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformExclusiveScan.hpp @@ -24,7 +24,6 @@ OutputIteratorType transform_exclusive_scan( const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(ex); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_exclusive_scan_exespace_impl( @@ -43,7 +42,6 @@ OutputIteratorType transform_exclusive_scan( const std::string& label, const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(ex); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_exclusive_scan_exespace_impl( @@ -61,7 +59,6 @@ auto transform_exclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, @@ -83,7 +80,6 @@ auto transform_exclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, @@ -110,7 
+106,6 @@ KOKKOS_FUNCTION OutputIteratorType transform_exclusive_scan( const TeamHandleType& teamHandle, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(teamHandle); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_exclusive_scan_team_impl( @@ -127,7 +122,6 @@ KOKKOS_FUNCTION auto transform_exclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, ValueType init_value, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp index 040aa091ee2..f54e5a2a4fb 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformInclusiveScan.hpp @@ -28,8 +28,6 @@ OutputIteratorType transform_inclusive_scan(const ExecutionSpace& ex, OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_exespace_impl( "Kokkos::transform_inclusive_scan_custom_functors_iterator_api", ex, first, last, first_dest, binary_op, unary_op); @@ -46,8 +44,6 @@ OutputIteratorType transform_inclusive_scan( const std::string& label, const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(ex); - return Impl::transform_inclusive_scan_exespace_impl( label, ex, 
first, last, first_dest, binary_op, unary_op); } @@ -62,7 +58,6 @@ auto transform_inclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; @@ -82,7 +77,6 @@ auto transform_inclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; @@ -104,7 +98,6 @@ OutputIteratorType transform_inclusive_scan( const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { - Impl::static_assert_is_not_openmptarget(ex); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); @@ -125,7 +118,6 @@ OutputIteratorType transform_inclusive_scan( const std::string& label, const ExecutionSpace& ex, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { - Impl::static_assert_is_not_openmptarget(ex); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); @@ -144,7 +136,6 @@ auto transform_inclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); 
static_assert(std::is_move_constructible_v, @@ -167,7 +158,6 @@ auto transform_inclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, @@ -197,8 +187,6 @@ KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan( const TeamHandleType& teamHandle, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(teamHandle); - return Impl::transform_inclusive_scan_team_impl( teamHandle, first, last, first_dest, binary_op, unary_op); } @@ -212,7 +200,6 @@ KOKKOS_FUNCTION auto transform_inclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOpType binary_op, UnaryOpType unary_op) { - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); namespace KE = ::Kokkos::Experimental; @@ -233,7 +220,6 @@ KOKKOS_FUNCTION OutputIteratorType transform_inclusive_scan( const TeamHandleType& teamHandle, InputIteratorType first, InputIteratorType last, OutputIteratorType first_dest, BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { - Impl::static_assert_is_not_openmptarget(teamHandle); static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); @@ -251,7 +237,6 @@ KOKKOS_FUNCTION auto transform_inclusive_scan( const ::Kokkos::View& view_from, const ::Kokkos::View& view_dest, BinaryOpType binary_op, UnaryOpType unary_op, ValueType init_value) { - Impl::static_assert_is_not_openmptarget(teamHandle); 
Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_from); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view_dest); static_assert(std::is_move_constructible_v, diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 6949488a655..519d72d0ad6 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -190,28 +190,6 @@ static_assert_iterators_have_matching_difference_type(IteratorType1 it1, static_assert_iterators_have_matching_difference_type(it2, it3); } -// -// not_openmptarget -// -template -struct not_openmptarget { -#ifndef KOKKOS_ENABLE_OPENMPTARGET - static constexpr bool value = true; -#else - static constexpr bool value = - !std::is_same, - ::Kokkos::Experimental::OpenMPTarget>::value; -#endif -}; - -template -KOKKOS_INLINE_FUNCTION constexpr void static_assert_is_not_openmptarget( - const ExecutionSpaceOrTeamHandleType& /*ex_or_th*/) { - static_assert(not_openmptarget::value, - "Currently, Kokkos standard algorithms do not support custom " - "comparators in OpenMPTarget"); -} - // // valid range // diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp index 91436f06d4b..91e762cb03f 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_CopyIf.hpp @@ -141,11 +141,6 @@ KOKKOS_FUNCTION OutputIterator copy_if_team_impl( // no barrier needed because of the scan accumulating into count return d_first + count; } - -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } // namespace Impl diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp 
b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp index 15732962f95..3f1ded2a138 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ExclusiveScan.hpp @@ -23,67 +23,17 @@ namespace Kokkos { namespace Experimental { namespace Impl { -// -// exespace impl -// -template -OutputIteratorType exclusive_scan_default_op_exespace_impl( - const std::string& label, const ExecutionSpace& ex, - InputIteratorType first_from, InputIteratorType last_from, - OutputIteratorType first_dest, ValueType init_value) { - // checks - Impl::static_assert_random_access_and_accessible(ex, first_from, first_dest); - Impl::static_assert_iterators_have_matching_difference_type(first_from, - first_dest); - Impl::expect_valid_range(first_from, last_from); - - // does it make sense to do this static_assert too? - // using input_iterator_value_type = typename InputIteratorType::value_type; - // static_assert - // (std::is_convertible, - // ValueType>::value, - // "exclusive_scan: InputIteratorType::value_type not convertible to - // ValueType"); - - // we are unnecessarily duplicating code, but this is on purpose - // so that we can use the default_op for OpenMPTarget. - // Originally, I had this implemented as: - // ''' - // using bop_type = StdExclusiveScanDefaultJoinFunctor; - // call exclusive_scan_custom_op_impl(..., bop_type()); - // ''' - // which avoids duplicating the functors, but for OpenMPTarget - // I cannot use a custom binary op. - // This is the same problem that occurs for reductions. 
- - // aliases - using index_type = typename InputIteratorType::difference_type; - using func_type = std::conditional_t< - ::Kokkos::is_detected::value, - ExclusiveScanDefaultFunctorForKnownNeutralElement< - ExecutionSpace, index_type, ValueType, InputIteratorType, - OutputIteratorType>, - ExclusiveScanDefaultFunctorWithValueWrapper>; - - // run - const auto num_elements = - Kokkos::Experimental::distance(first_from, last_from); - ::Kokkos::parallel_scan( - label, RangePolicy(ex, 0, num_elements), - func_type(std::move(init_value), first_from, first_dest)); - - ex.fence("Kokkos::exclusive_scan_default_op: fence after operation"); - - return first_dest + num_elements; -} +template +struct StdExclusiveScanDefaultJoinFunctor { + KOKKOS_FUNCTION + constexpr ValueType operator()(const ValueType& a, const ValueType& b) const { + return a + b; + } +}; template -OutputIteratorType exclusive_scan_custom_op_exespace_impl( +OutputIteratorType exclusive_scan_exespace_impl( const std::string& label, const ExecutionSpace& ex, InputIteratorType first_from, InputIteratorType last_from, OutputIteratorType first_dest, ValueType init_value, BinaryOpType bop) { @@ -107,7 +57,7 @@ OutputIteratorType exclusive_scan_custom_op_exespace_impl( RangePolicy(ex, 0, num_elements), func_type(std::move(init_value), first_from, first_dest, bop, unary_op_type())); - ex.fence("Kokkos::exclusive_scan_custom_op: fence after operation"); + ex.fence("Kokkos::exclusive_scan: fence after operation"); // return return first_dest + num_elements; diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp index dfa5e892662..4d747ac6015 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_InclusiveScan.hpp @@ -30,38 +30,18 @@ import kokkos.core; #pragma GCC diagnostic ignored "-Wshadow" #pragma GCC diagnostic ignored 
"-Wsuggest-override" -#if defined(KOKKOS_COMPILER_CLANG) -// Some versions of Clang fail to compile Thrust, failing with errors like -// this: -// /thrust/system/cuda/detail/core/agent_launcher.h:557:11: -// error: use of undeclared identifier 'va_printf' -// The exact combination of versions for Clang and Thrust (or CUDA) for this -// failure was not investigated, however even very recent version combination -// (Clang 10.0.0 and Cuda 10.0) demonstrated failure. -// -// Defining _CubLog here locally allows us to avoid that code path, however -// disabling some debugging diagnostics -#pragma push_macro("_CubLog") -#ifdef _CubLog -#undef _CubLog -#endif -// NOLINTNEXTLINE(bugprone-reserved-identifier) -#define _CubLog -#include -#include -#pragma pop_macro("_CubLog") +#if CUDA_VERSION >= 13010 +#include #else #include -#include #endif +#include -#pragma GCC diagnostic pop - -#endif +#elif defined(KOKKOS_ENABLE_ROCTHRUST) -#if defined(KOKKOS_ENABLE_ROCTHRUST) #include #include + #endif namespace Kokkos { @@ -158,7 +138,11 @@ OutputIteratorType inclusive_scan_default_op_exespace_impl( Kokkos::Profiling::popRegion(); +#if CUDA_VERSION >= 13010 + const auto num_elements = cuda::std::distance(first_from, last_from); +#else const auto num_elements = thrust::distance(first_from, last_from); +#endif return first_dest + num_elements; } @@ -244,7 +228,11 @@ OutputIteratorType inclusive_scan_custom_binary_op_exespace_impl( Kokkos::Profiling::popRegion(); +#if CUDA_VERSION >= 13010 + const auto num_elements = cuda::std::distance(first_from, last_from); +#else const auto num_elements = thrust::distance(first_from, last_from); +#endif return first_dest + num_elements; } diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp index 0eb7a35dcee..7da583841af 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp +++ 
b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reduce.hpp @@ -82,7 +82,6 @@ ValueType reduce_custom_functors_exespace_impl( IteratorType last, ValueType init_reduction_value, JoinerType joiner) { // checks Impl::static_assert_random_access_and_accessible(ex, first); - Impl::static_assert_is_not_openmptarget(ex); Impl::expect_valid_range(first, last); if (first == last) { @@ -113,7 +112,6 @@ ValueType reduce_default_functors_exespace_impl( IteratorType last, ValueType init_reduction_value) { // checks Impl::static_assert_random_access_and_accessible(ex, first); - Impl::static_assert_is_not_openmptarget(ex); Impl::expect_valid_range(first, last); using value_type = std::remove_cvref_t; @@ -157,7 +155,6 @@ KOKKOS_FUNCTION ValueType reduce_custom_functors_team_impl( ValueType init_reduction_value, JoinerType joiner) { // checks Impl::static_assert_random_access_and_accessible(teamHandle, first); - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::expect_valid_range(first, last); if (first == last) { @@ -188,7 +185,6 @@ KOKKOS_FUNCTION ValueType reduce_default_functors_team_impl( ValueType init_reduction_value) { // checks Impl::static_assert_random_access_and_accessible(teamHandle, first); - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::expect_valid_range(first, last); using value_type = std::remove_cvref_t; diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp index 52b575fdd1f..9c512863929 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_TransformReduce.hpp @@ -114,7 +114,6 @@ ValueType transform_reduce_custom_functors_exespace_impl( UnaryTransformerType transformer) { // checks Impl::static_assert_random_access_and_accessible(ex, first); - Impl::static_assert_is_not_openmptarget(ex); Impl::expect_valid_range(first, last); if (first == 
last) { @@ -153,7 +152,6 @@ ValueType transform_reduce_custom_functors_exespace_impl( JoinerType joiner, BinaryTransformerType transformer) { // checks Impl::static_assert_random_access_and_accessible(ex, first1, first2); - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_iterators_have_matching_difference_type(first1, first2); Impl::expect_valid_range(first1, last1); @@ -192,7 +190,6 @@ ValueType transform_reduce_default_functors_exespace_impl( IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value) { // checks Impl::static_assert_random_access_and_accessible(ex, first1, first2); - Impl::static_assert_is_not_openmptarget(ex); Impl::static_assert_iterators_have_matching_difference_type(first1, first2); Impl::expect_valid_range(first1, last1); @@ -218,7 +215,6 @@ KOKKOS_FUNCTION ValueType transform_reduce_custom_functors_team_impl( UnaryTransformerType transformer) { // checks Impl::static_assert_random_access_and_accessible(teamHandle, first); - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::expect_valid_range(first, last); if (first == last) { @@ -256,7 +252,6 @@ KOKKOS_FUNCTION ValueType transform_reduce_custom_functors_team_impl( BinaryTransformerType transformer) { // checks Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2); - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::static_assert_iterators_have_matching_difference_type(first1, first2); Impl::expect_valid_range(first1, last1); @@ -296,7 +291,6 @@ KOKKOS_FUNCTION ValueType transform_reduce_default_functors_team_impl( IteratorType2 first2, ValueType init_reduction_value) { // checks Impl::static_assert_random_access_and_accessible(teamHandle, first1, first2); - Impl::static_assert_is_not_openmptarget(teamHandle); Impl::static_assert_iterators_have_matching_difference_type(first1, first2); Impl::expect_valid_range(first1, last1); diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp 
b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp index e2aef5e1551..7d8d38e2ad9 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_UniqueCopy.hpp @@ -166,11 +166,6 @@ KOKKOS_FUNCTION OutputIterator unique_copy_team_impl( return Impl::copy_team_impl(teamHandle, first + scan_size, last, d_first + count); } - -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index cca6899594d..d9288bdbd26 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -6,7 +6,7 @@ kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) set(ALGORITHM UnitTestMain.cpp) -foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) +foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL) string(TOUPPER ${Tag} DEVICE) string(TOLOWER ${Tag} dir) @@ -150,11 +150,6 @@ set(STDALGO_TEAM_SOURCES_P) foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) endforeach() -if(KOKKOS_ENABLE_OPENMPTARGET) # FIXME_OPENMPTARGET - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_P TestStdAlgorithmsTeamExclusiveScan.cpp - TestStdAlgorithmsTeamTransformExclusiveScan.cpp - ) -endif() # ------------------------------------------ # std team M @@ -206,12 +201,6 @@ foreach( list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) endforeach() -if(KOKKOS_ENABLE_OPENMPTARGET) # FIXME_OPENMPTARGET - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_H TestStdAlgorithmsTeamCopyIf.cpp TestStdAlgorithmsTeamUniqueCopy.cpp - TestStdAlgorithmsTeamRemoveCopy.cpp TestStdAlgorithmsTeamRemoveCopyIf.cpp - ) -endif() - # ------------------------------------------ # std 
team G # ------------------------------------------ @@ -303,61 +292,10 @@ foreach( list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() -# FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. -if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION - VERSION_GREATER_EQUAL 16.0.0 -) - list(REMOVE_ITEM ALGO_SORT_SOURCES TestSort.cpp) -endif() - -# FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases -# the impl needs to use either Kokkos or tailored reducers -# which results in runtime memory errors. -if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L TestStdAlgorithmsTeamIsPartitioned.cpp - TestStdAlgorithmsTeamPartitionPoint.cpp TestStdAlgorithmsTeamPartitionCopy.cpp - ) -endif() - -# FIXME_OPENMPTARGET need to remove tests for OpenMPTarget because -# in these cases the impl needs to use either Kokkos or -# tailored reducers which results in runtime memory errors. -if(KOKKOS_ENABLE_OPENMPTARGET) - list( - REMOVE_ITEM - STDALGO_TEAM_SOURCES_C - TestStdAlgorithmsTeamFind.cpp - TestStdAlgorithmsTeamFindIf.cpp - TestStdAlgorithmsTeamFindIfNot.cpp - TestStdAlgorithmsTeamAllOf.cpp - TestStdAlgorithmsTeamAnyOf.cpp - TestStdAlgorithmsTeamNoneOf.cpp - TestStdAlgorithmsTeamSearchN.cpp - ) -endif() - kokkos_add_executable_and_test(UnitTest_Sort SOURCES UnitTestMain.cpp TestStdAlgorithmsCommon.cpp ${ALGO_SORT_SOURCES}) kokkos_add_executable_and_test(UnitTest_Random SOURCES UnitTestMain.cpp ${ALGO_RANDOM_SOURCES}) -# FIXME_OPENMPTARGET remove tests for OpenMPTarget -# causing failures for various reasons -if(KOKKOS_ENABLE_OPENMPTARGET) - # the following use either Kokkos or tailored reducers - # which results in runtime memory errors. 
- list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B TestStdAlgorithmsTeamFindEnd.cpp TestStdAlgorithmsTeamFindFirstOf.cpp - TestStdAlgorithmsTeamSearch.cpp - ) - - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A TestStdAlgorithmsTeamAdjacentFind.cpp - TestStdAlgorithmsTeamLexicographicalCompare.cpp TestStdAlgorithmsTeamMismatch.cpp - ) - - # this causes an illegal memory access if team_members_have_matching_result - # is called - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M TestStdAlgorithmsTeamTransformBinaryOp.cpp) -endif() - foreach(ID A;B;C;D;E) kokkos_add_executable_and_test(AlgorithmsUnitTest_StdSet_${ID} SOURCES UnitTestMain.cpp ${STDALGO_SOURCES_${ID}}) endforeach() diff --git a/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp b/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp index eda035916e7..7ae61535de0 100644 --- a/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp @@ -217,10 +217,6 @@ void test_sort_integer_overflow() { } // namespace BinSortSetA TEST(TEST_CATEGORY, BinSortGenericTests) { - // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler -#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; -#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; diff --git a/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp b/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp index cf2f11080db..5fe41473f56 100644 --- a/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp @@ -178,10 +178,6 @@ void run_for_rank2() { } // namespace BinSortSetB TEST(TEST_CATEGORY, BinSortUnsignedKeyLayoutStrideValues) { - // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler -#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; -#endif using ExeSpace = 
TEST_EXECSPACE; using key_type = unsigned; BinSortSetB::run_for_rank1(); diff --git a/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp b/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp index 05892be4da4..734e5f93112 100644 --- a/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp @@ -379,11 +379,6 @@ void test_nested_sort_by_key(unsigned int N, KeyType minKey, KeyType maxKey, } // namespace NestedSortImpl TEST(TEST_CATEGORY, NestedSort) { - // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler -#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; -#endif - using ExecutionSpace = TEST_EXECSPACE; NestedSortImpl::test_nested_sort(171, 0U, UINT_MAX); NestedSortImpl::test_nested_sort(42, -1e6f, 1e6f); @@ -392,11 +387,6 @@ TEST(TEST_CATEGORY, NestedSort) { } TEST(TEST_CATEGORY, NestedSortByKey) { - // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler -#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; -#endif - using ExecutionSpace = TEST_EXECSPACE; // Second/third template arguments are key and value respectively. 
diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index 2fade46acee..0057a389c40 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -71,16 +71,6 @@ struct RandomProperties { } }; -// FIXME_OPENMPTARGET: Need this for OpenMPTarget because contra to the standard -// llvm requires the binary operator defined not just the += -KOKKOS_INLINE_FUNCTION -RandomProperties operator+(const RandomProperties& org, - const RandomProperties& add) { - RandomProperties val = org; - val += add; - return val; -} - template struct test_random_functor { using rnd_type = typename GeneratorPool::generator_type; @@ -605,11 +595,6 @@ void test_async_initialization(Args... args) { } // namespace AlgoRandomImpl TEST(TEST_CATEGORY, Random_XorShift64) { - // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler -#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; -#endif - using ExecutionSpace = TEST_EXECSPACE; #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ @@ -630,10 +615,6 @@ TEST(TEST_CATEGORY, Random_XorShift64) { TEST(TEST_CATEGORY, Random_XorShift1024_0) { using ExecutionSpace = TEST_EXECSPACE; - // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler -#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; -#endif #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ defined(KOKKOS_ENABLE_HIP) @@ -653,13 +634,6 @@ TEST(TEST_CATEGORY, Random_XorShift1024_0) { TEST(TEST_CATEGORY, Multi_streams) { using ExecutionSpace = TEST_EXECSPACE; -#ifdef KOKKOS_ENABLE_OPENMPTARGET - if constexpr (std::is_same_v) { - GTEST_SKIP() << "Libomptarget error"; // FIXME_OPENMPTARGET - } -#endif - #if defined(KOKKOS_ENABLE_SYCL) && 
defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) if constexpr (std::is_same_v) { GTEST_SKIP() << "Failing on NVIDIA GPUs"; // FIXME_SYCL diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index 14bf0b5cd4b..4d39fa7fccb 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -207,29 +207,18 @@ void test_sort_integer_overflow() { } // namespace SortImpl TEST(TEST_CATEGORY, SortUnsignedValueType) { - // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler -#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; -#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; SortImpl::test_1D_sort_impl(N * N * N); -#ifndef KOKKOS_ENABLE_OPENMPTARGET - // FIXME_OPENMPTARGET: OpenMPTarget doesn't support DynamicView yet. SortImpl::test_dynamic_view_sort_impl(N * N); -#endif SortImpl::test_issue_4978_impl(); } TEST(TEST_CATEGORY, SortEmptyView) { - // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler -#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; -#endif using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else diff --git a/lib/kokkos/algorithms/unit_tests/TestSortCustomComp.hpp b/lib/kokkos/algorithms/unit_tests/TestSortCustomComp.hpp index 0c7380632c4..16cc1c03125 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSortCustomComp.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSortCustomComp.hpp @@ -143,7 +143,7 @@ TEST(TEST_CATEGORY, SortWithCustomComparator) { } // namespace SortWithComp } // namespace anonym - + #undef KOKKOS_IMPL_ONEDPL_VERSION #undef KOKKOS_IMPL_ONEDPL_VERSION_GREATER_EQUAL diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp 
b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 8df849acc9f..63c64513af2 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -193,10 +193,6 @@ auto create_deep_copyable_compatible_view_with_same_extent(ViewType view) { // this is needed for intel to avoid // error #1011: missing return statement at end of non-void function -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } template @@ -440,9 +436,6 @@ struct CustomValueType { KOKKOS_INLINE_FUNCTION CustomValueType(value_type val) : value(val) {} - KOKKOS_INLINE_FUNCTION - CustomValueType(const CustomValueType& other) { this->value = other.value; } - KOKKOS_INLINE_FUNCTION explicit operator value_type() const { return value; } @@ -458,12 +451,6 @@ struct CustomValueType { return *this; } - KOKKOS_INLINE_FUNCTION - CustomValueType& operator=(const CustomValueType& other) { - this->value = other.value; - return *this; - } - KOKKOS_INLINE_FUNCTION CustomValueType operator+(const CustomValueType& other) const { CustomValueType result; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCompileOnly.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCompileOnly.cpp index c15aae7bc1c..bf3b4b6b28c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCompileOnly.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCompileOnly.cpp @@ -387,18 +387,14 @@ struct TestStruct { TEST_ALGO_MACRO_B1E1(is_sorted_until); TEST_ALGO_MACRO_V1(is_sorted_until); -#ifndef KOKKOS_ENABLE_OPENMPTARGET TEST_ALGO_MACRO_B1E1_VARIAD(is_sorted_until, TrivialComparator()); TEST_ALGO_MACRO_V1_VARIAD(is_sorted_until, TrivialComparator()); -#endif TEST_ALGO_MACRO_B1E1(is_sorted); TEST_ALGO_MACRO_V1(is_sorted); -#ifndef KOKKOS_ENABLE_OPENMPTARGET TEST_ALGO_MACRO_B1E1_VARIAD(is_sorted, TrivialComparator()); 
TEST_ALGO_MACRO_V1_VARIAD(is_sorted, TrivialComparator()); -#endif } void minmax_ops() { @@ -409,14 +405,12 @@ struct TestStruct { TEST_ALGO_MACRO_B1E1(minmax_element); TEST_ALGO_MACRO_V1(minmax_element); -#ifndef KOKKOS_ENABLE_OPENMPTARGET TEST_ALGO_MACRO_B1E1_VARIAD(min_element, TrivialComparator()); TEST_ALGO_MACRO_V1_VARIAD(min_element, TrivialComparator()); TEST_ALGO_MACRO_B1E1_VARIAD(max_element, TrivialComparator()); TEST_ALGO_MACRO_V1_VARIAD(max_element, TrivialComparator()); TEST_ALGO_MACRO_B1E1_VARIAD(minmax_element, TrivialComparator()); TEST_ALGO_MACRO_V1_VARIAD(minmax_element, TrivialComparator()); -#endif } void partitionig_ops() { @@ -439,7 +433,6 @@ struct TestStruct { TEST_ALGO_MACRO_B1E1B2_VARIAD(exclusive_scan, T{}); TEST_ALGO_MACRO_V1V2_VARIAD(exclusive_scan, T{}); -#ifndef KOKKOS_ENABLE_OPENMPTARGET TEST_ALGO_MACRO_B1E1B2_VARIAD(exclusive_scan, T{}, TrivialBinaryFunctor()); TEST_ALGO_MACRO_V1V2_VARIAD(exclusive_scan, T{}, TrivialBinaryFunctor()); @@ -450,11 +443,9 @@ struct TestStruct { TEST_ALGO_MACRO_V1V2_VARIAD(transform_exclusive_scan, T{}, TrivialBinaryFunctor(), TrivialUnaryFunctor()); -#endif TEST_ALGO_MACRO_B1E1B2(inclusive_scan); TEST_ALGO_MACRO_V1V2(inclusive_scan); -#ifndef KOKKOS_ENABLE_OPENMPTARGET TEST_ALGO_MACRO_B1E1B2_VARIAD(inclusive_scan, TrivialBinaryFunctor()); TEST_ALGO_MACRO_V1V2_VARIAD(inclusive_scan, TrivialBinaryFunctor()); TEST_ALGO_MACRO_B1E1B2_VARIAD(inclusive_scan, TrivialBinaryFunctor(), @@ -473,9 +464,7 @@ struct TestStruct { TEST_ALGO_MACRO_V1V2_VARIAD(transform_inclusive_scan, TrivialBinaryFunctor(), TrivialUnaryFunctor(), T{}); -#endif -#ifndef KOKKOS_ENABLE_OPENMPTARGET TEST_ALGO_MACRO_B1E1(reduce); TEST_ALGO_MACRO_V1(reduce); TEST_ALGO_MACRO_B1E1_VARIAD(reduce, T{}); @@ -498,7 +487,6 @@ struct TestStruct { TEST_ALGO_MACRO_V1_VARIAD(transform_reduce, T{}, TrivialReduceJoinFunctor(), TrivialTransformReduceUnaryTransformer()); -#endif } }; diff --git 
a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index f339f1e789a..eae7b19086e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -292,7 +292,6 @@ void run_exclusive_scan_all_scenarios() { run_single_scenario_inplace(it, ValueType{0}); run_single_scenario_inplace(it, ValueType{-2}); -#if !defined KOKKOS_ENABLE_OPENMPTARGET // custom multiply op is only run for small views otherwise it overflows if (it.first == "small-a" || it.first == "small-b") { using custom_bop_t = MultiplyFunctor; @@ -317,7 +316,6 @@ void run_exclusive_scan_all_scenarios() { custom_bop_t()); run_single_scenario_inplace(it, ValueType{-2}, custom_bop_t()); -#endif } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index df3dcef39ed..e982e09eaae 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -285,7 +285,6 @@ void run_inclusive_scan_all_scenarios() { run_single_scenario(it); run_single_scenario_inplace(it); -#if !defined KOKKOS_ENABLE_OPENMPTARGET // the sum custom op is always run using sum_binary_op = SumFunctor; sum_binary_op sbop; @@ -312,7 +311,6 @@ void run_inclusive_scan_all_scenarios() { run_single_scenario_inplace(it, mbop, ValueType{0}); run_single_scenario_inplace(it, mbop, ValueType{-2}); } -#endif } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index 13fa517b15e..9bf30e8073c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -138,7 +138,6 @@ void run_single_scenario(const InfoType& 
scenario_info) { [=](bool v) { return v == gold; }); EXPECT_TRUE(allA) << name << ", " << view_tag_to_string(Tag{}); -#if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; std::vector resultsB(4); resultsB[0] = @@ -150,7 +149,6 @@ void run_single_scenario(const InfoType& scenario_info) { const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), [=](bool v) { return v == gold; }); EXPECT_TRUE(allB) << name << ", " << view_tag_to_string(Tag{}); -#endif Kokkos::fence(); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index 51d48ca2600..d46d54b23d1 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -140,7 +140,6 @@ void run_single_scenario(const InfoType& scenario_info) { ASSERT_EQ(r3, gold) << name << ", " << view_tag_to_string(Tag{}); ASSERT_EQ(r4, gold) << name << ", " << view_tag_to_string(Tag{}); -#if !defined KOKKOS_ENABLE_OPENMPTARGET CustomLessThanComparator comp; [[maybe_unused]] auto r5 = KE::is_sorted_until(exespace(), KE::cbegin(view), KE::cend(view), comp); @@ -149,7 +148,6 @@ void run_single_scenario(const InfoType& scenario_info) { [[maybe_unused]] auto r7 = KE::is_sorted_until(exespace(), view, comp); [[maybe_unused]] auto r8 = KE::is_sorted_until("label", exespace(), view, comp); -#endif ASSERT_EQ(r1, gold) << name << ", " << view_tag_to_string(Tag{}); ASSERT_EQ(r2, gold) << name << ", " << view_tag_to_string(Tag{}); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp index f387e096e16..e5c56c6cf71 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsLexicographicalCompare.cpp @@ -127,12 +127,9 @@ void 
run_all_scenarios() { } TEST(std_algorithms_lexicographical_compare_test, test) { -// FIXME: should this disable only custom comparator tests? -#if !defined KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); -#endif } } // namespace LexicographicalCompare diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp index 14433e9d97e..0af6662eed9 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMinMaxElementOps.cpp @@ -297,7 +297,6 @@ void std_algorithms_min_max_element_test::test_minmax_element_non_trivial_data( } } -#if !defined KOKKOS_ENABLE_OPENMPTARGET template void std_algorithms_min_max_element_test:: test_max_element_non_trivial_data_custom_comp(ViewType view) { @@ -378,7 +377,6 @@ void std_algorithms_min_max_element_test:: } } } -#endif // trivial case TEST_F(std_algorithms_min_max_element_test, min_element_empty_range) { @@ -406,7 +404,6 @@ TEST_F(std_algorithms_min_max_element_test, max_element_non_trivial_data) { test_max_element_non_trivial_data(m_strided_view); } -#if !defined KOKKOS_ENABLE_OPENMPTARGET // non-trivial data, custom comp TEST_F(std_algorithms_min_max_element_test, min_element_non_trivial_data_custom_comp) { @@ -421,9 +418,7 @@ TEST_F(std_algorithms_min_max_element_test, test_max_element_non_trivial_data_custom_comp(m_dynamic_view); test_max_element_non_trivial_data_custom_comp(m_strided_view); } -#endif -#if defined(KOKKOS_ENABLE_OPENMPTARGET) TEST_F(std_algorithms_min_max_element_test, minmax_element_empty_range) { test_minmax_element_empty_range(m_static_view); test_minmax_element_empty_range(m_dynamic_view); @@ -435,17 +430,13 @@ TEST_F(std_algorithms_min_max_element_test, minmax_element_non_trivial_data) { test_minmax_element_non_trivial_data(m_dynamic_view); test_minmax_element_non_trivial_data(m_strided_view); 
} -#endif -#if !defined KOKKOS_ENABLE_OPENMPTARGET -// OpenMPTarget does not yet support custom comparator TEST_F(std_algorithms_min_max_element_test, minmax_element_non_trivial_data_custom_comp) { test_minmax_element_non_trivial_data_custom_comp(m_static_view); test_minmax_element_non_trivial_data_custom_comp(m_dynamic_view); test_minmax_element_non_trivial_data_custom_comp(m_strided_view); } -#endif } // namespace stdalgos } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index f667902dfc2..3cb5827dc36 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -16,6 +16,9 @@ struct MyMovableType { int m_value = 11; MyMovableType() = default; + + MyMovableType(const MyMovableType&) = delete; + MyMovableType(MyMovableType&& other) { if (this != &other) { m_value = other.m_value; @@ -23,6 +26,8 @@ struct MyMovableType { } } + MyMovableType& operator=(const MyMovableType&) = delete; + MyMovableType& operator=(MyMovableType&& other) { if (this != &other) { m_value = other.m_value; @@ -30,6 +35,8 @@ struct MyMovableType { } return *this; } + + ~MyMovableType() = default; }; TEST(std_algorithms_mod_ops_test, move) { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp index 494f9d5d4b0..9c7b343d76f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsNumerics.cpp @@ -117,8 +117,6 @@ struct std_algorithms_numerics_test : public ::testing::Test { } }; -#if !defined KOKKOS_ENABLE_OPENMPTARGET - // ------------------------------------------------------------------- // test default case of transform_reduce // @@ -583,7 +581,5 @@ TEST_F(std_algorithms_numerics_test, joiner_type()); } -#endif // not defined 
KOKKOS_ENABLE_OPENMPTARGET - } // namespace stdalgos } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index 18212af6de3..8ae53df86b2 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -72,8 +72,6 @@ struct TestFunctorA { break; } -#ifndef KOKKOS_ENABLE_OPENMPTARGET - case 2: { auto it = KE::exclusive_scan( member, KE::cbegin(rowViewSrc), KE::cend(rowViewSrc), @@ -94,7 +92,6 @@ struct TestFunctorA { break; } -#endif } // store result of checking if all members have their local @@ -193,7 +190,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { break; } -#ifndef KOKKOS_ENABLE_OPENMPTARGET case 2: case 3: { auto it = std::exclusive_scan(KE::cbegin(rowFrom), KE::cend(rowFrom), @@ -203,7 +199,6 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { break; } -#endif default: Kokkos::abort("unreachable"); } } @@ -221,11 +216,7 @@ template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 8153}) { -#ifndef KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { -#else - for (int apiId : {0, 1}) { -#endif test_A(numTeams, numCols, apiId); } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp index d1a199443e4..36b76b83fd4 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamInclusiveScan.cpp @@ -258,10 +258,6 @@ void run_all_scenarios() { } TEST(std_algorithms_inclusive_scan_team_test, test) { -// FIXME_OPENMPTARGET -#if defined(KOKKOS_ENABLE_OPENMPTARGET) - GTEST_SKIP() << "the test is known to fail with OpenMPTarget"; -#endif 
run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp index fe19b5605ee..b7a1e415929 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp @@ -38,9 +38,7 @@ struct TestFunctorA { result = KE::is_sorted(member, myRowView); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); - } -#ifndef KOKKOS_ENABLE_OPENMPTARGET - else if (m_apiPick == 2) { + } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, KE::cbegin(myRowView), KE::cend(myRowView), CustomLessThanComparator{}); @@ -53,7 +51,6 @@ struct TestFunctorA { Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } -#endif // store result of checking if all members have their local // values matching the one stored in m_distancesView @@ -166,11 +163,7 @@ template void run_all_scenarios(bool makeDataSortedOnPurpose) { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5153}) { -#ifndef KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { -#else - for (int apiId : {0, 1}) { -#endif test_A(numTeams, numCols, apiId, makeDataSortedOnPurpose); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp index 8956ef440dd..cbecaa57cee 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp @@ -60,9 +60,7 @@ struct TestFunctorA { Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; }); - } -#ifndef KOKKOS_ENABLE_OPENMPTARGET - else if (m_apiPick 
== 2) { + } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), KE::cend(myRowView), @@ -82,7 +80,6 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#endif // store result of checking if all members have their local // values matching the one stored in m_distancesView @@ -214,11 +211,7 @@ template void run_all_scenarios(const std::string& name, const std::vector& cols) { for (int numTeams : teamSizesToTest) { for (const auto& numCols : cols) { -#ifndef KOKKOS_ENABLE_OPENMPTARGET for (int apiId : {0, 1, 2, 3}) { -#else - for (int apiId : {0, 1}) { -#endif test_A(numTeams, numCols, apiId, name); } } @@ -243,10 +236,6 @@ TEST(std_algorithms_is_sorted_until_team_test, test_trivialB) { } TEST(std_algorithms_is_sorted_until_team_test, test_nontrivialA) { -#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET Failing with clang 17 - GTEST_SKIP() << "Known to fail with OpenMPTarget and clang 17"; -#endif - const std::string name = "nontrivialUntilLast"; const std::vector cols = {13, 101, 1444, 5153}; run_all_scenarios(name, cols); @@ -255,10 +244,6 @@ TEST(std_algorithms_is_sorted_until_team_test, test_nontrivialA) { } TEST(std_algorithms_is_sorted_until_team_test, test_nontrivialB) { -#ifdef KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET Failing with clang 17 - GTEST_SKIP() << "Known to fail with OpenMPTarget and clang 17"; -#endif - const std::string name = "nontrivialRandom"; const std::vector cols = {13, 101, 1444, 5153}; run_all_scenarios(name, cols); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp index c93b57fb10a..c2cf5b70635 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp @@ -45,9 +45,7 @@ struct TestFunctorA { 
Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; }); - } -#ifndef KOKKOS_ENABLE_OPENMPTARGET - else if (m_apiPick == 2) { + } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::max_element(member, KE::cbegin(myRowView), KE::cend(myRowView), @@ -67,7 +65,6 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#endif // store result of checking if all members have their local // values matching the one stored in m_distancesView @@ -147,8 +144,6 @@ template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5113}) { - // for OpenMPTarget we need to avod api accepting a custom - // comparator because it is not supported for (int apiId : {0, 1, 2, 3}) { test_A(numTeams, numCols, apiId); } @@ -157,11 +152,9 @@ void run_all_scenarios() { } TEST(std_algorithms_max_element_team_test, test) { -#ifndef KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); -#endif } } // namespace TeamMaxElement diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp index d56b0c5f96b..6a9b9256d9f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp @@ -45,9 +45,7 @@ struct TestFunctorA { Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; }); - } -#ifndef KOKKOS_ENABLE_OPENMPTARGET - else if (m_apiPick == 2) { + } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::min_element(member, KE::cbegin(myRowView), KE::cend(myRowView), @@ -67,7 +65,6 @@ struct TestFunctorA { m_distancesView(myRowIndex) = resultDist; }); } -#endif // store result of checking if all members have their local // values matching the 
one stored in m_distancesView @@ -146,8 +143,6 @@ template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5113}) { - // for OpenMPTarget we need to avod api accepting a custom - // comparator because it is not supported for (int apiId : {0, 1, 2, 3}) { test_A(numTeams, numCols, apiId); } @@ -156,11 +151,9 @@ void run_all_scenarios() { } TEST(std_algorithms_min_element_team_test, test) { -#ifndef KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); -#endif } } // namespace TeamMinElement diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp index f7133f1ed24..c747fc758cf 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp @@ -52,9 +52,7 @@ struct TestFunctorA { m_distancesView(myRowIndex, 0) = resultDist1; m_distancesView(myRowIndex, 1) = resultDist2; }); - } -#ifndef KOKKOS_ENABLE_OPENMPTARGET - else if (m_apiPick == 2) { + } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto itPair = KE::minmax_element(member, KE::cbegin(myRowView), KE::cend(myRowView), @@ -80,7 +78,6 @@ struct TestFunctorA { m_distancesView(myRowIndex, 1) = resultDist2; }); } -#endif // store result of checking if all members have their local // values matching the one stored in m_distancesView @@ -165,8 +162,6 @@ template void run_all_scenarios() { for (int numTeams : teamSizesToTest) { for (const auto& numCols : {0, 1, 2, 13, 101, 1444, 5113}) { - // for OpenMPTarget we need to avod api accepting a custom - // comparator because it is not supported for (int apiId : {0, 1, 2, 3}) { test_A(numTeams, numCols, apiId); } @@ -175,11 +170,9 @@ void run_all_scenarios() { } TEST(std_algorithms_minmax_element_team_test, test) { -#ifndef 
KOKKOS_ENABLE_OPENMPTARGET run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); -#endif } } // namespace TeamMinMaxElement diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp index 553aefbae7b..e2b1da60fc9 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReduce.cpp @@ -3,8 +3,6 @@ #include -#ifndef KOKKOS_ENABLE_OPENMPTARGET - namespace Test { namespace stdalgos { namespace TeamReduce { @@ -243,5 +241,3 @@ TEST(std_algorithms_reduce_team_test, test) { } // namespace TeamReduce } // namespace stdalgos } // namespace Test - -#endif diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp index b7c0b7da24e..cae1a3e5aa7 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformExclusiveScan.cpp @@ -3,8 +3,6 @@ #include -#ifndef KOKKOS_ENABLE_OPENMPTARGET - namespace Test { namespace stdalgos { namespace TeamTransformExclusiveScan { @@ -219,5 +217,3 @@ TEST(std_algorithms_transform_exclusive_scan_team_test, test) { } // namespace TeamTransformExclusiveScan } // namespace stdalgos } // namespace Test - -#endif diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index efbe0af81e5..0792e0dd737 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -3,8 +3,6 @@ #include -#ifndef KOKKOS_ENABLE_OPENMPTARGET - namespace Test { namespace stdalgos { namespace TeamTransformInclusiveScan { @@ -256,5 +254,3 @@ 
TEST(std_algorithms_transform_inclusive_scan_team_test, test) { } // namespace TeamTransformInclusiveScan } // namespace stdalgos } // namespace Test - -#endif diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp index 359bdea3bde..7829d280607 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformReduce.cpp @@ -3,8 +3,6 @@ #include -#ifndef KOKKOS_ENABLE_OPENMPTARGET - namespace Test { namespace stdalgos { namespace TeamTransformReduce { @@ -295,5 +293,3 @@ TEST(std_algorithms_transform_reduce_team_test, test) { } // namespace TeamTransformReduce } // namespace stdalgos } // namespace Test - -#endif diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index be3dab55fe1..a9913491764 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -299,14 +299,12 @@ void run_all_scenarios() { } } -#if !defined KOKKOS_ENABLE_OPENMPTARGET TEST(std_algorithms_numeric_ops_test, transform_exclusive_scan) { run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); run_all_scenarios(); } -#endif template struct MultiplyFunctor { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index 1e4925e4259..b47af380253 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -298,7 +298,6 @@ void run_all_scenarios() { } } -#if !defined KOKKOS_ENABLE_OPENMPTARGET // FIXME_OPENMPTARGET 
TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan) { run_all_scenarios(); run_all_scenarios(); @@ -373,7 +372,6 @@ TEST(std_algorithms_numeric_ops_test, transform_inclusive_scan_functor) { test_lambda(functor); } } -#endif } // namespace TransformIncScan } // namespace stdalgos diff --git a/lib/kokkos/benchmarks/.clang-tidy b/lib/kokkos/benchmarks/.clang-tidy new file mode 100644 index 00000000000..4175cb3a4e2 --- /dev/null +++ b/lib/kokkos/benchmarks/.clang-tidy @@ -0,0 +1,4 @@ +Checks: > + -bugprone-exception-escape + +InheritParentConfig: true diff --git a/lib/kokkos/benchmarks/CMakeLists.txt b/lib/kokkos/benchmarks/CMakeLists.txt index 968c8ae3bf5..4b49d55a217 100644 --- a/lib/kokkos/benchmarks/CMakeLists.txt +++ b/lib/kokkos/benchmarks/CMakeLists.txt @@ -1,12 +1,8 @@ -#FIXME_OPENMPTARGET - compiling in debug mode causes ICE. kokkos_add_benchmark_directories(atomic) kokkos_add_benchmark_directories(gather) kokkos_add_benchmark_directories(gups) kokkos_add_benchmark_directories(launch_latency) kokkos_add_benchmark_directories(stream) kokkos_add_benchmark_directories(view_copy_constructor) -#FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. 
-if(NOT Kokkos_ENABLE_OPENMPTARGET) - kokkos_add_benchmark_directories(policy_performance) - kokkos_add_benchmark_directories(bytes_and_flops) -endif() +kokkos_add_benchmark_directories(policy_performance) +kokkos_add_benchmark_directories(bytes_and_flops) diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash index 0b885293e27..1cc5810e20e 100755 --- a/lib/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash @@ -80,5 +80,3 @@ cd build/miniFE make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH} \ CXXFLAGS=${OPT_FLAG} -f ${MINIFE_PATH}/src/Makefile -j 16 cd ../../ - - diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash index 9b52a36d89a..35ed7d8099b 100755 --- a/lib/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash @@ -32,6 +32,3 @@ fi cd miniFE git pull cd .. 
- - - diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash index 6afa05f5fcf..2abb9b9cfa4 100755 --- a/lib/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash @@ -11,4 +11,4 @@ ${SCRIPT_PATH}/checkout_repos.bash ${SCRIPT_PATH}/build_code.bash --arch=${KOKKOS_ARCH} --device-list=${KOKKOS_DEVICES} --compiler=${COMPILER} ${SCRIPT_PATH}/run_tests.bash -fi \ No newline at end of file +fi diff --git a/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh b/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh index 1c2db56648c..dccdfdf62eb 100755 --- a/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh +++ b/lib/kokkos/benchmarks/policy_performance/script_sample_usage.sh @@ -1,9 +1,9 @@ #!/bin/bash -# Sample script for benchmarking policy performance +# Sample script for benchmarking policy performance # Suggested environment variables to export prior to executing script: -# KNL: +# KNL: # OMP_NUM_THREADS=256 KMP_AFFINITY=compact # Power: # OMP_NUM_THREADS=64 OMP_PROC_BIND=true @@ -27,7 +27,7 @@ # Tier 7: 'outer' parallel_for with TeamPolicy (nested parallelism) 1XY # Tier 8: 'outer' parallel_reduce with TeamPolicy (nested parallelism) 2XY -# Results grouped by: +# Results grouped by: # 0) SCHEDULE 1) CODE (test) 2) TEAMRANGE 3) TEAMSIZE 4) THREADRANGE EXECUTABLE=policy_performance diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 32ae30f94c8..ffb6fd8293e 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -150,7 +150,15 @@ do *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu) cpp_files="$cpp_files $1" ;; - # Ensure we only have one optimization flag because NVCC doesn't allow multiple + #fast-compile level for device code, tradeoff between compilation speed and runtime performance + -Ofc=*|--Ofast-compile=*) + cuda_args="$cuda_args 
$1" + ;; + -Ofc|--Ofast-compile) + cuda_args="$cuda_args $1 $2" + shift + ;; + # Ensure we only have one optimization flag because NVCC doesn't allow multiple -O*) if [ -n "$optimization_flag" ]; then if [ "$1" = "$optimization_flag" ]; then @@ -236,10 +244,10 @@ do cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument - -maxrregcount=*|--maxrregcount=*|-time=*|-Xptxas=*) + -maxrregcount=*|--maxrregcount=*|-time=*|-Xptxas=*|--fdevice-time-trace=*|-fdevice-time-trace=*) cuda_args="$cuda_args $1" ;; - -maxrregcount|--default-stream|-Xnvlink|--ftz|--prec-div|--prec-sqrt|--fmad|-cudart|--cudart|-include|-time|-Xptxas) + -maxrregcount|--default-stream|-Xnvlink|--ftz|--prec-div|--prec-sqrt|--fmad|-cudart|--cudart|-include|-time|-Xptxas|--fdevice-time-trace|-fdevice-time-trace) cuda_args="$cuda_args $1 $2" shift ;; @@ -315,11 +323,11 @@ do ;; # End of Werror handling #Handle unsupported standard flags - --std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a) - fallback_std_flag="-std=c++17" + --std=c++1y|-std=c++1y|--std=gnu++1y|-std=gnu++1y|--std=c++1z|-std=c++1z|--std=gnu++1z|-std=gnu++1z|--std=c++2a|-std=c++2a|--std=c++2b|-std=c++2b|--std=c++2c|-std=c++2c) + fallback_std_flag="-std=c++20" # this is hopefully just occurring in a downstream project during CMake feature tests # we really have no choice here but to accept the flag and change to an accepted C++ standard - echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++17 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration." + echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++20 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. 
This should only be occurring during CMake configuration." if [ -n "$std_flag" ]; then warn_std_flag shared_args=${shared_args/ $std_flag/} @@ -337,25 +345,7 @@ do std_flag=$corrected_std_flag shared_args="$shared_args $std_flag" ;; - --std=c++20|-std=c++20) - if [ -n "$std_flag" ]; then - warn_std_flag - shared_args=${shared_args/ $std_flag/} - fi - # NVCC only has C++20 from version 12 on - cuda_main_version=$([[ $(${nvcc_compiler} --version) =~ V([0-9]+) ]] && echo ${BASH_REMATCH[1]}) - if [ ${cuda_main_version} -lt 12 ]; then - fallback_std_flag="-std=c++17" - # this is hopefully just occurring in a downstream project during CMake feature tests - # we really have no choice here but to accept the flag and change to an accepted C++ standard - echo "nvcc_wrapper does not accept standard flags $1 since partial standard flags and standards after C++17 are not supported. nvcc_wrapper will use $fallback_std_flag instead. It is undefined behavior to use this flag. This should only be occurring during CMake configuration." 
- std_flag=$fallback_std_flag - else - std_flag=$1 - fi - shared_args="$shared_args $std_flag" - ;; - --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++17|-std=c++17) + --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++17|-std=c++17|--std=c++20|-std=c++20) if [ -n "$std_flag" ]; then warn_std_flag shared_args=${shared_args/ $std_flag/} @@ -365,7 +355,7 @@ do ;; #convert PGI standard flags to something nvcc can handle - --c++11|--c++14|--c++17) + --c++11|--c++14|--c++17|--c++20) if [ -n "$std_flag" ]; then warn_std_flag shared_args=${shared_args/ $std_flag/} diff --git a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in index 527b0c579c6..a384ecc6188 100644 --- a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in +++ b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in @@ -31,6 +31,21 @@ endif() if(Kokkos_ENABLE_HIP) set(Kokkos_HIP_ARCHITECTURES @KOKKOS_HIP_ARCHITECTURES@) + # check if the GPU_TARGETS are the same as the arch in Kokkos + foreach(arch IN LISTS GPU_TARGETS) + if(NOT (arch STREQUAL @KOKKOS_HIP_ARCHITECTURES@)) + if(@KOKKOS_ENABLE_DEPRECATED_CODE_5@) + set(MESSAGE_TYPE WARNING) + else() + set(MESSAGE_TYPE FATAL_ERROR) + endif() + message( + ${MESSAGE_TYPE} + "AMD GPU architectures given via GPU_TARGETS=\"${GPU_TARGETS}\" are not compatible with the architecture enabled in Kokkos which is @KOKKOS_HIP_ARCHITECTURES@. Kokkos allows only one device architecture to be active. To resolve this, configure with -DGPU_TARGETS=\"@KOKKOS_HIP_ARCHITECTURES@\" to prevent it from being set implicitly by find_package calls." 
+ ) + break() + endif() + endforeach() endif() if(NOT Kokkos_FIND_QUIETLY) diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 852292f5bbe..49c9f85977a 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -22,7 +22,6 @@ #cmakedefine KOKKOS_ENABLE_SERIAL #cmakedefine KOKKOS_ENABLE_OPENMP #cmakedefine KOKKOS_ENABLE_OPENACC -#cmakedefine KOKKOS_ENABLE_OPENMPTARGET #cmakedefine KOKKOS_ENABLE_THREADS #cmakedefine KOKKOS_ENABLE_CUDA #cmakedefine KOKKOS_ENABLE_HIP @@ -35,8 +34,6 @@ #cmakedefine KOKKOS_ENABLE_CXX26 #cmakedefine KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE -#cmakedefine KOKKOS_ENABLE_CUDA_UVM // deprecated -#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY @@ -66,6 +63,7 @@ #cmakedefine KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND #cmakedefine KOKKOS_ENABLE_ATOMICS_BYPASS #cmakedefine KOKKOS_ENABLE_IMPL_CHECK_POSSIBLY_BREAKING_LAYOUTS +#cmakedefine KOKKOS_ENABLE_BENCHMARKS_HEAVY /* TPL Settings */ #cmakedefine KOKKOS_ENABLE_HWLOC @@ -133,7 +131,9 @@ #cmakedefine KOKKOS_ARCH_HOPPER90 #cmakedefine KOKKOS_ARCH_BLACKWELL #cmakedefine KOKKOS_ARCH_BLACKWELL100 +#cmakedefine KOKKOS_ARCH_BLACKWELL103 #cmakedefine KOKKOS_ARCH_BLACKWELL120 +#cmakedefine KOKKOS_ARCH_BLACKWELL121 #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 #cmakedefine KOKKOS_ARCH_AMD_ZEN3 @@ -145,6 +145,7 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 #cmakedefine KOKKOS_ARCH_AMD_GFX942_APU +#cmakedefine KOKKOS_ARCH_AMD_GFX950 #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 #cmakedefine KOKKOS_ARCH_AMD_GFX1103 diff --git a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake index 3a69358bebb..c273ebe0b35 100644 --- 
a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -1,16 +1,4 @@ -include(FindPackageHandleStandardArgs) +find_package(hip REQUIRED PATHS ${ROCM_PATH} $ENV{ROCM_PATH}) -find_library(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) -find_library(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) - -find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY) - -kokkos_create_imported_tpl( - ROCM - INTERFACE - LINK_LIBRARIES - ${HSA_RUNTIME_LIBRARY} - ${AMD_HIP_LIBRARY} - COMPILE_DEFINITIONS - __HIP_ROCclr__ -) +kokkos_create_imported_tpl(ROCM INTERFACE LINK_LIBRARIES hip::device) +kokkos_export_cmake_tpl(hip REQUIRED) diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc index daa5228b264..8f05ab5b38a 100644 --- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc +++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc @@ -33,7 +33,9 @@ int main() { case 89: std::cout << "Set -DKokkos_ARCH_ADA89=ON ." << std::endl; break; case 90: std::cout << "Set -DKokkos_ARCH_HOPPER90=ON ." << std::endl; break; case 100: std::cout << "Set -DKokkos_ARCH_BLACKWELL100=ON ." << std::endl; break; + case 103: std::cout << "Set -DKokkos_ARCH_BLACKWELL103=ON ." << std::endl; break; case 120: std::cout << "Set -DKokkos_ARCH_BLACKWELL120=ON ." << std::endl; break; + case 121: std::cout << "Set -DKokkos_ARCH_BLACKWELL121=ON ." 
<< std::endl; break; default: std::cout << "Compute capability " << compute_capability << " is not supported" << std::endl; diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index afca80b6030..22f9c545da2 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -27,6 +27,7 @@ kokkos_check_deprecated_options( set(KOKKOS_ARCH_LIST) include(CheckCXXCompilerFlag) +include(CheckSourceCompiles) kokkos_deprecated_list(ARCH ARCH) @@ -75,11 +76,7 @@ declare_and_check_host_arch(RISCV_SG2042 "SG2042 (RISC-V) CPUs") declare_and_check_host_arch(RISCV_RVA22V "RVA22V (RISC-V) CPUs") declare_and_check_host_arch(RISCV_U74MC "U74MC (RISC-V) CPUs") -if(Kokkos_ENABLE_CUDA - OR Kokkos_ENABLE_OPENMPTARGET - OR Kokkos_ENABLE_OPENACC - OR Kokkos_ENABLE_SYCL -) +if(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) set(KOKKOS_SHOW_CUDA_ARCHS ON) endif() @@ -97,20 +94,18 @@ kokkos_arch_option(AMPERE87 GPU "NVIDIA Ampere generation CC 8.7" "KOKKOS_SHOW_C kokkos_arch_option(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") kokkos_arch_option(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") kokkos_arch_option(BLACKWELL100 GPU "NVIDIA Blackwell generation CC 10.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(BLACKWELL103 GPU "NVIDIA Blackwell generation CC 10.3" "KOKKOS_SHOW_CUDA_ARCHS") kokkos_arch_option(BLACKWELL120 GPU "NVIDIA Blackwell generation CC 12.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(BLACKWELL121 GPU "NVIDIA Blackwell generation CC 12.1" "KOKKOS_SHOW_CUDA_ARCHS") -if(Kokkos_ENABLE_HIP - OR Kokkos_ENABLE_OPENMPTARGET - OR Kokkos_ENABLE_OPENACC - OR Kokkos_ENABLE_SYCL -) +if(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) set(KOKKOS_SHOW_HIP_ARCHS ON) endif() # AMD archs ordered in decreasing priority of autodetection -list(APPEND SUPPORTED_AMD_GPUS MI300 MI300A MI300) -list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX942_APU 
AMD_GFX940) -list(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx942 gfx940) +list(APPEND SUPPORTED_AMD_GPUS MI300 MI300A MI300 MI350) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX942_APU AMD_GFX940 AMD_GFX950) +list(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx942 gfx940 gfx950) list(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) list(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) list(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) @@ -245,9 +240,6 @@ if(KOKKOS_ENABLE_HIP) global_append(KOKKOS_AMDGPU_OPTIONS -xhip) set(AMDGPU_ARCH_FLAG "--offload-arch") if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - if(NOT CMAKE_CXX_STANDARD) - message(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 20 or higher") - endif() if(DEFINED ENV{ROCM_PATH}) global_append(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) endif() @@ -273,6 +265,79 @@ if(KOKKOS_ARCH_NATIVE) compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT ${KOKKOS_NATIVE_FLAGS}) endif() +#------------------------------- KOKKOS NEON and SVE detection --------------------------- +function(kokkos_use_neon_if_compiler_allows_it) + cmake_parse_arguments(ARG "" "" "COMPILER_FLAGS" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "'kokkos_use_neon_if_compiler_allows_it' has unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(ARG_COMPILER_FLAGS) + set(CMAKE_REQUIRED_FLAGS ${ARG_COMPILER_FLAGS}) + endif() + + unset(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) + #FIXME_Kokkos_launch_compiler + get_property(kokkos_global_rule_compile GLOBAL PROPERTY RULE_LAUNCH_COMPILE) + if("${kokkos_global_rule_compile}" MATCHES "kokkos_launch_compiler") + message(WARNING "The use of 'kokkos_launch_compiler' prevents reliable NEON detection. Disabling NEON.\n" + "You can force the use of NEON by using the Kokkos_ARCH_* flag specific to your target " + "processor instead of Kokkos_ARCH_NATIVE." 
+ ) + else() + check_source_compiles( + ${KOKKOS_COMPILE_LANGUAGE} + " + #include + int main() { + float32x2_t a; + a = vadd_f32(a, a); + } + " + KOKKOS_COMPILER_HAS_ARM_NEON + ) + endif() +endfunction() + +function(kokkos_use_sve_if_compiler_allows_it) + cmake_parse_arguments(ARG "" "" "COMPILER_FLAGS" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(FATAL_ERROR "'kokkos_use_sve_if_compiler_allows_it' has unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(ARG_COMPILER_FLAGS) + set(CMAKE_REQUIRED_FLAGS ${ARG_COMPILER_FLAGS}) + endif() + + unset(KOKKOS_COMPILER_HAS_ARM_SVE CACHE) + #FIXME_Kokkos_launch_compiler + get_property(kokkos_global_rule_compile GLOBAL PROPERTY RULE_LAUNCH_COMPILE) + if("${kokkos_global_rule_compile}" MATCHES "kokkos_launch_compiler") + message(WARNING "The use of 'kokkos_launch_compiler' prevents reliable SVE detection. Disabling SVE.\n" + "You can force the use of SVE by using the Kokkos_ARCH_* flag specific to your target " + "processor instead of Kokkos_ARCH_NATIVE." 
+ ) + else() + check_source_compiles( + ${KOKKOS_COMPILE_LANGUAGE} + " + #include + #include + int main() { + svuint64_t z; + uint64x2_t res; + svbool_t pg0 = svpfirst(svptrue_b64(), svpfalse()); + svbool_t pg1 = svpnext_b64(pg0, pg0); + res[0] = svlastb(pg0, z); + res[1] = svlastb(pg1, z); + return 0; + } + " + KOKKOS_COMPILER_HAS_ARM_SVE + ) + endif() +endfunction() + if(KOKKOS_ARCH_ARMV80) set(KOKKOS_ARCH_ARM_NEON ON) compiler_specific_flags( @@ -760,10 +825,8 @@ if(KOKKOS_ARCH_NATIVE) check_cxx_symbol_exists(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) unset(KOKKOS_COMPILER_HAS_AVX2 CACHE) check_cxx_symbol_exists(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) - unset(KOKKOS_COMPILER_HAS_ARM_SVE CACHE) - check_cxx_symbol_exists(__ARM_FEATURE_SVE "" KOKKOS_COMPILER_HAS_ARM_SVE) - unset(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) - check_cxx_symbol_exists(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) + kokkos_use_sve_if_compiler_allows_it(COMPILER_FLAGS "${KOKKOS_NATIVE_FLAGS}") + kokkos_use_neon_if_compiler_allows_it(COMPILER_FLAGS "${KOKKOS_NATIVE_FLAGS}") unset(KOKKOS_COMPILER_HAS_AVX CACHE) check_cxx_symbol_exists(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) @@ -813,11 +876,6 @@ if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) set(KOKKOS_ARCH_AVX512XEON OFF) endif() -# FIXME_NVCC nvcc doesn't seem to support Arm Neon. 
-if(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - unset(KOKKOS_ARCH_ARM_NEON) -endif() - if(NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) compiler_specific_flags(Clang -fgpu-rdc --offload-new-driver NVIDIA --relocatable-device-code=true) @@ -834,6 +892,13 @@ if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL compiler_specific_defs(Clang _CRT_SECURE_NO_WARNINGS) endif() +# MSVC needs another flag to allow using __VA_OPT__ +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + compiler_specific_options(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC -Xcompiler=/Zc:preprocessor) +else() + compiler_specific_options(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC /Zc:preprocessor) +endif() + #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled if(KOKKOS_ENABLE_HIP) @@ -868,12 +933,17 @@ endif() # implementation. Otherwise, the feature is not supported when building shared # libraries. Thus, we don't even check for support if shared libraries are # requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. -# As of oneAPI 2025.0.0, this feature only works well for Intel GPUs. -# For simplicity only test for JIT and PVC +# As of oneAPI 2025.0.0, the codeplay documentation indicates support +# for device_global on Nvidia and AMD GPUs. However, testing suggested +# that the feature only works well as of oneAPI 2025.1.1. +# Otherwise, for simplicity we only test for JIT and PVC. 
if(KOKKOS_ENABLE_SYCL) string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") include(CheckCXXSymbolExists) - if(Kokkos_ARCH_INTEL_PVC OR Kokkos_ARCH_INTEL_GEN) + if(Kokkos_ARCH_INTEL_PVC OR Kokkos_ARCH_INTEL_GEN + OR (KOKKOS_ENABLE_UNSUPPORTED_ARCHS AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM + AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 2025.1.1) + ) check_cxx_symbol_exists( SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL ) @@ -923,14 +993,10 @@ function(CHECK_CUDA_ARCH ARCH FLAG) ) endif() set(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - if(NOT KOKKOS_ENABLE_CUDA - AND NOT KOKKOS_ENABLE_OPENMPTARGET - AND NOT KOKKOS_ENABLE_SYCL - AND NOT KOKKOS_ENABLE_OPENACC - ) + if(NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) message( WARNING - "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL and Kokkos_ENABLE_OPENACC are OFF. Option will be ignored." 
) unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) else() @@ -972,7 +1038,9 @@ check_cuda_arch(AMPERE87 sm_87) check_cuda_arch(ADA89 sm_89) check_cuda_arch(HOPPER90 sm_90) check_cuda_arch(BLACKWELL100 sm_100) +check_cuda_arch(BLACKWELL103 sm_103) check_cuda_arch(BLACKWELL120 sm_120) +check_cuda_arch(BLACKWELL121 sm_121) set(AMDGPU_ARCH_ALREADY_SPECIFIED "") function(CHECK_AMDGPU_ARCH ARCH FLAG) @@ -984,14 +1052,10 @@ function(CHECK_AMDGPU_ARCH ARCH FLAG) ) endif() set(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - if(NOT KOKKOS_ENABLE_HIP - AND NOT KOKKOS_ENABLE_OPENMPTARGET - AND NOT KOKKOS_ENABLE_OPENACC - AND NOT KOKKOS_ENABLE_SYCL - ) + if(NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENACC AND NOT KOKKOS_ENABLE_SYCL) message( WARNING - "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL and Kokkos_ENABLE_OPENACC are OFF. Option will be ignored." 
) unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) else() @@ -1069,22 +1133,6 @@ if(KOKKOS_ENABLE_OPENMP) compiler_specific_link_options(CrayClang -fopenmp) endif() -if(KOKKOS_ENABLE_OPENMPTARGET) - set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - if(CLANG_CUDA_ARCH) - string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) - compiler_specific_flags( - Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 NVHPC -gpu=${NVHPC_CUDA_ARCH} - ) - endif() - set(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) - if(CLANG_AMDGPU_ARCH) - compiler_specific_flags( - Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa - ) - endif() -endif() - if(KOKKOS_ENABLE_OPENACC) if(KOKKOS_CUDA_ARCH_FLAG) if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) @@ -1277,7 +1325,11 @@ if(KOKKOS_ARCH_HOPPER90) set(KOKKOS_ARCH_HOPPER ON) endif() -if(KOKKOS_ARCH_BLACKWELL100 OR KOKKOS_ARCH_BLACKWELL120) +if(KOKKOS_ARCH_BLACKWELL100 + OR KOKKOS_ARCH_BLACKWELL103 + OR KOKKOS_ARCH_BLACKWELL120 + OR KOKKOS_ARCH_BLACKWELL121 +) set(KOKKOS_ARCH_BLACKWELL ON) endif() @@ -1396,11 +1448,29 @@ foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) endif() endforeach() +#FIXME_HIP right now we only check if the arch autodetected by hip is the same as the one enabled in Kokkos. If not we warn/error +if(Kokkos_ENABLE_HIP) + foreach(arch IN LISTS GPU_TARGETS) + if(NOT (arch STREQUAL KOKKOS_HIP_ARCHITECTURES)) + if(KOKKOS_ENABLE_DEPRECATED_CODE_5) + set(MESSAGE_TYPE WARNING) + else() + set(MESSAGE_TYPE FATAL_ERROR) + endif() + message( + ${MESSAGE_TYPE} + "AMD GPU architectures given via GPU_TARGETS=\"${GPU_TARGETS}\" are not compatible with the architecture enabled in Kokkos which is ${KOKKOS_HIP_ARCHITECTURES}. Kokkos allows only one device architecture to be active. To resolve this, configure with -DGPU_TARGETS=\"${KOKKOS_HIP_ARCHITECTURES}\" to prevent it from being set implicitly by find_package calls." 
+ ) + break() + endif() + endforeach() +endif() + #CMake verbose is kind of pointless #Let's just always print things message(STATUS "Built-in Execution Spaces:") -foreach(_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) +foreach(_BACKEND Cuda HIP SYCL OpenACC) string(TOUPPER ${_BACKEND} UC_BACKEND) if(KOKKOS_ENABLE_${UC_BACKEND}) if(_DEVICE_PARALLEL) @@ -1413,15 +1483,6 @@ foreach(_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) ) endif() if(${_BACKEND} STREQUAL "Cuda") - if(KOKKOS_ENABLE_CUDA_UVM) - message( - DEPRECATION - "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead" - ) - if(NOT KOKKOS_ENABLE_DEPRECATED_CODE_4) - message(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") - endif() - endif() set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") elseif(${_BACKEND} STREQUAL "HIP" OR ${_BACKEND} STREQUAL "SYCL") set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index dfd218fa28d..d3cd935a992 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -156,7 +156,6 @@ unset(Kokkos_LANGUAGES) # Enforce the minimum compilers supported by Kokkos. 
set(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) set(KOKKOS_CLANG_CUDA_MINIMUM 15.0.0) -set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) set(KOKKOS_GCC_MINIMUM 10.4.0) set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2024.2.1) @@ -170,7 +169,6 @@ set(KOKKOS_MESSAGE_TEXT ) set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") -set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel not supported") set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") @@ -234,10 +232,6 @@ elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") endif() elseif(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) - if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - endif() endif() if(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index b1da1826771..9569af529fe 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -69,23 +69,6 @@ if(KOKKOS_ENABLE_OPENACC) -Wno-unknown-cuda-version -Wno-pass-failed ) - compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) -endif() - -kokkos_device_option(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") -if(KOKKOS_ENABLE_OPENMPTARGET) - set(ClangOpenMPFlag -fopenmp=libomp) - - compiler_specific_flags( - Clang - ${ClangOpenMPFlag} - -Wno-openmp-mapping - -Wno-unknown-cuda-version - 
-Wno-pass-failed - DEFAULT - -fopenmp - ) - compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) endif() if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 516b97723f5..5717cad1d0f 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -23,20 +23,10 @@ kokkos_cfg_depends(OPTIONS COMPILER_ID) kokkos_deprecated_list(OPTIONS ENABLE) kokkos_enable_option(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") -kokkos_enable_option(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") -kokkos_enable_option(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") -# In contrast to other CUDA-dependent, options CUDA_LAMBDA is ON by default. -# That is problematic when CUDA is not enabled because this not only yields a -# bogus warning, but also exports the Kokkos_ENABLE_CUDA_LAMBDA variable and -# sets it to ON. -kokkos_enable_option( - CUDA_LAMBDA ${KOKKOS_ENABLE_CUDA} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**" -) # As of 09/2024, cudaMallocAsync causes issues with ICP and older version of UCX # as MPI communication layer. 
kokkos_enable_option(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") -kokkos_enable_option(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") kokkos_enable_option(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") kokkos_enable_option(DEPRECATED_CODE_4 OFF "Whether code deprecated in major release 4 is available") @@ -59,6 +49,9 @@ kokkos_enable_option(IMPL_SYCL_OUT_OF_ORDER_QUEUES OFF "Whether to make Kokkos u kokkos_enable_option(TESTS OFF "Whether to build the unit tests") kokkos_enable_option(BENCHMARKS OFF "Whether to build the benchmarks") kokkos_enable_option(EXAMPLES OFF "Whether to build the examples") +if(Kokkos_ENABLE_BENCHMARKS) + kokkos_enable_option(BENCHMARKS_HEAVY OFF "Whether to build and run the long benchmarks") +endif() string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") set(DEBUG_DEFAULT ON) @@ -223,11 +216,8 @@ check_device_specific_options( DEVICE CUDA OPTIONS - CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE - CUDA_LAMBDA CUDA_CONSTEXPR - CUDA_LDG_INTRINSIC IMPL_CUDA_MALLOC_ASYNC IMPL_CUDA_UNIFIED_MEMORY ) @@ -279,26 +269,6 @@ if((KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_ENABLE_HIP_RELOCATABLE_ message(FATAL_ERROR "Relocatable device code requires static libraries.") endif() -if(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) - if(KOKKOS_ENABLE_DEPRECATED_CODE_4) - message(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") - else() - message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") - endif() -endif() -if(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) - if(KOKKOS_ENABLE_DEPRECATED_CODE_4) - message( - DEPRECATION - "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. 
Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON" - ) - set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE) - set(KOKKOS_ENABLE_CUDA_LAMBDA ON) - else() - message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") - endif() -endif() - if(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) message(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") endif() diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index ffd20832ab9..039965668c7 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -990,7 +990,7 @@ function(kokkos_check_flags) check_compiler_flag(${INP_LANGUAGE} "${QUOTED_FLAGS}" KOKKOS_COMPILE_OPTIONS_CHECK) if(NOT KOKKOS_COMPILE_OPTIONS_CHECK) message( - FATAL_ERROR + WARNING "The compiler for ${KOKKOS_COMPILE_LANGUAGE} can not consume flag(s) ${QUOTED_FLAGS} in combination with the CMAKE_${KOKKOS_COMPILE_LANGUAGE}_FLAGS=${CMAKE_${KOKKOS_COMPILE_LANGUAGE}_FLAGS}. Please check the given configuration." ) endif() @@ -1005,7 +1005,7 @@ function(kokkos_check_flags) check_linker_flag(${INP_LANGUAGE} "${QUOTED_FLAGS}" KOKKOS_LINK_OPTIONS_CHECK) if(NOT KOKKOS_LINK_OPTIONS_CHECK) message( - FATAL_ERROR + WARNING "The linker for ${KOKKOS_COMPILE_LANGUAGE} can not consume flag(s) ${QUOTED_FLAGS}. Please check the given configuration." ) endif() diff --git a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake index a5a07e4f81d..973597f8089 100644 --- a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,23 +1,7 @@ -# From CMake 3.10 documentation - -#This can run at any time -kokkos_option( - CXX_STANDARD - "" - STRING - "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 20, 23, and 26. If empty, this will default to CMAKE_CXX_STANDARD. 
If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 20" -) - # Set CXX standard flags set(KOKKOS_ENABLE_CXX20 OFF) set(KOKKOS_ENABLE_CXX23 OFF) set(KOKKOS_ENABLE_CXX26 OFF) -if(KOKKOS_CXX_STANDARD) - message( - FATAL_ERROR - "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead" - ) -endif() if(NOT CMAKE_CXX_STANDARD) set(KOKKOS_CXX_STANDARD "20") diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index e811a5b4c4e..e304867ff26 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -21,18 +21,8 @@ endfunction() kokkos_tpl_option(HWLOC Off TRIBITS HWLOC) kokkos_tpl_option(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -if(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - set(ROCM_DEFAULT ON) -else() - set(ROCM_DEFAULT OFF) -endif() -if(KOKKOS_ENABLE_HIP) - set(ROCTHRUST_DEFAULT ON) -else() - set(ROCTHRUST_DEFAULT OFF) -endif() -kokkos_tpl_option(ROCM ${ROCM_DEFAULT}) -kokkos_tpl_option(ROCTHRUST ${ROCTHRUST_DEFAULT}) +kokkos_tpl_option(ROCM ${Kokkos_ENABLE_HIP}) +kokkos_tpl_option(ROCTHRUST ${Kokkos_ENABLE_HIP}) if(KOKKOS_ENABLE_SYCL) set(ONEDPL_DEFAULT ON) @@ -95,7 +85,11 @@ if(Kokkos_ENABLE_OPENMP) global_append(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) endif() if(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - global_append(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + else() + global_append(KOKKOS_CUDA_OPTIONS ${OpenMP_CXX_FLAGS}) + endif() endif() endif() diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 9291bbfa193..63e3ba00b57 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -6,12 +6,26 @@ include(GNUInstallDirs) message(STATUS "The project name is: 
${PROJECT_NAME}") -if(GTest_FOUND) - set(KOKKOS_GTEST_LIB GTest::gtest) - message(STATUS "Using gtest found in ${GTest_DIR}") -else() # fallback to internal gtest - set(KOKKOS_GTEST_LIB kokkos_gtest) - message(STATUS "Using internal gtest for testing") +if(Kokkos_ENABLE_TESTS OR Kokkos_INSTALL_TESTING) + find_package(GTest QUIET) + if(GTest_FOUND) + message(STATUS "Found external GoogleTest: ${GTest_DIR} (version \"${GTest_VERSION}\")") + else() + message(STATUS "Using bundled GoogleTest version") + include(FetchContent) + list(APPEND CMAKE_MESSAGE_INDENT "[googletest] ") + FetchContent_Declare(googletest SOURCE_DIR ${Kokkos_SOURCE_DIR}/tpls/gtest) + FetchContent_MakeAvailable(googletest) + list(POP_BACK CMAKE_MESSAGE_INDENT) + + # Suppress clang-tidy diagnostics on code that we do not have control over + if(CMAKE_CXX_CLANG_TIDY) + set_target_properties(gtest PROPERTIES CXX_CLANG_TIDY "") + endif() + + # Suppress compiler warnings. TODO use SYSTEM within the FetchContent_Declare call when CMake 3.25 is required + set_target_properties(gtest PROPERTIES COMPILE_OPTIONS -w) + endif() endif() function(VERIFY_EMPTY CONTEXT) @@ -168,10 +182,17 @@ macro(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) cmake_parse_arguments(PARSE "" "" "SOURCES" ${ARGN}) # Don't do anything if the user disabled the test if(NOT ${PACKAGE_NAME}_${ROOT_NAME}_DISABLE) - kokkos_add_executable( - ${ROOT_NAME} SOURCES ${PARSE_SOURCES} ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS ${KOKKOS_GTEST_LIB} - ) + kokkos_add_executable(${ROOT_NAME} SOURCES ${PARSE_SOURCES} ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS GTest::gtest) set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + + # Suppress compiler warnings when not using an external gtest version. 
+ # TODO use SYSTEM within the FetchContent_Declare call when CMake 3.25 is required + get_target_property(GTEST_INCLUDES GTest::gtest INCLUDE_DIRECTORIES) + if(GTEST_INCLUDES) + foreach(dir ${GTEST_INCLUDES}) + target_include_directories(${EXE_NAME} SYSTEM PRIVATE "${dir}") + endforeach() + endif() endif() endmacro() diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index 8d4d605b087..9fd426a4229 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -7,8 +7,6 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) string(TOLOWER ${Tag} dir) if(Kokkos_ENABLE_${DEVICE}) - message(STATUS "Sources Test${Tag}.cpp") - set(SOURCES TestMain.cpp Test${Tag}.cpp) kokkos_add_executable_and_test(ContainersPerformanceTest_${Tag} SOURCES ${SOURCES}) diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp index fa7e13b102a..ce5c1d669db 100644 --- a/lib/kokkos/containers/performance_tests/TestCuda.cpp +++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp @@ -27,13 +27,13 @@ import kokkos.unordered_impl; namespace Performance { -TEST(TEST_CATEGORY, dynrankview_perf) { +TEST(cuda, dynrankview_perf) { std::cout << "Cuda" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(40960); } -TEST(TEST_CATEGORY, global_2_local) { +TEST(cuda, global_2_local) { std::cout << "Cuda" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -41,11 +41,11 @@ TEST(TEST_CATEGORY, global_2_local) { test_global_to_local_ids(i); } -TEST(TEST_CATEGORY, unordered_map_performance_near) { +TEST(cuda, unordered_map_performance_near) { Perf::run_performance_tests("cuda-near"); } -TEST(TEST_CATEGORY, unordered_map_performance_far) { +TEST(cuda, 
unordered_map_performance_far) { Perf::run_performance_tests("cuda-far"); } diff --git a/lib/kokkos/containers/performance_tests/TestHPX.cpp b/lib/kokkos/containers/performance_tests/TestHPX.cpp index b1bdcc22da2..d0976f8811c 100644 --- a/lib/kokkos/containers/performance_tests/TestHPX.cpp +++ b/lib/kokkos/containers/performance_tests/TestHPX.cpp @@ -27,13 +27,13 @@ import kokkos.unordered_map; namespace Performance { -TEST(TEST_CATEGORY, dynrankview_perf) { +TEST(hpx, dynrankview_perf) { std::cout << "HPX" << std::endl; std::cout << " DynRankView vs View: Initialization Only " << std::endl; test_dynrankview_op_perf(8192); } -TEST(TEST_CATEGORY, global_2_local) { +TEST(hpx, global_2_local) { std::cout << "HPX" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -41,7 +41,7 @@ TEST(TEST_CATEGORY, global_2_local) { test_global_to_local_ids(i); } -TEST(TEST_CATEGORY, unordered_map_performance_near) { +TEST(hpx, unordered_map_performance_near) { unsigned num_hpx = 4; std::ostringstream base_file_name; base_file_name << "hpx-" << num_hpx << "-near"; @@ -49,7 +49,7 @@ TEST(TEST_CATEGORY, unordered_map_performance_near) { base_file_name.str()); } -TEST(TEST_CATEGORY, unordered_map_performance_far) { +TEST(hpx, unordered_map_performance_far) { unsigned num_hpx = 4; std::ostringstream base_file_name; base_file_name << "hpx-" << num_hpx << "-far"; @@ -57,7 +57,7 @@ TEST(TEST_CATEGORY, unordered_map_performance_far) { base_file_name.str()); } -TEST(TEST_CATEGORY, scatter_view) { +TEST(hpx, scatter_view) { std::cout << "ScatterView data-duplicated test:\n"; Perf::test_scatter_view(8192); } -TEST(TEST_CATEGORY, global_2_local) { +TEST(openmp, global_2_local) { std::cout << "OpenMP" << std::endl; std::cout << "size, create, generate, fill, find" << std::endl; for (unsigned i = Performance::begin_id_size; i <= Performance::end_id_size; @@ -41,7 +41,7 @@ TEST(TEST_CATEGORY, 
global_2_local) { test_global_to_local_ids(i); } -TEST(TEST_CATEGORY, unordered_map_performance_near) { +TEST(openmp, unordered_map_performance_near) { unsigned num_openmp = 4; if (Kokkos::hwloc::available()) { num_openmp = Kokkos::hwloc::get_available_numa_count() * @@ -53,7 +53,7 @@ TEST(TEST_CATEGORY, unordered_map_performance_near) { Perf::run_performance_tests(base_file_name.str()); } -TEST(TEST_CATEGORY, unordered_map_performance_far) { +TEST(openmp, unordered_map_performance_far) { unsigned num_openmp = 4; if (Kokkos::hwloc::available()) { num_openmp = Kokkos::hwloc::get_available_numa_count() * @@ -65,7 +65,7 @@ TEST(TEST_CATEGORY, unordered_map_performance_far) { Perf::run_performance_tests(base_file_name.str()); } -TEST(TEST_CATEGORY, scatter_view) { +TEST(openmp, scatter_view) { std::cout << "ScatterView data-duplicated test:\n"; Perf::test_scatter_view&) = default; - - KOKKOS_DEFAULTED_FUNCTION - Bitset& operator=(const Bitset&) = default; - - KOKKOS_DEFAULTED_FUNCTION - Bitset(Bitset&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - Bitset& operator=(Bitset&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - ~Bitset() = default; - /// number of bits in the set /// can be call from the host or the device KOKKOS_FORCEINLINE_FUNCTION @@ -329,10 +314,6 @@ class ConstBitset { ConstBitset(Bitset const& rhs) : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {} - KOKKOS_FUNCTION - ConstBitset(ConstBitset const& rhs) - : m_size(rhs.m_size), m_blocks(rhs.m_blocks) {} - KOKKOS_FUNCTION ConstBitset& operator=(Bitset const& rhs) { this->m_size = rhs.m_size; @@ -341,14 +322,6 @@ class ConstBitset { return *this; } - KOKKOS_FUNCTION - ConstBitset& operator=(ConstBitset const& rhs) { - this->m_size = rhs.m_size; - this->m_blocks = rhs.m_blocks; - - return *this; - } - KOKKOS_FORCEINLINE_FUNCTION unsigned size() const { return m_size; } diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index fe0726df0f8..f4f3c40607c 100644 --- 
a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -67,15 +67,17 @@ namespace Impl { #ifdef KOKKOS_ENABLE_CUDA -inline const Kokkos::Cuda& get_cuda_space(const Kokkos::Cuda& in) { return in; } +inline cudaStream_t get_cuda_stream(const Kokkos::Cuda& in) { + return in.cuda_stream(); +} -inline const Kokkos::Cuda& get_cuda_space() { - return *Kokkos::Impl::cuda_get_deep_copy_space(); +inline cudaStream_t get_cuda_stream() { + return Kokkos::Impl::cuda_get_deep_copy_stream(); } template -inline const Kokkos::Cuda& get_cuda_space(const NonCudaExecSpace&) { - return get_cuda_space(); +inline cudaStream_t get_cuda_stream(const NonCudaExecSpace&) { + return get_cuda_stream(); } #endif // KOKKOS_ENABLE_CUDA @@ -210,12 +212,7 @@ class DualView : public ViewTraits { std::is_same_v; //@} -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - public: -#else private: -#endif - // Moved this specifically after modified_flags to resolve an alignment issue // on MSVC/NVCC //! \name The two View instances. 
@@ -552,7 +549,7 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Impl::get_cuda_space(args...), d_view.data(), + Impl::get_cuda_stream(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), true); } #endif @@ -569,7 +566,7 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Impl::get_cuda_space(args...), d_view.data(), + Impl::get_cuda_stream(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), false); } #endif @@ -652,7 +649,7 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Impl::get_cuda_space(args...), d_view.data(), + Impl::get_cuda_stream(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), false); } #endif @@ -695,7 +692,7 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Impl::get_cuda_space(args...), d_view.data(), + Impl::get_cuda_stream(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), true); } #endif diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 069fc59df5a..58df0b70864 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -668,10 +668,6 @@ class DynRankView : private View { { return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } KOKKOS_FUNCTION reference_type operator()(index_type i0, @@ -699,10 +695,6 @@ class DynRankView : private View { { return view_type::operator()(i0, i1, 0, 0, 0, 0, 0); } -#if 
defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } KOKKOS_FUNCTION reference_type operator()(index_type i0, index_type i1, @@ -734,10 +726,6 @@ class DynRankView : private View { { return view_type::operator()(i0, i1, i2, 0, 0, 0, 0); } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } KOKKOS_FUNCTION reference_type operator()(index_type i0, index_type i1, @@ -789,9 +777,6 @@ class DynRankView : private View { //---------------------------------------- // Standard constructor, destructor, and assignment operators... - KOKKOS_DEFAULTED_FUNCTION - ~DynRankView() = default; - KOKKOS_DEFAULTED_FUNCTION DynRankView() = default; //---------------------------------------- @@ -1606,10 +1591,6 @@ inline auto create_mirror(const DynRankView& src, return dst_type(create_mirror(arg_prop, src.DownCast()), src.rank()); } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } // namespace Impl @@ -1695,10 +1676,6 @@ inline auto create_mirror_view( return Kokkos::Impl::choose_create_mirror(src, arg_prop); } } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } // namespace Impl @@ -1785,10 +1762,6 @@ auto create_mirror_view_and_copy( deep_copy(mirror, src); return mirror; } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } template diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index 1f007fd7540..7b7e0393af4 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -35,15 +35,11 @@ struct ChunkedArrayManager { 
using pointer_type = ValueType*; using track_type = Kokkos::Impl::SharedAllocationTracker; - ChunkedArrayManager() = default; - ChunkedArrayManager(ChunkedArrayManager const&) = default; - ChunkedArrayManager(ChunkedArrayManager&&) = default; - ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; - ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; - template friend struct ChunkedArrayManager; + ChunkedArrayManager() = default; + template inline ChunkedArrayManager(const ChunkedArrayManager& rhs) : m_valid(rhs.m_valid), @@ -124,11 +120,7 @@ struct ChunkedArrayManager { /// allocation template struct Destroy { - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; - Destroy& operator=(const Destroy&) = default; + Destroy() = default; Destroy(std::string label, value_type** arg_chunk, const unsigned arg_chunk_max, const unsigned arg_chunk_size, @@ -475,12 +467,7 @@ class DynamicView : public Kokkos::ViewTraits { //---------------------------------------------------------------------- - ~DynamicView() = default; - DynamicView() = default; - DynamicView(DynamicView&&) = default; - DynamicView(const DynamicView&) = default; - DynamicView& operator=(DynamicView&&) = default; - DynamicView& operator=(const DynamicView&) = default; + DynamicView() = default; template DynamicView(const DynamicView& rhs) @@ -515,7 +502,7 @@ class DynamicView : public Kokkos::ViewTraits { m_chunk_max((max_extent + m_chunk_mask) >> m_chunk_shift) // max num pointers-to-chunks in array , - m_chunk_size(2 << (m_chunk_shift - 1)) { + m_chunk_size(1 << m_chunk_shift) { m_chunks = device_accessor(m_chunk_max, m_chunk_size); const std::string& label = @@ -634,10 +621,6 @@ inline auto create_mirror(const Kokkos::Experimental::DynamicView& src, return ret; } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); 
-#endif } } // namespace Impl @@ -732,10 +715,6 @@ inline auto create_mirror_view( return Kokkos::Impl::choose_create_mirror(src, arg_prop); } } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } // namespace Impl @@ -1001,10 +980,6 @@ auto create_mirror_view_and_copy( deep_copy(mirror, src); return mirror; } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } template & src, return typename Kokkos::Experimental::OffsetView::host_mirror_type( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } // namespace Impl @@ -1415,10 +1411,6 @@ inline auto create_mirror_view( return Kokkos::Impl::choose_create_mirror(src, arg_prop); } } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } // namespace Impl diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index 5a7b76df370..4b5599dd3f2 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -92,23 +92,6 @@ struct DefaultContribution -struct DefaultDuplication { - using type = Kokkos::Experimental::ScatterNonDuplicated; -}; -template <> -struct DefaultContribution { - using type = Kokkos::Experimental::ScatterAtomic; -}; -template <> -struct DefaultContribution { - using type = Kokkos::Experimental::ScatterNonAtomic; -}; -#endif - #ifdef KOKKOS_ENABLE_HPX template <> struct DefaultDuplication { @@ -195,20 +178,14 @@ struct DefaultContribution struct ScatterValue; /* ScatterValue is - the object returned by the access operator() of ScatterAccess. 
This class - inherits from the Sum<> reducer and it wraps join(dest, src) with convenient - operator+=, etc. Note the addition of update(ValueType const& rhs) and - reset() so that all reducers can have common functions See ReduceDuplicates - and ResetDuplicates ) */ + the object returned by the access operator() of ScatterAccess. */ template struct ScatterValue { @@ -217,8 +194,9 @@ struct ScatterValue is the - object returned by the access operator() of ScatterAccess. This class inherits - from the Sum<> reducer, and similar to that returned by an Atomic View, it - wraps Kokkos::atomic_add with convenient operator+=, etc. This version also has - the update(rhs) and reset() functions. */ + object returned by the access operator() of ScatterAccess. */ template struct ScatterValue { @@ -250,29 +225,26 @@ struct ScatterValuejoin(value, rhs); + Kokkos::atomic_add(&value, rhs); } - KOKKOS_FORCEINLINE_FUNCTION void operator++() { this->join(value, 1); } - KOKKOS_FORCEINLINE_FUNCTION void operator++(int) { this->join(value, 1); } - KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) { - this->join(value, ValueType(-rhs)); + KOKKOS_FORCEINLINE_FUNCTION void operator++() { Kokkos::atomic_inc(&value); } + KOKKOS_FORCEINLINE_FUNCTION void operator++(int) { + Kokkos::atomic_inc(&value); } - KOKKOS_FORCEINLINE_FUNCTION void operator--() { - this->join(value, ValueType(-1)); + KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) { + Kokkos::atomic_sub(&value, rhs); } + KOKKOS_FORCEINLINE_FUNCTION void operator--() { Kokkos::atomic_dec(&value); } KOKKOS_FORCEINLINE_FUNCTION void operator--(int) { - this->join(value, ValueType(-1)); - } - - KOKKOS_INLINE_FUNCTION - void join(ValueType& dest, const ValueType& src) const { - Kokkos::atomic_add(&dest, src); + Kokkos::atomic_dec(&value); } KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { - this->join(value, rhs); + Kokkos::atomic_add(&value, rhs); } KOKKOS_FORCEINLINE_FUNCTION void reset() { 
@@ -281,11 +253,7 @@ struct ScatterValue is - the object returned by the access operator() of ScatterAccess. This class - inherits from the Prod<> reducer, and it wraps join(dest, src) with - convenient operator*=, etc. Note the addition of update(ValueType const& rhs) - and reset() so that all reducers can have common functions See - ReduceDuplicates and ResetDuplicates ) */ + the object returned by the access operator() of ScatterAccess. */ template struct ScatterValue { @@ -294,8 +262,11 @@ struct ScatterValue is the - object returned by the access operator() of ScatterAccess. This class - inherits from the Prod<> reducer, and similar to that returned by an Atomic - View, it wraps and atomic_prod with convenient operator*=, etc. atomic_prod - uses the atomic_compare_exchange. This version also has the update(rhs) - and reset() functions. */ + object returned by the access operator() of ScatterAccess. */ template struct ScatterValue { @@ -325,8 +292,10 @@ struct ScatterValue::prod(); @@ -349,21 +313,18 @@ struct ScatterValue is - the object returned by the access operator() of ScatterAccess. This class - inherits from the Min<> reducer and it wraps join(dest, src) with convenient - update(rhs). Note the addition of update(ValueType const& rhs) and reset() - are so that all reducers can have a common update function See - ReduceDuplicates and ResetDuplicates ) */ + the object returned by the access operator() of ScatterAccess. */ template struct ScatterValue { ValueType& value; KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value(value_in) {} - KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) - : value(other.value) {} - public: + KOKKOS_FUNCTION ScatterValue(const ScatterValue&) = delete; + KOKKOS_FUNCTION ScatterValue& operator=(const ScatterValue&) = delete; + KOKKOS_DEFAULTED_FUNCTION ~ScatterValue() = default; + KOKKOS_FORCEINLINE_FUNCTION void update(ValueType const& rhs) { value = rhs < value ? 
rhs : value; } @@ -373,11 +334,7 @@ struct ScatterValue is the - object returned by the access operator() of ScatterAccess. This class - inherits from the Min<> reducer, and similar to that returned by an Atomic - View, it wraps atomic_min with join(), etc. atomic_min uses the - atomic_compare_exchange. This version also has the update(rhs) and reset() - functions. */ + object returned by the access operator() of ScatterAccess. */ template struct ScatterValue { @@ -386,16 +343,12 @@ struct ScatterValuejoin(value, rhs); + Kokkos::atomic_min(&value, rhs); } KOKKOS_FORCEINLINE_FUNCTION void reset() { value = reduction_identity::min(); @@ -403,11 +356,7 @@ struct ScatterValue is - the object returned by the access operator() of ScatterAccess. This class - inherits from the Max<> reducer and it wraps join(dest, src) with convenient - update(rhs). Note the addition of update(ValueType const& rhs) and reset() - are so that all reducers can have a common update function See - ReduceDuplicates and ResetDuplicates ) */ + the object returned by the access operator() of ScatterAccess. */ template struct ScatterValue { @@ -416,8 +365,11 @@ struct ScatterValue value ? rhs : value; } @@ -427,11 +379,7 @@ struct ScatterValue is the - object returned by the access operator() of ScatterAccess. This class - inherits from the Max<> reducer, and similar to that returned by an Atomic - View, it wraps atomic_max with join(), etc. atomic_max uses the - atomic_compare_exchange. This version also has the update(rhs) and reset() - functions. */ + object returned by the access operator() of ScatterAccess. */ template struct ScatterValue { @@ -440,16 +388,12 @@ struct ScatterValuejoin(value, rhs); + Kokkos::atomic_max(&value, rhs); } KOKKOS_FORCEINLINE_FUNCTION void reset() { value = reduction_identity::max(); @@ -914,8 +858,6 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION value_type operator()(Args... 
args) const { @@ -1561,7 +1503,8 @@ namespace Experimental { template void contribute( - typename ES::execution_space const& exec_space, View& dest, + typename ES::execution_space const& exec_space, + View const& dest, Kokkos::Experimental::ScatterView const& src) { src.contribute_into(exec_space, dest); } @@ -1569,7 +1512,7 @@ void contribute( template void contribute( - View& dest, + View const& dest, Kokkos::Experimental::ScatterView const& src) { using execution_space = typename ES::execution_space; contribute(execution_space{}, dest, src); diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp deleted file mode 100644 index a3b6b871782..00000000000 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ /dev/null @@ -1,469 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_STATICCRSGRAPH_HPP -#define KOKKOS_STATICCRSGRAPH_HPP -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STATICCRSGRAPH -#endif - -#include - -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) -#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ - !defined(KOKKOS_IMPL_DO_NOT_WARN_INCLUDE_STATIC_CRS_GRAPH) -namespace { -[[deprecated("Deprecated header is included")]] int -emit_warning_kokkos_static_crs_graph_deprecated() { - return 0; -} -static auto do_not_include = emit_warning_kokkos_static_crs_graph_deprecated(); -} // namespace -#endif -#else -#error "Deprecated header is included" -#endif - -#include -#include - -#include -#include -#include - -namespace Kokkos { - -namespace Impl { -template -struct StaticCrsGraphBalancerFunctor { - using int_type = typename RowOffsetsType::non_const_value_type; - RowOffsetsType row_offsets; - RowBlockOffsetsType row_block_offsets; - - int_type cost_per_row, num_blocks; - - 
StaticCrsGraphBalancerFunctor(RowOffsetsType row_offsets_, - RowBlockOffsetsType row_block_offsets_, - int_type cost_per_row_, int_type num_blocks_) - : row_offsets(row_offsets_), - row_block_offsets(row_block_offsets_), - cost_per_row(cost_per_row_), - num_blocks(num_blocks_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const int_type& iRow) const { - const int_type num_rows = row_offsets.extent(0) - 1; - const int_type num_entries = row_offsets(num_rows); - const int_type total_cost = num_entries + num_rows * cost_per_row; - - const double cost_per_workset = 1.0 * total_cost / num_blocks; - - const int_type row_cost = - row_offsets(iRow + 1) - row_offsets(iRow) + cost_per_row; - - int_type count = row_offsets(iRow + 1) + cost_per_row * iRow; - - if (iRow == num_rows - 1) row_block_offsets(num_blocks) = num_rows; - - if (true) { - int_type current_block = - (count - row_cost - cost_per_row) / cost_per_workset; - int_type end_block = count / cost_per_workset; - - // Handle some corner cases for the last two blocks. 
- if (current_block >= num_blocks - 2) { - if ((current_block == num_blocks - 2) && - (count >= (current_block + 1) * cost_per_workset)) { - int_type row = iRow; - int_type cc = count - row_cost - cost_per_row; - int_type block = cc / cost_per_workset; - while ((block > 0) && (block == current_block)) { - cc = row_offsets(row) + row * cost_per_row; - block = cc / cost_per_workset; - row--; - } - if ((count - cc - row_cost - cost_per_row) < - num_entries - row_offsets(iRow + 1)) { - row_block_offsets(current_block + 1) = iRow + 1; - } else { - row_block_offsets(current_block + 1) = iRow; - } - } - } else { - if ((count >= (current_block + 1) * cost_per_workset) || - (iRow + 2 == int_type(row_offsets.extent(0)))) { - if (end_block > current_block + 1) { - int_type num_block = end_block - current_block; - row_block_offsets(current_block + 1) = iRow; - for (int_type block = current_block + 2; block <= end_block; - block++) - if ((block < current_block + 2 + (num_block - 1) / 2)) - row_block_offsets(block) = iRow; - else - row_block_offsets(block) = iRow + 1; - } else { - row_block_offsets(current_block + 1) = iRow + 1; - } - } - } - } - } -}; -} // namespace Impl - -/// \class GraphRowViewConst -/// \brief View of a row of a sparse graph. -/// \tparam GraphType Sparse graph type, such as (but not limited to) -/// StaticCrsGraph. -/// -/// This class provides a generic view of a row of a sparse graph. -/// We intended this class to view a row of a StaticCrsGraph, but -/// GraphType need not necessarily be CrsMatrix. -/// -/// The row view is suited for computational kernels like sparse -/// matrix-vector multiply, as well as for modifying entries in the -/// sparse matrix. The view is always const as it does not allow graph -/// modification. 
-/// -/// Here is an example loop over the entries in the row: -/// \code -/// using ordinal_type = typename GraphRowViewConst::ordinal_type; -/// -/// GraphRowView G_i = ...; -/// const ordinal_type numEntries = G_i.length; -/// for (ordinal_type k = 0; k < numEntries; ++k) { -/// ordinal_type j = G_i.colidx (k); -/// // ... do something with A_ij and j ... -/// } -/// \endcode -/// -/// GraphType must provide the \c data_type -/// aliases. In addition, it must make sense to use GraphRowViewConst to -/// view a row of GraphType. In particular, column -/// indices of a row must be accessible using the entries -/// resp. colidx arrays given to the constructor of this -/// class, with a constant stride between successive entries. -/// The stride is one for the compressed sparse row storage format (as -/// is used by CrsMatrix), but may be greater than one for other -/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal). -template -struct GraphRowViewConst { - //! The type of the column indices in the row. - using ordinal_type = const typename GraphType::data_type; - - private: - //! Array of (local) column indices in the row. - ordinal_type* colidx_; - /// \brief Stride between successive entries in the row. - /// - /// For compressed sparse row (CSR) storage, this is always one. - /// This might be greater than one for storage formats like ELLPACK - /// or Jagged Diagonal. Nevertheless, the stride can never be - /// greater than the number of rows or columns in the matrix. Thus, - /// \c ordinal_type is the correct type. - const ordinal_type stride_; - - public: - /// \brief Constructor - /// - /// \param values [in] Array of the row's values. - /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in - /// each of the above arrays. - /// \param count [in] Number of entries in the row. 
- KOKKOS_INLINE_FUNCTION - GraphRowViewConst(ordinal_type* const colidx_in, const ordinal_type& stride, - const ordinal_type& count) - : colidx_(colidx_in), stride_(stride), length(count) {} - - /// \brief Constructor with offset into \c colidx array - /// - /// \param colidx [in] Array of the row's column indices. - /// \param stride [in] (Constant) stride between matrix entries in - /// each of the above arrays. - /// \param count [in] Number of entries in the row. - /// \param idx [in] Start offset into \c colidx array - /// - /// \tparam OffsetType The type of \c idx (see above). Must be a - /// built-in integer type. This may differ from ordinal_type. - /// For example, the matrix may have dimensions that fit in int, - /// but a number of entries that does not fit in int. - template - KOKKOS_INLINE_FUNCTION GraphRowViewConst( - const typename GraphType::entries_type& colidx_in, - const ordinal_type& stride, const ordinal_type& count, - const OffsetType& idx, - const std::enable_if_t, int>& = 0) - : colidx_(&colidx_in(idx)), stride_(stride), length(count) {} - - /// \brief Number of entries in the row. - /// - /// This is a public const field rather than a public const method, - /// in order to avoid possible overhead of a method call if the - /// compiler is unable to inline that method call. - /// - /// We assume that rows contain no duplicate entries (i.e., entries - /// with the same column index). Thus, a row may have up to - /// A.numCols() entries. This means that the correct type of - /// 'length' is ordinal_type. - const ordinal_type length; - - /// \brief (Const) reference to the column index of entry i in this - /// row of the sparse matrix. - /// - /// "Entry i" is not necessarily the entry with column index i, nor - /// does i necessarily correspond to the (local) row index. 
- KOKKOS_INLINE_FUNCTION - ordinal_type& colidx(const ordinal_type& i) const { - return colidx_[i * stride_]; - } - - /// \brief An alias for colidx - KOKKOS_INLINE_FUNCTION - ordinal_type& operator()(const ordinal_type& i) const { return colidx(i); } -}; - -/// \class StaticCrsGraph -/// \brief Compressed row storage array. -/// -/// \tparam DataType The type of stored entries. If a StaticCrsGraph is -/// used as the graph of a sparse matrix, then this is usually an -/// integer type, the type of the column indices in the sparse -/// matrix. -/// -/// \tparam Arg1Type The second template parameter, corresponding -/// either to the Device type (if there are no more template -/// parameters) or to the Layout type (if there is at least one more -/// template parameter). -/// -/// \tparam Arg2Type The third template parameter, which if provided -/// corresponds to the Device type. -/// -/// \tparam Arg3Type The third template parameter, which if provided -/// corresponds to the MemoryTraits. -/// -/// \tparam SizeType The type of row offsets. Usually the default -/// parameter suffices. However, setting a nondefault value is -/// necessary in some cases, for example, if you want to have a -/// sparse matrices with dimensions (and therefore column indices) -/// that fit in \c int, but want to store more than INT_MAX -/// entries in the sparse matrix. -/// -/// A row has a range of entries: -///
    -///
  • row_map[i0] <= entry < row_map[i0+1]
  • -///
  • 0 <= i1 < row_map[i0+1] - row_map[i0]
  • -///
  • entries( entry , i2 , i3 , ... );
  • -///
  • entries( row_map[i0] + i1 , i2 , i3 , ... );
  • -///
-template ::size_type> -class StaticCrsGraph { - private: - using traits = ViewTraits; - - public: - using data_type = DataType; - using array_layout = typename traits::array_layout; - using execution_space = typename traits::execution_space; - using device_type = typename traits::device_type; - using memory_traits = typename traits::memory_traits; - using size_type = SizeType; - - using staticcrsgraph_type = - StaticCrsGraph; - - using host_mirror_type = StaticCrsGraph; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - using HostMirror KOKKOS_DEPRECATED_WITH_COMMENT( - "Use host_mirror_type instead.") = host_mirror_type; -#endif - - using row_map_type = - View; - using entries_type = - View; - using row_block_type = - View; - - entries_type entries; - row_map_type row_map; - row_block_type row_block_offsets; - - KOKKOS_DEFAULTED_FUNCTION - StaticCrsGraph() = default; - - template - KOKKOS_INLINE_FUNCTION StaticCrsGraph(const EntriesType& entries_, - const RowMapType& row_map_) - : entries(entries_), row_map(row_map_) {} - - template - KOKKOS_INLINE_FUNCTION StaticCrsGraph(const StaticCrsGraph& other) - : entries(other.entries), - row_map(other.row_map), - row_block_offsets(other.row_block_offsets) {} - - /** \brief Return number of rows in the graph - */ - KOKKOS_INLINE_FUNCTION - size_type numRows() const { - return (row_map.extent(0) != 0) - ? row_map.extent(0) - static_cast(1) - : static_cast(0); - } - - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return (row_map.is_allocated() && entries.is_allocated()); - } - - /// \brief Return a const view of row i of the graph. - /// - /// If row i does not belong to the graph, return an empty view. - /// - /// The returned object \c view implements the following interface: - ///
    - ///
  • \c view.length is the number of entries in the row
  • - ///
  • \c view.colidx(k) returns a const reference to the - /// column index of the k-th entry in the row
  • - ///
- /// k is not a column index; it just counts from 0 to - /// view.length - 1. - /// - /// Users should not rely on the return type of this method. They - /// should instead assign to 'auto'. That allows compile-time - /// polymorphism for different kinds of sparse matrix formats (e.g., - /// ELLPACK or Jagged Diagonal) that we may wish to support in the - /// future. - KOKKOS_INLINE_FUNCTION - GraphRowViewConst rowConst(const data_type i) const { - const size_type start = row_map(i); - // count is guaranteed to fit in ordinal_type, as long as no row - // has duplicate entries. - const data_type count = static_cast(row_map(i + 1) - start); - - if (count == 0) { - return GraphRowViewConst(nullptr, 1, 0); - } else { - return GraphRowViewConst(entries, 1, count, start); - } - } - - /** \brief Create a row partitioning into a given number of blocks - * balancing non-zeros + a fixed cost per row. - */ - void create_block_partitioning(size_type num_blocks, - size_type fix_cost_per_row = 4) { - View block_offsets( - "StatisCrsGraph::load_balance_offsets", num_blocks + 1); - - Impl::StaticCrsGraphBalancerFunctor< - row_map_type, View > - partitioner(row_map, block_offsets, fix_cost_per_row, num_blocks); - - Kokkos::parallel_for("Kokkos::StaticCrsGraph::create_block_partitioning", - Kokkos::RangePolicy(0, numRows()), - partitioner); - typename device_type::execution_space().fence( - "Kokkos::StaticCrsGraph::create_block_partitioning:: fence after " - "partition"); - - row_block_offsets = block_offsets; - } -}; - -//---------------------------------------------------------------------------- - -template -typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( - const std::string& label, const std::vector& input); - -template -typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( - const std::string& label, - const std::vector >& input); - -//---------------------------------------------------------------------------- - -template -typename 
StaticCrsGraph::host_mirror_type -create_mirror_view(const StaticCrsGraph& input); - -template -typename StaticCrsGraph::host_mirror_type -create_mirror(const StaticCrsGraph& input); - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct StaticCrsGraphMaximumEntry { - using execution_space = typename GraphType::execution_space; - using value_type = typename GraphType::data_type; - - const typename GraphType::entries_type entries; - - StaticCrsGraphMaximumEntry(const GraphType& graph) : entries(graph.entries) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const unsigned i, value_type& update) const { - if (update < entries(i)) update = entries(i); - } - - KOKKOS_INLINE_FUNCTION - void init(value_type& update) const { update = 0; } - - KOKKOS_INLINE_FUNCTION - void join(value_type& update, const value_type& input) const { - if (update < input) update = input; - } -}; - -} // namespace Impl - -template -DataType maximum_entry(const StaticCrsGraph& graph) { - using GraphType = - StaticCrsGraph; - using FunctorType = Impl::StaticCrsGraphMaximumEntry; - - DataType result = 0; - Kokkos::parallel_reduce("Kokkos::maximum_entry", graph.entries.extent(0), - FunctorType(graph), result); - return result; -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STATICCRSGRAPH -#undef KOKKOS_IMPL_PUBLIC_INCLUDE -#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_STATICCRSGRAPH -#endif -#endif /* #ifndef KOKKOS_CRSARRAY_HPP */ diff --git 
a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp deleted file mode 100644 index 63eca15c3db..00000000000 --- a/lib/kokkos/containers/src/Kokkos_Vector.hpp +++ /dev/null @@ -1,327 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_VECTOR_HPP -#define KOKKOS_VECTOR_HPP -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_VECTOR -#endif - -#include - -#if defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) -#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) -namespace { -[[deprecated("Deprecated header is included")]] int -emit_warning_kokkos_vector_deprecated() { - return 0; -} -static auto do_not_include = emit_warning_kokkos_vector_deprecated(); -} // namespace -#endif -#else -#error "Deprecated header is included" -#endif - -#include -#include - -/* Drop in replacement for std::vector based on Kokkos::DualView - * Most functions only work on the host (it will not compile if called from - * device kernel) - * - */ -namespace Kokkos { - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -class KOKKOS_DEPRECATED vector - : public DualView { - public: - using value_type = Scalar; - using pointer = Scalar*; - using const_pointer = const Scalar*; - using reference = Scalar&; - using const_reference = const Scalar&; - using iterator = Scalar*; - using const_iterator = const Scalar*; - using size_type = size_t; - - private: - size_t _size; - float _extra_storage; - using DV = DualView; - - public: -#ifdef KOKKOS_ENABLE_CUDA_UVM - KOKKOS_INLINE_FUNCTION reference operator()(int i) const { - return DV::view_host()(i); - }; - KOKKOS_INLINE_FUNCTION reference operator[](int i) const { - return DV::view_host()(i); - }; -#else - inline reference operator()(int i) const { return DV::view_host()(i); } - inline reference operator[](int i) const { return DV::view_host()(i); } -#endif 
- - /* Member functions which behave like std::vector functions */ - - vector() : DV() { - _size = 0; - _extra_storage = 1.1; - } - - vector(int n, Scalar val = Scalar()) - : DualView("Vector", size_t(n * (1.1))) { - _size = n; - _extra_storage = 1.1; - DV::modified_flags(0) = 1; - - assign(n, val); - } - - void resize(size_t n) { - if (n >= span()) DV::resize(size_t(n * _extra_storage)); - _size = n; - } - - void resize(size_t n, const Scalar& val) { assign(n, val); } - - void assign(size_t n, const Scalar& val) { - /* Resize if necessary (behavior of std:vector) */ - - if (n > span()) DV::resize(size_t(n * _extra_storage)); - _size = n; - - /* Assign value either on host or on device */ - - if (DV::template need_sync()) { - set_functor_host f(DV::view_host(), val); - parallel_for("Kokkos::vector::assign", n, f); - typename DV::t_host::execution_space().fence( - "Kokkos::vector::assign: fence after assigning values"); - DV::template modify(); - } else { - set_functor f(DV::view_device(), val); - parallel_for("Kokkos::vector::assign", n, f); - typename DV::t_dev::execution_space().fence( - "Kokkos::vector::assign: fence after assigning values"); - DV::template modify(); - } - } - - void reserve(size_t n) { DV::resize(size_t(n * _extra_storage)); } - - void push_back(Scalar val) { - if (_size == span()) { - size_t new_size = _size * _extra_storage; - if (new_size == _size) new_size++; - DV::resize(new_size); - } - - DV::sync_host(); - DV::view_host()(_size) = val; - _size++; - DV::modify_host(); - } - - void pop_back() { _size--; } - - void clear() { _size = 0; } - - iterator insert(iterator it, const value_type& val) { - return insert(it, 1, val); - } - - iterator insert(iterator it, size_type count, const value_type& val) { - if ((size() == 0) && (it == begin())) { - resize(count, val); - DV::sync_host(); - return begin(); - } - DV::sync_host(); - DV::modify_host(); - if (std::less<>()(it, begin()) || std::less<>()(end(), it)) - 
Kokkos::abort("Kokkos::vector::insert : invalid insert iterator"); - if (count == 0) return it; - ptrdiff_t start = std::distance(begin(), it); - auto org_size = size(); - resize(size() + count); - - std::copy_backward(begin() + start, begin() + org_size, - begin() + org_size + count); - std::fill_n(begin() + start, count, val); - - return begin() + start; - } - - private: - template - struct impl_is_input_iterator : /* TODO replace this */ std::bool_constant< - !std::is_convertible_v> {}; - - public: - // TODO: can use detection idiom to generate better error message here later - template - std::enable_if_t::value, iterator> - insert(iterator it, InputIterator b, InputIterator e) { - ptrdiff_t count = std::distance(b, e); - - DV::sync_host(); - DV::modify_host(); - if (std::less<>()(it, begin()) || std::less<>()(end(), it)) - Kokkos::abort("Kokkos::vector::insert : invalid insert iterator"); - - ptrdiff_t start = std::distance(begin(), it); - auto org_size = size(); - - // Note: resize(...) invalidates it; use begin() + start instead - resize(size() + count); - - std::copy_backward(begin() + start, begin() + org_size, - begin() + org_size + count); - std::copy(b, e, begin() + start); - - return begin() + start; - } - - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return DV::is_allocated(); - } - - size_type size() const { return _size; } - size_type max_size() const { return 2000000000; } - size_type span() const { return DV::span(); } - bool empty() const { return _size == 0; } - - pointer data() const { return DV::view_host().data(); } - - iterator begin() const { return DV::view_host().data(); } - - const_iterator cbegin() const { return DV::view_host().data(); } - - iterator end() const { - return _size > 0 ? DV::view_host().data() + _size : DV::view_host().data(); - } - - const_iterator cend() const { - return _size > 0 ? 
DV::view_host().data() + _size : DV::view_host().data(); - } - - reference front() { return DV::view_host()(0); } - - reference back() { return DV::view_host()(_size - 1); } - - const_reference front() const { return DV::view_host()(0); } - - const_reference back() const { return DV::view_host()(_size - 1); } - - /* std::algorithms which work originally with iterators, here they are - * implemented as member functions */ - - size_t lower_bound(const size_t& start, const size_t& theEnd, - const Scalar& comp_val) const { - int lower = start; // FIXME (mfh 24 Apr 2014) narrowing conversion - int upper = - _size > theEnd - ? theEnd - : _size - 1; // FIXME (mfh 24 Apr 2014) narrowing conversion - if (upper <= lower) { - return theEnd; - } - - Scalar lower_val = DV::view_host()(lower); - Scalar upper_val = DV::view_host()(upper); - size_t idx = (upper + lower) / 2; - Scalar val = DV::view_host()(idx); - if (val > upper_val) return upper; - if (val < lower_val) return start; - - while (upper > lower) { - if (comp_val > val) { - lower = ++idx; - } else { - upper = idx; - } - idx = (upper + lower) / 2; - val = DV::view_host()(idx); - } - return idx; - } - - bool is_sorted() { - for (int i = 0; i < _size - 1; i++) { - if (DV::view_host()(i) > DV::view_host()(i + 1)) return false; - } - return true; - } - - iterator find(Scalar val) const { - if (_size == 0) return end(); - - int upper, lower, current; - current = _size / 2; - upper = _size - 1; - lower = 0; - - if ((val < DV::view_host()(0)) || (val > DV::view_host()(_size - 1))) - return end(); - - while (upper > lower) { - if (val > DV::view_host()(current)) - lower = current + 1; - else - upper = current; - current = (upper + lower) / 2; - } - - if (val == DV::view_host()(current)) - return &DV::view_host()(current); - else - return end(); - } - - /* Additional functions for data management */ - - void device_to_host() { deep_copy(DV::view_host(), DV::view_device()); } - void host_to_device() const { 
deep_copy(DV::view_device(), DV::view_host()); } - - void on_host() { DV::template modify(); } - void on_device() { DV::template modify(); } - - void set_overallocation(float extra) { _extra_storage = 1.0 + extra; } - - public: - struct set_functor { - using execution_space = typename DV::t_dev::execution_space; - typename DV::t_dev _data; - Scalar _val; - - set_functor(typename DV::t_dev data, Scalar val) : _data(data), _val(val) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const int& i) const { _data(i) = _val; } - }; - - struct set_functor_host { - using execution_space = typename DV::t_host::execution_space; - typename DV::t_host _data; - Scalar _val; - - set_functor_host(typename DV::t_host data, Scalar val) - : _data(data), _val(val) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const int& i) const { _data(i) = _val; } - }; -}; -#endif - -} // namespace Kokkos -#ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_VECTOR -#undef KOKKOS_IMPL_PUBLIC_INCLUDE -#undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_VECTOR -#endif -#endif diff --git a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp deleted file mode 100644 index 557bea04f7f..00000000000 --- a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp +++ /dev/null @@ -1,176 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP -#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#include -#ifdef KOKKOS_ENABLE_EXPERIMENTAL_CXX20_MODULES -import kokkos.core; -#else -#include -#endif -#include - -namespace Kokkos { - -template -inline typename StaticCrsGraph::host_mirror_type -create_mirror_view(const StaticCrsGraph& view, - 
std::enable_if_t::is_hostspace>* = 0) { - return view; -} - -template -inline typename StaticCrsGraph::host_mirror_type -create_mirror(const StaticCrsGraph& view) { - // Force copy: - // using alloc = Impl::ViewAssignment; // unused - using staticcrsgraph_type = - StaticCrsGraph; - - typename staticcrsgraph_type::host_mirror_type tmp; - typename staticcrsgraph_type::row_map_type::host_mirror_type tmp_row_map = - create_mirror(view.row_map); - typename staticcrsgraph_type::row_block_type::host_mirror_type - tmp_row_block_offsets = create_mirror(view.row_block_offsets); - - // Allocation to match: - tmp.row_map = tmp_row_map; // Assignment of 'const' from 'non-const' - tmp.entries = create_mirror(view.entries); - tmp.row_block_offsets = - tmp_row_block_offsets; // Assignment of 'const' from 'non-const' - - // Deep copy: - deep_copy(tmp_row_map, view.row_map); - deep_copy(tmp.entries, view.entries); - deep_copy(tmp_row_block_offsets, view.row_block_offsets); - - return tmp; -} - -template -inline typename StaticCrsGraph::host_mirror_type -create_mirror_view(const StaticCrsGraph& view, - std::enable_if_t::is_hostspace>* = 0) { - return create_mirror(view); -} -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( - const std::string& label, const std::vector& input) { - using output_type = StaticCrsGraphType; - using entries_type = typename output_type::entries_type; - using work_type = View< - typename output_type::size_type[], typename output_type::array_layout, - typename output_type::device_type, typename output_type::memory_traits>; - - output_type output; - - // Create the row map: - - const size_t length = input.size(); - - { - work_type row_work("tmp", length + 1); - - typename work_type::host_mirror_type row_work_host = 
- create_mirror_view(row_work); - - size_t sum = 0; - row_work_host[0] = 0; - for (size_t i = 0; i < length; ++i) { - row_work_host[i + 1] = sum += input[i]; - } - - deep_copy(row_work, row_work_host); - - output.entries = entries_type(label, sum); - output.row_map = row_work; - } - - return output; -} - -//---------------------------------------------------------------------------- - -template -inline typename StaticCrsGraphType::staticcrsgraph_type create_staticcrsgraph( - const std::string& label, - const std::vector >& input) { - using output_type = StaticCrsGraphType; - using entries_type = typename output_type::entries_type; - - static_assert(entries_type::rank == 1, "Graph entries view must be rank one"); - - using work_type = View< - typename output_type::size_type[], typename output_type::array_layout, - typename output_type::device_type, typename output_type::memory_traits>; - - output_type output; - - // Create the row map: - - const size_t length = input.size(); - - { - work_type row_work("tmp", length + 1); - - typename work_type::host_mirror_type row_work_host = - create_mirror_view(row_work); - - size_t sum = 0; - row_work_host[0] = 0; - for (size_t i = 0; i < length; ++i) { - row_work_host[i + 1] = sum += input[i].size(); - } - - deep_copy(row_work, row_work_host); - - output.entries = entries_type(label, sum); - output.row_map = row_work; - } - - // Fill in the entries: - { - typename entries_type::host_mirror_type host_entries = - create_mirror_view(output.entries); - - size_t sum = 0; - for (size_t i = 0; i < length; ++i) { - for (size_t j = 0; j < input[i].size(); ++j, ++sum) { - host_entries(sum) = input[i][j]; - } - } - - deep_copy(output.entries, host_entries); - } - - return output; -} - -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */ diff --git 
a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index ad64811b34d..8f95dc91fb4 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -11,7 +11,6 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) set(UnitTestSources UnitTestMain.cpp) set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) file(MAKE_DIRECTORY ${dir}) - set(DeprecatedTests Vector StaticCrsGraph) foreach( Name Bitset @@ -27,15 +26,10 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) ErrorReporter OffsetView ScatterView - StaticCrsGraph WithoutInitializing UnorderedMap - Vector ViewCtorPropEmbeddedDim ) - if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 AND Name IN_LIST DeprecatedTests) - continue() # skip tests for deprecated features if deprecated code 4 is not enabled - endif() # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. 
set(file ${dir}/Test${Tag}_${Name}.cpp) diff --git a/lib/kokkos/containers/unit_tests/TestCreateMirror.cpp b/lib/kokkos/containers/unit_tests/TestCreateMirror.cpp index e40fca4a50d..884f94ec31c 100644 --- a/lib/kokkos/containers/unit_tests/TestCreateMirror.cpp +++ b/lib/kokkos/containers/unit_tests/TestCreateMirror.cpp @@ -35,7 +35,7 @@ void test_create_mirror_properties(const View& view) { using DeviceMemorySpace = typename DefaultExecutionSpace::memory_space; // clang-format off - + // create_mirror // FIXME DynamicView: host_mirror_type is the same type if constexpr (!is_dynamic_view::value) { diff --git a/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp b/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp index f24c14ceb35..2bc2a0c5872 100644 --- a/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp +++ b/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp @@ -207,7 +207,7 @@ constexpr bool test_view_typedefs(ViewParams) { constexpr bool is_host_exec = std::is_same_v; -#if defined(KOKKOS_ENABLE_CUDA_UVM) || defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) constexpr bool has_unified_mem_space = true; #else constexpr bool has_unified_mem_space = false; diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 7adbd021ac4..2a170553cf3 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -695,7 +695,11 @@ class TestDynViewAPI { } static void run_operator_test_rank67() { + // FIXME_CLANG 22 The test triggers an internal compiler error in clang 22. 
+#if !defined(KOKKOS_COMPILER_CLANG) || (KOKKOS_COMPILER_CLANG < 2200) || \ + (KOKKOS_COMPILER_CLANG > 2210) TestViewOperator_LeftAndRight::testit(2, 3, 4, 2, 3, 4, 2); +#endif TestViewOperator_LeftAndRight::testit(2, 3, 4, 2, 3, 4); } @@ -1686,8 +1690,8 @@ class TestDynViewAPI { ASSERT_EQ(ds5.extent(4), ds5plus.extent(4)); ASSERT_EQ(ds5.extent(5), ds5plus.extent(5)); -#if (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_UVM)) && \ - !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) +#if !defined(KOKKOS_ENABLE_CUDA) && !defined(KOKKOS_ENABLE_HIP) && \ + !defined(KOKKOS_ENABLE_SYCL) ASSERT_EQ(&ds5(1, 1, 1, 1, 0) - &ds5plus(1, 1, 1, 1, 0), 0); ASSERT_EQ(&ds5(1, 1, 1, 1, 0, 0) - &ds5plus(1, 1, 1, 1, 0, 0), 0); // passing argument to rank beyond the view's rank is allowed diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index 9fe74ba33b5..02e3f9f1203 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -48,16 +48,6 @@ void test_offsetview_construction() { ASSERT_EQ("firstOV", ov.label()); -#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS - KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - ASSERT_EQ(2u, ov.Rank); -#endif -#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS - KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() -#endif - ASSERT_EQ(2u, ov.rank()); ASSERT_EQ(ov.begin(0), -1); diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index d0fa56d4422..be965ecdafc 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -79,6 +79,8 @@ struct test_scatter_view_impl_cls - -#include - -#define KOKKOS_IMPL_DO_NOT_WARN_INCLUDE_STATIC_CRS_GRAPH -#include -#undef KOKKOS_IMPL_DO_NOT_WARN_INCLUDE_STATIC_CRS_GRAPH -#include -#ifdef 
KOKKOS_ENABLE_EXPERIMENTAL_CXX20_MODULES -import kokkos.core; -#else -#include -#endif - -/*--------------------------------------------------------------------------*/ -namespace Test { -namespace TestStaticCrsGraph { - -template -void run_test_graph() { - using dView = Kokkos::StaticCrsGraph; - using hView = typename dView::host_mirror_type; - - const unsigned LENGTH = 1000; - - std::vector > graph(LENGTH); - - for (size_t i = 0; i < LENGTH; ++i) { - graph[i].reserve(8); - for (size_t j = 0; j < 8; ++j) { - graph[i].push_back(i + j * 3); - } - } - - { - dView d1; - ASSERT_FALSE(d1.is_allocated()); - - d1 = Kokkos::create_staticcrsgraph("d1", graph); - - dView d2(d1); - dView d3(d1.entries, d1.row_map); - - ASSERT_TRUE(d1.is_allocated()); - ASSERT_TRUE(d2.is_allocated()); - ASSERT_TRUE(d3.is_allocated()); - } - - dView dx; - hView hx; - - dx = Kokkos::create_staticcrsgraph("dx", graph); - hx = Kokkos::create_mirror(dx); - - ASSERT_EQ(hx.row_map.extent(0) - 1, LENGTH); - - for (size_t i = 0; i < LENGTH; ++i) { - const size_t begin = hx.row_map[i]; - const size_t n = hx.row_map[i + 1] - begin; - ASSERT_EQ(n, graph[i].size()); - for (size_t j = 0; j < n; ++j) { - ASSERT_EQ((int)hx.entries(j + begin), graph[i][j]); - } - } - - // Test row view access - for (size_t i = 0; i < LENGTH; ++i) { - auto rowView = hx.rowConst(i); - ASSERT_EQ(rowView.length, graph[i].size()); - for (size_t j = 0; j < rowView.length; ++j) { - ASSERT_EQ(rowView.colidx(j), (size_t)graph[i][j]); - ASSERT_EQ(rowView(j), (size_t)graph[i][j]); - } - } -} - -template -void run_test_graph2() { - using dView = Kokkos::StaticCrsGraph; - using hView = typename dView::host_mirror_type; - - const unsigned LENGTH = 10; - - std::vector sizes(LENGTH); - - size_t total_length = 0; - - for (size_t i = 0; i < LENGTH; ++i) { - total_length += (sizes[i] = 6 + i % 4); - } - - dView dx = Kokkos::create_staticcrsgraph("test", sizes); - hView hx = Kokkos::create_mirror(dx); - hView mx = Kokkos::create_mirror(dx); - - 
ASSERT_EQ((size_t)dx.row_map.extent(0), (size_t)LENGTH + 1); - ASSERT_EQ((size_t)hx.row_map.extent(0), (size_t)LENGTH + 1); - ASSERT_EQ((size_t)mx.row_map.extent(0), (size_t)LENGTH + 1); - - ASSERT_EQ((size_t)dx.entries.extent(0), (size_t)total_length); - ASSERT_EQ((size_t)hx.entries.extent(0), (size_t)total_length); - ASSERT_EQ((size_t)mx.entries.extent(0), (size_t)total_length); - - ASSERT_EQ((size_t)dx.entries.extent(1), (size_t)3); - ASSERT_EQ((size_t)hx.entries.extent(1), (size_t)3); - ASSERT_EQ((size_t)mx.entries.extent(1), (size_t)3); - - for (size_t i = 0; i < LENGTH; ++i) { - const size_t entry_begin = hx.row_map[i]; - const size_t entry_end = hx.row_map[i + 1]; - for (size_t j = entry_begin; j < entry_end; ++j) { - hx.entries(j, 0) = j + 1; - hx.entries(j, 1) = j + 2; - hx.entries(j, 2) = j + 3; - } - } - - Kokkos::deep_copy(dx.entries, hx.entries); - Kokkos::deep_copy(mx.entries, dx.entries); - - ASSERT_EQ(mx.row_map.extent(0), (size_t)LENGTH + 1); - - for (size_t i = 0; i < LENGTH; ++i) { - const size_t entry_begin = mx.row_map[i]; - const size_t entry_end = mx.row_map[i + 1]; - ASSERT_EQ((entry_end - entry_begin), sizes[i]); - for (size_t j = entry_begin; j < entry_end; ++j) { - ASSERT_EQ((size_t)mx.entries(j, 0), (j + 1)); - ASSERT_EQ((size_t)mx.entries(j, 1), (j + 2)); - ASSERT_EQ((size_t)mx.entries(j, 2), (j + 3)); - } - } -} - -template -void run_test_graph3(size_t B, size_t N) { - srand(10310); - - using dView = Kokkos::StaticCrsGraph; - using hView = typename dView::host_mirror_type; - - const unsigned LENGTH = 2000; - - std::vector sizes(LENGTH); - - for (size_t i = 0; i < LENGTH; ++i) { - sizes[i] = rand() % 1000; - } - - sizes[1] = N; - sizes[1998] = N; - - int C = 0; - dView dx = Kokkos::create_staticcrsgraph("test", sizes); - dx.create_block_partitioning(B, C); - hView hx = Kokkos::create_mirror(dx); - - for (size_t i = 0; i < B; i++) { - size_t ne = 0; - for (auto j = hx.row_block_offsets(i); j < hx.row_block_offsets(i + 1); j++) - ne += 
hx.row_map(j + 1) - hx.row_map(j) + C; - - ASSERT_FALSE( - (ne > 2 * ((hx.row_map(hx.numRows()) + C * hx.numRows()) / B)) && - (hx.row_block_offsets(i + 1) > hx.row_block_offsets(i) + 1)); - } -} - -template -void run_test_graph4() { - using ordinal_type = unsigned int; - using layout_type = Kokkos::LayoutRight; - using space_type = Space; - using memory_traits_type = Kokkos::MemoryUnmanaged; - using dView = Kokkos::StaticCrsGraph; - using hView = typename dView::host_mirror_type; - - dView dx; - - // StaticCrsGraph with Unmanaged trait will contain row_map and entries - // members with the Unmanaged memory trait. Use of such a StaticCrsGraph - // requires an allocaton of memory for the unmanaged views to wrap. - // - // In this test, a graph (via raw arrays) resides on the host. - // The pointers are wrapped by unmanaged Views. - // To make use of this on the device, managed device Views are created - // (allocation required), and data from the unmanaged host views is deep - // copied to the device Views Unmanaged views of the appropriate type wrap the - // device data and are assigned to their corresponding unmanaged view members - // of the unmanaged StaticCrsGraph - - // Data types for raw pointers storing StaticCrsGraph info - using ptr_row_map_type = typename dView::size_type; - using ptr_entries_type = typename dView::data_type; - - const ordinal_type numRows = 8; - const ordinal_type nnz = 24; - ptr_row_map_type ptrRaw[] = {0, 4, 8, 10, 12, 14, 16, 20, 24}; - ptr_entries_type indRaw[] = {0, 1, 4, 5, 0, 1, 4, 5, 2, 3, 2, 3, - 4, 5, 4, 5, 2, 3, 6, 7, 2, 3, 6, 7}; - - // Wrap pointers in unmanaged host views - using local_row_map_type = typename hView::row_map_type; - using local_entries_type = typename hView::entries_type; - local_row_map_type unman_row_map(&(ptrRaw[0]), numRows + 1); - local_entries_type unman_entries(&(indRaw[0]), nnz); - - hView hx; - hx = hView(unman_entries, unman_row_map); - - // Create the device Views for copying the host arrays into 
- // An allocation is needed on the device for the unmanaged StaticCrsGraph to - // wrap the pointer - using d_row_map_view_type = - typename Kokkos::View; - using d_entries_view_type = - typename Kokkos::View; - - d_row_map_view_type tmp_row_map("tmp_row_map", numRows + 1); - d_entries_view_type tmp_entries("tmp_entries", nnz); - - Kokkos::deep_copy(tmp_row_map, unman_row_map); - Kokkos::deep_copy(tmp_entries, unman_entries); - - // Wrap the pointer in unmanaged View and assign to the corresponding - // StaticCrsGraph member - dx.row_map = typename dView::row_map_type(tmp_row_map.data(), numRows + 1); - dx.entries = typename dView::entries_type(tmp_entries.data(), nnz); - - ASSERT_TRUE((std::is_same_v)); - ASSERT_TRUE((std::is_same_v)); - ASSERT_TRUE((std::is_same_v)); - ASSERT_TRUE((std::is_same_v)); -} - -} /* namespace TestStaticCrsGraph */ - -TEST(TEST_CATEGORY, staticcrsgraph) { - TestStaticCrsGraph::run_test_graph(); - TestStaticCrsGraph::run_test_graph2(); - TestStaticCrsGraph::run_test_graph3(1, 0); - TestStaticCrsGraph::run_test_graph3(1, 1000); - TestStaticCrsGraph::run_test_graph3(1, 10000); - TestStaticCrsGraph::run_test_graph3(1, 100000); - TestStaticCrsGraph::run_test_graph3(3, 0); - TestStaticCrsGraph::run_test_graph3(3, 1000); - TestStaticCrsGraph::run_test_graph3(3, 10000); - TestStaticCrsGraph::run_test_graph3(3, 100000); - TestStaticCrsGraph::run_test_graph3(75, 0); - TestStaticCrsGraph::run_test_graph3(75, 1000); - TestStaticCrsGraph::run_test_graph3(75, 10000); - TestStaticCrsGraph::run_test_graph3(75, 100000); - TestStaticCrsGraph::run_test_graph4(); -} -} // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestVector.hpp b/lib/kokkos/containers/unit_tests/TestVector.hpp deleted file mode 100644 index d5a420e99a1..00000000000 --- a/lib/kokkos/containers/unit_tests/TestVector.hpp +++ /dev/null @@ -1,245 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the 
Kokkos project - -#ifndef KOKKOS_TEST_VECTOR_HPP -#define KOKKOS_TEST_VECTOR_HPP - -#include -#include -#include -#include -#include -KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() -#include - -namespace Test { - -namespace Impl { - -template -struct test_vector_insert { - using scalar_type = Scalar; - using execution_space = Device; - - template - void run_test(Vector& a) { - auto n = a.size(); - - auto it = a.begin(); - if (n > 0) { - ASSERT_EQ(a.data(), &a[0]); - } - it += 15; - ASSERT_EQ(*it, scalar_type(1)); - - auto it_return = a.insert(it, scalar_type(3)); - ASSERT_EQ(a.size(), n + 1); - ASSERT_EQ(std::distance(it_return, a.begin() + 15), 0); - - it = a.begin(); - it += 17; - it_return = a.insert(it, n + 5, scalar_type(5)); - - ASSERT_EQ(a.size(), n + 1 + n + 5); - ASSERT_EQ(std::distance(it_return, a.begin() + 17), 0); - - Vector b; - - b.insert(b.begin(), 7, 9); - ASSERT_EQ(b.size(), 7u); - ASSERT_EQ(b[0], scalar_type(9)); - - it = a.begin(); - it += 27 + n; - it_return = a.insert(it, b.begin(), b.end()); - - ASSERT_EQ(a.size(), n + 1 + n + 5 + 7); - ASSERT_EQ(std::distance(it_return, a.begin() + 27 + n), 0); - - // Testing insert at end via all three function interfaces - a.insert(a.end(), 11); - a.insert(a.end(), 2, 12); - a.insert(a.end(), b.begin(), b.end()); - } - - template - void check_test(Vector& a, int n) { - for (int i = 0; i < (int)a.size(); i++) { - if (i == 15) - ASSERT_EQ(a[i], scalar_type(3)); - else if (i > 16 && i < 16 + 6 + n) - ASSERT_EQ(a[i], scalar_type(5)); - else if (i > 26 + n && i < 34 + n) - ASSERT_EQ(a[i], scalar_type(9)); - else if (i == (int)a.size() - 10) - ASSERT_EQ(a[i], scalar_type(11)); - else if ((i == (int)a.size() - 9) || (i == (int)a.size() - 8)) - ASSERT_EQ(a[i], scalar_type(12)); - else if (i > (int)a.size() - 8) - ASSERT_EQ(a[i], scalar_type(9)); - else - ASSERT_EQ(a[i], scalar_type(1)); - } - } - - test_vector_insert(unsigned int size) { - { - std::vector a(size, scalar_type(1)); - run_test(a); - check_test(a, 
size); - } - { - Kokkos::vector a(size, scalar_type(1)); - a.sync_device(); - run_test(a); - a.sync_host(); - check_test(a, size); - } - { - Kokkos::vector a(size, scalar_type(1)); - a.sync_host(); - run_test(a); - check_test(a, size); - } - { test_vector_insert_into_empty(size); } - } - - void test_vector_insert_into_empty(const size_t size) { - using Vector = Kokkos::vector; - { - Vector a; - Vector b(size); - a.insert(a.begin(), b.begin(), b.end()); - ASSERT_EQ(a.size(), size); - } - - { - Vector c; - c.insert(c.begin(), size, Scalar{}); - ASSERT_EQ(c.size(), size); - } - } -}; - -template -struct test_vector_allocate { - using self_type = test_vector_allocate; - - using scalar_type = Scalar; - using execution_space = Device; - - bool result = false; - - template - Scalar run_me(unsigned int n) { - { - Vector v1; - if (v1.is_allocated() == true) return false; - - v1 = Vector(n, 1); - Vector v2(v1); - Vector v3(n, 1); - - if (v1.is_allocated() == false) return false; - if (v2.is_allocated() == false) return false; - if (v3.is_allocated() == false) return false; - } - return true; - } - - test_vector_allocate(unsigned int size) { - result = run_me >(size); - } -}; - -template -struct test_vector_combinations { - using self_type = test_vector_combinations; - - using scalar_type = Scalar; - using execution_space = Device; - - Scalar reference; - Scalar result; - - template - Scalar run_me(unsigned int n) { - Vector a(n, 1); - - a.push_back(2); - a.resize(n + 4); - a[n + 1] = 3; - a[n + 2] = 4; - a[n + 3] = 5; - - Scalar temp1 = a[2]; - Scalar temp2 = a[n]; - Scalar temp3 = a[n + 1]; - - a.assign(n + 2, -1); - - a[2] = temp1; - a[n] = temp2; - a[n + 1] = temp3; - - Scalar test1 = 0; - for (unsigned int i = 0; i < a.size(); i++) test1 += a[i]; - - a.assign(n + 1, -2); - Scalar test2 = 0; - for (unsigned int i = 0; i < a.size(); i++) test2 += a[i]; - - a.reserve(n + 10); - - Scalar test3 = 0; - for (unsigned int i = 0; i < a.size(); i++) test3 += a[i]; - - return 
(test1 * test2 + test3) * test2 + test1 * test3; - } - - test_vector_combinations(unsigned int size) { - reference = run_me >(size); - result = run_me >(size); - } -}; - -} // namespace Impl - -template -void test_vector_combinations(unsigned int size) { - Impl::test_vector_combinations test(size); - ASSERT_EQ(test.reference, test.result); -} - -template -void test_vector_allocate(unsigned int size) { - Impl::test_vector_allocate test(size); - ASSERT_TRUE(test.result); -} - -TEST(TEST_CATEGORY, vector_combination) { - test_vector_allocate(10); - test_vector_combinations(10); - test_vector_combinations(3057); -} - -TEST(TEST_CATEGORY, vector_insert) { - Impl::test_vector_insert(3057); -} - -// The particular scenario below triggered a bug where empty modified_flags -// would cause resize in push_back to be executed on the device overwriting the -// values that were stored on the host previously. -TEST(TEST_CATEGORY, vector_push_back_default_exec) { - Kokkos::vector V; - V.clear(); - V.push_back(4); - ASSERT_EQ(V[0], 4); - V.push_back(3); - ASSERT_EQ(V[1], 3); - ASSERT_EQ(V[0], 4); -} - -} // namespace Test - -#endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index 77d1074174a..b07ff31e9f1 100644 --- a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -21,19 +21,6 @@ import kokkos.scatter_view; #include <../../core/unit_test/tools/include/ToolTestingUtilities.hpp> -/// Some tests are skipped for @c CudaUVM memory space. -/// @todo To be revised according to the future of @c KOKKOS_ENABLE_CUDA_UVM. 
-///@{ -#ifdef KOKKOS_ENABLE_CUDA -#define GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE \ - if constexpr (std::is_same_v) \ - GTEST_SKIP() << "skipping since CudaUVMSpace requires additional fences"; -#else -#define GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE -#endif -///@} - /// Some tests are skipped for unified memory space #if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) #define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ @@ -149,8 +136,6 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) { } TEST(TEST_CATEGORY, realloc_exec_space_dualview) { - GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE - using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences()); using view_type = Kokkos::DualView; @@ -241,8 +226,6 @@ TEST(TEST_CATEGORY, resize_exec_space_dynrankview) { } TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) { - GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE - // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS if (std::is_same_v) @@ -380,8 +363,6 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) { } TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { - GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE - // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS if (std::is_same_v) @@ -490,8 +471,6 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynrankview_viewctor) { } TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynrankview) { - GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE - using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); @@ -594,8 +573,6 @@ TEST(TEST_CATEGORY, create_mirror_no_init_offsetview_view_ctor) { } TEST(TEST_CATEGORY, create_mirror_view_and_copy_offsetview) { - GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE - using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableFences()); @@ -629,8 +606,6 @@ TEST(TEST_CATEGORY, create_mirror_view_and_copy_offsetview) { 
ASSERT_TRUE(success); } -// FIXME OPENMPTARGET -#ifndef KOKKOS_ENABLE_OPENMPTARGET TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); @@ -666,7 +641,6 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) { } TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) { - GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE using namespace Kokkos::Test::Tools; @@ -707,10 +681,7 @@ TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) { }); ASSERT_TRUE(success); } -#endif -// FIXME OPENMPTARGET -#ifndef KOKKOS_ENABLE_OPENMPTARGET TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview_view_ctor) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); @@ -756,4 +727,3 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview_view_ctor) { }); ASSERT_TRUE(success); } -#endif diff --git a/lib/kokkos/core/CMakeLists.txt b/lib/kokkos/core/CMakeLists.txt index 21f05f62724..e6682f22e2b 100644 --- a/lib/kokkos/core/CMakeLists.txt +++ b/lib/kokkos/core/CMakeLists.txt @@ -2,13 +2,5 @@ if(NOT Kokkos_INSTALL_TESTING) add_subdirectory(src) endif() -function(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) - if(NOT Kokkos_ENABLE_BENCHMARKS) - return() - endif() - - add_subdirectory(${DIR_NAME}) -endfunction() - kokkos_add_test_directories(unit_test) -kokkos_add_benchmark_directory(perf_test) +kokkos_add_benchmark_directories(perf_test) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index d8f72825e78..94597c2457c 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -1,34 +1,11 @@ -# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. 
-# FIXME_OPENACC - temporarily disabled due to unimplemented features -if((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - return() -endif() -if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - return() -endif() - -# all PerformanceTest_* executables are part of regular tests -# TODO: finish converting these into benchmarks (in progress) -if(KOKKOS_ENABLE_TESTS) - if(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) - kokkos_add_executable(PerformanceTest_SharedSpace SOURCES test_sharedSpace.cpp) - endif() - - kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -endif() - -if(NOT Kokkos_ENABLE_BENCHMARKS) - return() -endif() - -# Find or download google/benchmark library -find_package(benchmark QUIET 1.8.3) +# Download google/benchmark if it was not found by find_package if(benchmark_FOUND) message(STATUS "Using google benchmark found in ${benchmark_DIR}") else() message(STATUS "No installed google benchmark found, fetching from GitHub") include(FetchContent) set(BENCHMARK_ENABLE_TESTING OFF) + set(BENCHMARK_ENABLE_INSTALL OFF) list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") FetchContent_Declare( @@ -49,17 +26,26 @@ else() target_compile_options(benchmark_main PRIVATE -w) endif() -function(KOKKOS_ADD_BENCHMARK NAME) - cmake_parse_arguments(BENCHMARK "" "" "SOURCES" ${ARGN}) +# This being a macro allows it to be used in other perf test directories without having to +# use absolute paths for the source files we pass as argument. 
+macro(KOKKOS_ADD_BENCHMARK NAME) + cmake_parse_arguments(BENCHMARK "" "MAIN_FILE" "SOURCES" ${ARGN}) if(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) message(WARNING "Unexpected arguments when adding a benchmark: " ${BENCHMARK_UNPARSED_ARGUMENTS}) endif() set(BENCHMARK_NAME Kokkos_${NAME}) - list(APPEND BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp) + set(CORE_PERFTEST_DIR ${PROJECT_SOURCE_DIR}/core/perf_test) + if(BENCHMARK_MAIN_FILE) + set(BENCHMARK_MAIN ${BENCHMARK_MAIN_FILE}) + else() + set(BENCHMARK_MAIN ${CORE_PERFTEST_DIR}/BenchmarkMain.cpp) + endif() + list(APPEND BENCHMARK_SOURCES ${BENCHMARK_MAIN} ${CORE_PERFTEST_DIR}/Benchmark_Context.cpp) add_executable(${BENCHMARK_NAME} ${BENCHMARK_SOURCES}) target_link_libraries(${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version) + target_include_directories(${BENCHMARK_NAME} PRIVATE ${CORE_PERFTEST_DIR}) target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include) foreach(SOURCE_FILE ${BENCHMARK_SOURCES}) @@ -70,7 +56,25 @@ function(KOKKOS_ADD_BENCHMARK NAME) set(BENCHMARK_ARGS --benchmark_counters_tabular=true --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json) add_test(NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS}) -endfunction() +endmacro() + +# FIXME_OPENACC - temporarily disabled due to unimplemented features +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + return() +endif() +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + return() +endif() + +# all PerformanceTest_* executables are part of regular tests +# TODO: finish converting these into benchmarks (in progress) +if(KOKKOS_ENABLE_TESTS) + if(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) + kokkos_add_executable(PerformanceTest_SharedSpace SOURCES test_sharedSpace.cpp) + endif() + + kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +endif() set(BENCHMARK_SOURCES 
PerfTestGramSchmidt.cpp @@ -80,6 +84,7 @@ set(BENCHMARK_SOURCES PerfTestHexGrad.cpp PerfTest_MallocFree.cpp PerfTest_Stream.cpp + PerfTestMDRange_Stencil.cpp PerfTest_ViewAllocate.cpp PerfTest_ViewCopy_a123.cpp PerfTest_ViewCopy_b123.cpp @@ -116,13 +121,6 @@ set(BENCHMARK_SOURCES PerfTest_ViewResize_Raw.cpp ) -if(Kokkos_ENABLE_OPENMPTARGET) - # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - list(REMOVE_ITEM BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - ) -endif() - kokkos_add_benchmark(PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES}) kokkos_add_benchmark(Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp) @@ -132,6 +130,8 @@ kokkos_add_benchmark( PerfTest_ViewFirstTouch_DeepCopy.cpp ) +kokkos_add_benchmark(PerformanceTest_MDRangePolicy_Stream SOURCES PerfTestMDRange_Stream.cpp) + # FIXME_NVHPC if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) kokkos_add_benchmark(PerformanceTest_Mempool SOURCES test_mempool.cpp) diff --git a/lib/kokkos/core/perf_test/PerfTestDriver.hpp b/lib/kokkos/core/perf_test/PerfTestDriver.hpp deleted file mode 100644 index 025b8f61823..00000000000 --- a/lib/kokkos/core/perf_test/PerfTestDriver.hpp +++ /dev/null @@ -1,388 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#include -#include - -// mfh 06 Jun 2013: This macro doesn't work like one might thing it -// should. It doesn't take the template parameter DeviceType and -// print its actual type name; it just literally prints out -// "DeviceType". I've worked around this below without using the -// macro, so I'm commenting out the macro to avoid compiler complaints -// about an unused macro. 
- -// #define KOKKOS_IMPL_MACRO_TO_STRING( X ) #X -// #define KOKKOS_MACRO_TO_STRING( X ) KOKKOS_IMPL_MACRO_TO_STRING( X ) - -//------------------------------------------------------------------------ - -namespace Test { - -enum { NUMBER_OF_TRIALS = 5 }; - -template -void run_test_mdrange(int exp_beg, int exp_end, const char deviceTypeName[], - int range_offset = 0, int tile_offset = 0) -// exp_beg = 6 => 2^6 = 64 is starting range length -{ -#define MDRANGE_PERFORMANCE_OUTPUT_VERBOSE 0 - - std::string label_mdrange; - label_mdrange.append("\"MDRange< double , "); - label_mdrange.append(deviceTypeName); - label_mdrange.append(" >\""); - - std::string label_range_col2; - label_range_col2.append("\"RangeColTwo< double , "); - label_range_col2.append(deviceTypeName); - label_range_col2.append(" >\""); - - std::string label_range_col_all; - label_range_col_all.append("\"RangeColAll< double , "); - label_range_col_all.append(deviceTypeName); - label_range_col_all.append(" >\""); - - if (std::is_same::value) { - std::cout - << "--------------------------------------------------------------\n" - << "Performance tests for MDRange Layout Right" - << "\n--------------------------------------------------------------" - << std::endl; - } else { - std::cout - << "--------------------------------------------------------------\n" - << "Performance tests for MDRange Layout Left" - << "\n--------------------------------------------------------------" - << std::endl; - } - - for (int i = exp_beg; i < exp_end; ++i) { - const int range_length = (1 << i) + range_offset; - - std::cout - << "\n--------------------------------------------------------------\n" - << "--------------------------------------------------------------\n" - << "MDRange Test: range bounds: " << range_length << " , " - << range_length << " , " << range_length - << "\n--------------------------------------------------------------\n" - << "--------------------------------------------------------------\n"; - // << 
std::endl; - - int t0_min = 0, t1_min = 0, t2_min = 0; - double seconds_min = 0.0; - - // Test 1: The MDRange in full - { - int t0 = 1, t1 = 1, t2 = 1; - int counter = 1; -#if !defined(KOKKOS_ENABLE_CUDA) - int min_bnd = 8; - int tfast = range_length; -#else - int min_bnd = 2; - int tfast = 32; -#endif - while (tfast >= min_bnd) { - int tmid = min_bnd; - while (tmid < tfast) { - t0 = min_bnd; - t1 = tmid; - t2 = tfast; - int t2_rev = min_bnd; - int t1_rev = tmid; - int t0_rev = tfast; - -#if defined(KOKKOS_ENABLE_CUDA) - // Note: Product of tile sizes must be < 1024 for Cuda - if (t0 * t1 * t2 >= 1024) { - printf(" Exceeded Cuda tile limits; onto next range set\n\n"); - break; - } -#endif - - // Run 1 with tiles LayoutRight style - double seconds_1 = 0; - { - seconds_1 = - MultiDimRangePerf3D::test_multi_index(range_length, - range_length, - range_length, - t0, t1, t2); - } - -#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE - std::cout << label_mdrange << " , " << t0 << " , " << t1 << " , " - << t2 << " , " << seconds_1 << std::endl; -#endif - - if (counter == 1) { - seconds_min = seconds_1; - t0_min = t0; - t1_min = t1; - t2_min = t2; - } else { - if (seconds_1 < seconds_min) { - seconds_min = seconds_1; - t0_min = t0; - t1_min = t1; - t2_min = t2; - } - } - - // Run 2 with tiles LayoutLeft style - reverse order of tile dims - double seconds_1rev = 0; - { - seconds_1rev = - MultiDimRangePerf3D::test_multi_index(range_length, - range_length, - range_length, - t0_rev, - t1_rev, - t2_rev); - } - -#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE - std::cout << label_mdrange << " , " << t0_rev << " , " << t1_rev - << " , " << t2_rev << " , " << seconds_1rev << std::endl; -#endif - - if (seconds_1rev < seconds_min) { - seconds_min = seconds_1rev; - t0_min = t0_rev; - t1_min = t1_rev; - t2_min = t2_rev; - } - - ++counter; - tmid <<= 1; - } // end inner while - tfast >>= 1; - } // end outer while - - std::cout - << "\n" - << "--------------------------------------------------------------\n" 
- << label_mdrange << "\n Min values " - << "\n Range length per dim (3D): " << range_length - << "\n TileDims: " << t0_min << " , " << t1_min << " , " << t2_min - << "\n Min time: " << seconds_min - << "\n---------------------------------------------------------------" - << std::endl; - } // end scope - -#if !defined(KOKKOS_ENABLE_CUDA) - double seconds_min_c = 0.0; - int t0c_min = 0, t1c_min = 0, t2c_min = 0; - int counter = 1; - { - int min_bnd = 8; - // Test 1_c: MDRange with 0 for 'inner' tile dim; this case will utilize - // the full span in that direction, should be similar to Collapse<2> - if (std::is_same::value) { - for (unsigned int T0 = min_bnd; - T0 < static_cast(range_length); T0 <<= 1) { - for (unsigned int T1 = min_bnd; - T1 < static_cast(range_length); T1 <<= 1) { - double seconds_c = 0; - { - seconds_c = MultiDimRangePerf3D:: - test_multi_index(range_length, range_length, range_length, T0, - T1, 0); - } - -#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE - std::cout << " MDRange LR with '0' tile - collapse-like \n" - << label_mdrange << " , " << T0 << " , " << T1 << " , " - << range_length << " , " << seconds_c << std::endl; -#endif - - t2c_min = range_length; - if (counter == 1) { - seconds_min_c = seconds_c; - t0c_min = T0; - t1c_min = T1; - } else { - if (seconds_c < seconds_min_c) { - seconds_min_c = seconds_c; - t0c_min = T0; - t1c_min = T1; - } - } - ++counter; - } - } - } else { - for (unsigned int T1 = min_bnd; - T1 <= static_cast(range_length); T1 <<= 1) { - for (unsigned int T2 = min_bnd; - T2 <= static_cast(range_length); T2 <<= 1) { - double seconds_c = 0; - { - seconds_c = MultiDimRangePerf3D:: - test_multi_index(range_length, range_length, range_length, 0, - T1, T2); - } - -#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE - std::cout << " MDRange LL with '0' tile - collapse-like \n" - << label_mdrange << " , " << range_length << " < " << T1 - << " , " << T2 << " , " << seconds_c << std::endl; -#endif - - t0c_min = range_length; - if (counter == 1) { - 
seconds_min_c = seconds_c; - t1c_min = T1; - t2c_min = T2; - } else { - if (seconds_c < seconds_min_c) { - seconds_min_c = seconds_c; - t1c_min = T1; - t2c_min = T2; - } - } - ++counter; - } - } - } - - std::cout - // << - // "--------------------------------------------------------------\n" - << label_mdrange << " Collapse<2> style: " - << "\n Min values " - << "\n Range length per dim (3D): " << range_length - << "\n TileDims: " << t0c_min << " , " << t1c_min << " , " << t2c_min - << "\n Min time: " << seconds_min_c - << "\n---------------------------------------------------------------" - << std::endl; - } // end scope test 2 -#endif - - // Test 2: RangePolicy Collapse2 style - double seconds_2 = 0; - { - seconds_2 = RangePolicyCollapseTwo:: - test_index_collapse_two(range_length, range_length, range_length); - } - std::cout << label_range_col2 << " , " << range_length << " , " << seconds_2 - << std::endl; - - // Test 3: RangePolicy Collapse all style - not necessary, always slow - /* - double seconds_3 = 0; - { seconds_3 = RangePolicyCollapseAll< DeviceType , double , LayoutType - >::test_collapse_all(range_length,range_length,range_length) ; } std::cout - << label_range_col_all - << " , " << range_length - << " , " << seconds_3 - << "\n---------------------------------------------------------------" - << std::endl ; - */ - - // Compare fastest times... 
will never be collapse all so ignore it - // seconds_min = tiled MDRange - // seconds_min_c = collapse<2>-like MDRange (tiledim = span for fast dim) - - // only for non-Cuda, else tile too long seconds_2 = collapse<2>-style - // RangePolicy seconds_3 = collapse<3>-style RangePolicy - -#if !defined(KOKKOS_ENABLE_CUDA) - if (seconds_min < seconds_min_c) { - if (seconds_min < seconds_2) { - std::cout - << "--------------------------------------------------------------" - "\n" - << " Fastest run: MDRange tiled\n" - << " Time: " << seconds_min - << " Difference: " << seconds_2 - seconds_min << " Other times: \n" - << " MDrange collapse-like (tiledim = span on fast dim) type: " - << seconds_min_c << "\n" - << " Collapse2 Range Policy: " << seconds_2 << "\n" - << "\n-------------------------------------------------------------" - "-" - << "\n-------------------------------------------------------------" - "-" - //<< "\n\n" - << std::endl; - } else if (seconds_min > seconds_2) { - std::cout - << " Fastest run: Collapse2 RangePolicy\n" - << " Time: " << seconds_2 - << " Difference: " << seconds_min - seconds_2 << " Other times: \n" - << " MDrange Tiled: " << seconds_min << "\n" - << " MDrange collapse-like (tiledim = span on fast dim) type: " - << seconds_min_c << "\n" - << "\n-------------------------------------------------------------" - "-" - << "\n-------------------------------------------------------------" - "-" - //<< "\n\n" - << std::endl; - } - } else if (seconds_min > seconds_min_c) { - if (seconds_min_c < seconds_2) { - std::cout << "---------------------------------------------------------" - "-----\n" - << " Fastest run: MDRange collapse-like (tiledim = span on " - "fast dim) type\n" - << " Time: " << seconds_min_c - << " Difference: " << seconds_2 - seconds_min_c - << " Other times: \n" - << " MDrange Tiled: " << seconds_min << "\n" - << " Collapse2 Range Policy: " << seconds_2 << "\n" - << "\n-------------------------------------------------------" - 
"-------" - << "\n-------------------------------------------------------" - "-------" - //<< "\n\n" - << std::endl; - } else if (seconds_min_c > seconds_2) { - std::cout - << " Fastest run: Collapse2 RangePolicy\n" - << " Time: " << seconds_2 - << " Difference: " << seconds_min_c - seconds_2 - << " Other times: \n" - << " MDrange Tiled: " << seconds_min << "\n" - << " MDrange collapse-like (tiledim = span on fast dim) type: " - << seconds_min_c << "\n" - << "\n-------------------------------------------------------------" - "-" - << "\n-------------------------------------------------------------" - "-" - //<< "\n\n" - << std::endl; - } - } // end else if -#else - if (seconds_min < seconds_2) { - std::cout - << "--------------------------------------------------------------\n" - << " Fastest run: MDRange tiled\n" - << " Time: " << seconds_min - << " Difference: " << seconds_2 - seconds_min << " Other times: \n" - << " Collapse2 Range Policy: " << seconds_2 << "\n" - << "\n--------------------------------------------------------------" - << "\n--------------------------------------------------------------" - //<< "\n\n" - << std::endl; - } else if (seconds_min > seconds_2) { - std::cout - << " Fastest run: Collapse2 RangePolicy\n" - << " Time: " << seconds_2 - << " Difference: " << seconds_min - seconds_2 << " Other times: \n" - << " MDrange Tiled: " << seconds_min << "\n" - << "\n--------------------------------------------------------------" - << "\n--------------------------------------------------------------" - //<< "\n\n" - << std::endl; - } -#endif - - } // end for - -#undef MDRANGE_PERFORMANCE_OUTPUT_VERBOSE -} - -} // namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTestMDRange.hpp b/lib/kokkos/core/perf_test/PerfTestMDRange.hpp deleted file mode 100644 index b1d3722877e..00000000000 --- a/lib/kokkos/core/perf_test/PerfTestMDRange.hpp +++ /dev/null @@ -1,585 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// 
SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -namespace Test { -template -struct MultiDimRangePerf3D { - using execution_space = DeviceType; - using size_type = typename execution_space::size_type; - - using iterate_type = Kokkos::Iterate; - - using view_type = Kokkos::View; - using host_view_type = typename view_type::host_mirror_type; - - view_type A; - view_type B; - const long irange; - const long jrange; - const long krange; - - MultiDimRangePerf3D(const view_type &A_, const view_type &B_, - const long &irange_, const long &jrange_, - const long &krange_) - : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const long i, const long j, const long k) const { - A(i, j, k) = - 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + - B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + - B(i, j, k)); - } - - struct InitZeroTag {}; - // struct InitViewTag {}; - - struct Init { - Init(const view_type &input_, const long &irange_, const long &jrange_, - const long &krange_) - : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const long i, const long j, const long k) const { - input(i, j, k) = 1.0; - } - - KOKKOS_INLINE_FUNCTION - void operator()(const InitZeroTag &, const long i, const long j, - const long k) const { - input(i, j, k) = 0; - } - - view_type input; - const long irange; - const long jrange; - const long krange; - }; - - static double test_multi_index(const unsigned int icount, - const unsigned int jcount, - const unsigned int kcount, - const unsigned int Ti = 1, - const unsigned int Tj = 1, - const unsigned int Tk = 1, - const long iter = 1) { - // This test performs multidim range over all dims - view_type Atest("Atest", icount, jcount, kcount); - view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); - using FunctorType = - MultiDimRangePerf3D; - - double dt_min = 0; - - // 
LayoutRight - if (std::is_same::value) { - Kokkos::MDRangePolicy< - Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, - execution_space> - policy_initA({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); - Kokkos::MDRangePolicy< - Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, - execution_space> - policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}}, - {{Ti, Tj, Tk}}); - - using MDRangeType = typename Kokkos::MDRangePolicy< - Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, - execution_space>; - using tile_type = typename MDRangeType::tile_type; - using point_type = typename MDRangeType::point_type; - - Kokkos::MDRangePolicy< - Kokkos::Rank<3, iterate_type::Right, iterate_type::Right>, - execution_space> - policy(point_type{{0, 0, 0}}, point_type{{icount, jcount, kcount}}, - tile_type{{Ti, Tj, Tk}}); - - Kokkos::parallel_for(policy_initA, Init(Atest, icount, jcount, kcount)); - execution_space().fence(); - Kokkos::parallel_for(policy_initB, - Init(Btest, icount + 2, jcount + 2, kcount + 2)); - execution_space().fence(); - - for (int i = 0; i < iter; ++i) { - Kokkos::Timer timer; - Kokkos::parallel_for(policy, - FunctorType(Atest, Btest, icount, jcount, kcount)); - execution_space().fence(); - const double dt = timer.seconds(); - if (0 == i) - dt_min = dt; - else - dt_min = dt < dt_min ? 
dt : dt_min; - - // Correctness check - only the first run - if (0 == i) { - long numErrors = 0; - host_view_type Ahost("Ahost", icount, jcount, kcount); - Kokkos::deep_copy(Ahost, Atest); - host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); - Kokkos::deep_copy(Bhost, Btest); - - // On KNL, this may vectorize - add print statement to prevent - // Also, compare against epsilon, as vectorization can change bitwise - // answer - for (long l = 0; l < static_cast(icount); ++l) { - for (long j = 0; j < static_cast(jcount); ++j) { - for (long k = 0; k < static_cast(kcount); ++k) { - ScalarType check = - 0.25 * - (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + - Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + - Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + - Bhost(l, j, k)); - if (Ahost(l, j, k) - check != 0) { - ++numErrors; - std::cout << " Correctness error at index: " << l << "," << j - << "," << k << "\n" - << " multi Ahost = " << Ahost(l, j, k) - << " expected = " << check - << " multi Bhost(ijk) = " << Bhost(l, j, k) - << " multi Bhost(l+1jk) = " << Bhost(l + 1, j, k) - << " multi Bhost(l+2jk) = " << Bhost(l + 2, j, k) - << " multi Bhost(ij+1k) = " << Bhost(l, j + 1, k) - << " multi Bhost(ij+2k) = " << Bhost(l, j + 2, k) - << " multi Bhost(ijk+1) = " << Bhost(l, j, k + 1) - << " multi Bhost(ijk+2) = " << Bhost(l, j, k + 2) - << std::endl; - // exit(-1); - } - } - } - } - if (numErrors != 0) { - std::cout << "LR multi: errors " << numErrors << " range product " - << icount * jcount * kcount << " LL " << jcount * kcount - << " LR " << icount * jcount << std::endl; - } - // else { std::cout << " multi: No errors!" 
<< std::endl; } - } - } // end for - - } - // LayoutLeft - else { - Kokkos::MDRangePolicy< - Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, - execution_space> - policy_initA({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); - Kokkos::MDRangePolicy< - Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, - execution_space> - policy_initB({{0, 0, 0}}, {{icount + 2, jcount + 2, kcount + 2}}, - {{Ti, Tj, Tk}}); - - // using MDRangeType = - // typename Kokkos::MDRangePolicy< - // Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, - // execution_space >; - // using tile_type = typename MDRangeType::tile_type; - // using point_type = typename MDRangeType::point_type; - // MDRangeType policy(point_type{{0,0,0}}, - // point_type{{icount,jcount,kcount}}, - // tile_type{{Ti,Tj,Tk}}); - Kokkos::MDRangePolicy< - Kokkos::Rank<3, iterate_type::Left, iterate_type::Left>, - execution_space> - policy({{0, 0, 0}}, {{icount, jcount, kcount}}, {{Ti, Tj, Tk}}); - - Kokkos::parallel_for(policy_initA, Init(Atest, icount, jcount, kcount)); - execution_space().fence(); - Kokkos::parallel_for(policy_initB, - Init(Btest, icount + 2, jcount + 2, kcount + 2)); - execution_space().fence(); - - for (int i = 0; i < iter; ++i) { - Kokkos::Timer timer; - Kokkos::parallel_for(policy, - FunctorType(Atest, Btest, icount, jcount, kcount)); - execution_space().fence(); - const double dt = timer.seconds(); - if (0 == i) - dt_min = dt; - else - dt_min = dt < dt_min ? 
dt : dt_min; - - // Correctness check - only the first run - if (0 == i) { - long numErrors = 0; - host_view_type Ahost("Ahost", icount, jcount, kcount); - Kokkos::deep_copy(Ahost, Atest); - host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); - Kokkos::deep_copy(Bhost, Btest); - - // On KNL, this may vectorize - add print statement to prevent - // Also, compare against epsilon, as vectorization can change bitwise - // answer - for (long l = 0; l < static_cast(icount); ++l) { - for (long j = 0; j < static_cast(jcount); ++j) { - for (long k = 0; k < static_cast(kcount); ++k) { - ScalarType check = - 0.25 * - (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + - Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + - Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + - Bhost(l, j, k)); - if (Ahost(l, j, k) - check != 0) { - ++numErrors; - std::cout << " Correctness error at index: " << l << "," << j - << "," << k << "\n" - << " multi Ahost = " << Ahost(l, j, k) - << " expected = " << check - << " multi Bhost(ijk) = " << Bhost(l, j, k) - << " multi Bhost(l+1jk) = " << Bhost(l + 1, j, k) - << " multi Bhost(l+2jk) = " << Bhost(l + 2, j, k) - << " multi Bhost(ij+1k) = " << Bhost(l, j + 1, k) - << " multi Bhost(ij+2k) = " << Bhost(l, j + 2, k) - << " multi Bhost(ijk+1) = " << Bhost(l, j, k + 1) - << " multi Bhost(ijk+2) = " << Bhost(l, j, k + 2) - << std::endl; - // exit(-1); - } - } - } - } - if (numErrors != 0) { - std::cout << " LL multi run: errors " << numErrors - << " range product " << icount * jcount * kcount - << " LL " << jcount * kcount << " LR " - << icount * jcount << std::endl; - } - // else { std::cout << " multi: No errors!" 
<< std::endl; } - } - } // end for - } - - return dt_min; - } -}; - -template -struct RangePolicyCollapseTwo { - // RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for - // multi-dim; unroll 2 dims in one-dim - - using execution_space = DeviceType; - using size_type = typename execution_space::size_type; - using layout = TestLayout; - - using iterate_type = Kokkos::Iterate; - - using view_type = Kokkos::View; - using host_view_type = typename view_type::host_mirror_type; - - view_type A; - view_type B; - const long irange; - const long jrange; - const long krange; - - RangePolicyCollapseTwo(view_type &A_, const view_type &B_, - const long &irange_, const long &jrange_, - const long &krange_) - : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const long r) const { - if (std::is_same::value) { - // id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r - // r = j + i*Nj - long i = int(r / jrange); - long j = int(r - i * jrange); - for (int k = 0; k < krange; ++k) { - A(i, j, k) = - 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + - B(i, j + 2, k) + B(i, j + 1, k) + - B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); - } - } else if (std::is_same::value) { - // id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r - // r = j + k*Nj - long k = int(r / jrange); - long j = int(r - k * jrange); - for (int i = 0; i < irange; ++i) { - A(i, j, k) = - 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + - B(i, j + 2, k) + B(i, j + 1, k) + - B(i, j, k + 2) + B(i, j, k + 1) + B(i, j, k)); - } - } - } - - struct Init { - view_type input; - const long irange; - const long jrange; - const long krange; - - Init(const view_type &input_, const long &irange_, const long &jrange_, - const long &krange_) - : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const long r) const { - if (std::is_same::value) { - long i = 
int(r / jrange); - long j = int(r - i * jrange); - for (int k = 0; k < krange; ++k) { - input(i, j, k) = 1; - } - } else if (std::is_same::value) { - long k = int(r / jrange); - long j = int(r - k * jrange); - for (int i = 0; i < irange; ++i) { - input(i, j, k) = 1; - } - } - } - }; - - static double test_index_collapse_two(const unsigned int icount, - const unsigned int jcount, - const unsigned int kcount, - const long iter = 1) { - // This test refers to collapsing two dims while using the RangePolicy - view_type Atest("Atest", icount, jcount, kcount); - view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); - using FunctorType = - RangePolicyCollapseTwo; - - long collapse_index_rangeA = 0; - long collapse_index_rangeB = 0; - if (std::is_same::value) { - collapse_index_rangeA = icount * jcount; - collapse_index_rangeB = (icount + 2) * (jcount + 2); - // std::cout << " LayoutRight " << std::endl; - } else if (std::is_same::value) { - collapse_index_rangeA = kcount * jcount; - collapse_index_rangeB = (kcount + 2) * (jcount + 2); - // std::cout << " LayoutLeft " << std::endl; - } else { - std::cout << " LayoutRight or LayoutLeft required - will pass 0 as " - "range instead " - << std::endl; - exit(-1); - } - - Kokkos::RangePolicy policy(0, (collapse_index_rangeA)); - Kokkos::RangePolicy policy_initB(0, - (collapse_index_rangeB)); - - double dt_min = 0; - - Kokkos::parallel_for(policy, Init(Atest, icount, jcount, kcount)); - execution_space().fence(); - Kokkos::parallel_for(policy_initB, - Init(Btest, icount + 2, jcount + 2, kcount + 2)); - execution_space().fence(); - - for (int i = 0; i < iter; ++i) { - Kokkos::Timer timer; - Kokkos::parallel_for(policy, - FunctorType(Atest, Btest, icount, jcount, kcount)); - execution_space().fence(); - const double dt = timer.seconds(); - if (0 == i) - dt_min = dt; - else - dt_min = dt < dt_min ? 
dt : dt_min; - - // Correctness check - first iteration only - if (0 == i) { - long numErrors = 0; - host_view_type Ahost("Ahost", icount, jcount, kcount); - Kokkos::deep_copy(Ahost, Atest); - host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); - Kokkos::deep_copy(Bhost, Btest); - - // On KNL, this may vectorize - add print statement to prevent - // Also, compare against epsilon, as vectorization can change bitwise - // answer - for (long l = 0; l < static_cast(icount); ++l) { - for (long j = 0; j < static_cast(jcount); ++j) { - for (long k = 0; k < static_cast(kcount); ++k) { - ScalarType check = - 0.25 * (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + - Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + - Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + - Bhost(l, j, k)); - if (Ahost(l, j, k) - check != 0) { - ++numErrors; - std::cout << " Correctness error at index: " << l << "," << j - << "," << k << "\n" - << " flat Ahost = " << Ahost(l, j, k) - << " expected = " << check << std::endl; - // exit(-1); - } - } - } - } - if (numErrors != 0) { - std::cout << " RP collapse2: errors " << numErrors - << " range product " << icount * jcount * kcount << " LL " - << jcount * kcount << " LR " << icount * jcount - << std::endl; - } - // else { std::cout << " RP collapse2: Pass! 
" << std::endl; } - } - } - - return dt_min; - } -}; - -template -struct RangePolicyCollapseAll { - // RangePolicy for 3D range, but will collapse all dims - - using execution_space = DeviceType; - using size_type = typename execution_space::size_type; - using layout = TestLayout; - - using view_type = Kokkos::View; - using host_view_type = typename view_type::host_mirror_type; - - view_type A; - view_type B; - const long irange; - const long jrange; - const long krange; - - RangePolicyCollapseAll(view_type &A_, const view_type &B_, - const long &irange_, const long &jrange_, - const long &krange_) - : A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const long r) const { - if (std::is_same::value) { - long i = int(r / (jrange * krange)); - long j = int((r - i * jrange * krange) / krange); - long k = int(r - i * jrange * krange - j * krange); - A(i, j, k) = - 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + - B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + - B(i, j, k)); - } else if (std::is_same::value) { - long k = int(r / (irange * jrange)); - long j = int((r - k * irange * jrange) / irange); - long i = int(r - k * irange * jrange - j * irange); - A(i, j, k) = - 0.25 * (ScalarType)(B(i + 2, j, k) + B(i + 1, j, k) + B(i, j + 2, k) + - B(i, j + 1, k) + B(i, j, k + 2) + B(i, j, k + 1) + - B(i, j, k)); - } - } - - struct Init { - view_type input; - const long irange; - const long jrange; - const long krange; - - Init(const view_type &input_, const long &irange_, const long &jrange_, - const long &krange_) - : input(input_), irange(irange_), jrange(jrange_), krange(krange_) {} - - KOKKOS_INLINE_FUNCTION - void operator()(const long r) const { - if (std::is_same::value) { - long i = int(r / (jrange * krange)); - long j = int((r - i * jrange * krange) / krange); - long k = int(r - i * jrange * krange - j * krange); - input(i, j, k) = 1; - } else if (std::is_same::value) { - long k = 
int(r / (irange * jrange)); - long j = int((r - k * irange * jrange) / irange); - long i = int(r - k * irange * jrange - j * irange); - input(i, j, k) = 1; - } - } - }; - - static double test_collapse_all(const unsigned int icount, - const unsigned int jcount, - const unsigned int kcount, - const long iter = 1) { - // This test refers to collapsing all dims using the RangePolicy - view_type Atest("Atest", icount, jcount, kcount); - view_type Btest("Btest", icount + 2, jcount + 2, kcount + 2); - using FunctorType = - RangePolicyCollapseAll; - - const long flat_index_range = icount * jcount * kcount; - Kokkos::RangePolicy policy(0, flat_index_range); - Kokkos::RangePolicy policy_initB( - 0, (icount + 2) * (jcount + 2) * (kcount + 2)); - - double dt_min = 0; - - Kokkos::parallel_for(policy, Init(Atest, icount, jcount, kcount)); - execution_space().fence(); - Kokkos::parallel_for(policy_initB, - Init(Btest, icount + 2, jcount + 2, kcount + 2)); - execution_space().fence(); - - for (int i = 0; i < iter; ++i) { - Kokkos::Timer timer; - Kokkos::parallel_for(policy, - FunctorType(Atest, Btest, icount, jcount, kcount)); - execution_space().fence(); - const double dt = timer.seconds(); - if (0 == i) - dt_min = dt; - else - dt_min = dt < dt_min ? 
dt : dt_min; - - // Correctness check - first iteration only - if (0 == i) { - long numErrors = 0; - host_view_type Ahost("Ahost", icount, jcount, kcount); - Kokkos::deep_copy(Ahost, Atest); - host_view_type Bhost("Bhost", icount + 2, jcount + 2, kcount + 2); - Kokkos::deep_copy(Bhost, Btest); - - // On KNL, this may vectorize - add print statement to prevent - // Also, compare against epsilon, as vectorization can change bitwise - // answer - for (long l = 0; l < static_cast(icount); ++l) { - for (long j = 0; j < static_cast(jcount); ++j) { - for (long k = 0; k < static_cast(kcount); ++k) { - ScalarType check = - 0.25 * (ScalarType)(Bhost(l + 2, j, k) + Bhost(l + 1, j, k) + - Bhost(l, j + 2, k) + Bhost(l, j + 1, k) + - Bhost(l, j, k + 2) + Bhost(l, j, k + 1) + - Bhost(l, j, k)); - if (Ahost(l, j, k) - check != 0) { - ++numErrors; - std::cout << " Callapse ALL Correctness error at index: " << l - << "," << j << "," << k << "\n" - << " flat Ahost = " << Ahost(l, j, k) - << " expected = " << check << std::endl; - // exit(-1); - } - } - } - } - if (numErrors != 0) { - std::cout << " RP collapse all: errors " << numErrors - << " range product " << icount * jcount * kcount << " LL " - << jcount * kcount << " LR " << icount * jcount - << std::endl; - } - // else { std::cout << " RP collapse all: Pass! 
" << std::endl; } - } - } - - return dt_min; - } -}; - -} // end namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTestMDRange_Stencil.cpp b/lib/kokkos/core/perf_test/PerfTestMDRange_Stencil.cpp new file mode 100644 index 00000000000..5a5231a9b9e --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestMDRange_Stencil.cpp @@ -0,0 +1,498 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project + +#include +#include +#include +#include + +#include + +#include "PerfTest_Category.hpp" +#include + +namespace Test { + +template +struct LayoutToIterationPattern {}; + +template <> +struct LayoutToIterationPattern { + static constexpr Kokkos::Iterate pattern = Kokkos::Iterate::Right; +}; + +template <> +struct LayoutToIterationPattern { + static constexpr Kokkos::Iterate pattern = Kokkos::Iterate::Left; +}; + +template +void check_computation(const ViewType& A, const ViewType& B) { + int numErrors = 0; + auto Ahost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), A); + auto Bhost = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), B); + + // On KNL, this may vectorize - add print statement to prevent + // Also, compare against epsilon, as vectorization can change bitwise + // answer + ScalarType epsilon = std::numeric_limits::epsilon() * 100; + if constexpr (ViewType::rank == 2) { + const int n0 = Ahost.extent_int(0) - 2, n1 = Ahost.extent_int(1) - 2; + for (int i0 = 1; i0 < n0 + 1; ++i0) { + for (int i1 = 1; i1 < n1 + 1; ++i1) { + ScalarType check = + 0.25 * + (ScalarType)(Bhost(i0 + 1, i1) + Bhost(i0 - 1, i1) + + Bhost(i0, i1 + 1) + Bhost(i0, i1 - 1) + Bhost(i0, i1)); + if (Kokkos::abs(Ahost(i0, i1) - check) > epsilon) { + ++numErrors; + std::cerr << "Correctness error at index: " << i0 << "," << i1 + << ", got " << Ahost(i0, i1) << ", expected " << check + << "\n"; + } + } + } + } else if constexpr (ViewType::rank == 3) { + const int n0 = Ahost.extent_int(0) - 2, n1 = 
Ahost.extent_int(1) - 2, + n2 = Ahost.extent_int(2) - 2; + for (int i0 = 1; i0 < n0 + 1; ++i0) { + for (int i1 = 1; i1 < n1 + 1; ++i1) { + for (int i2 = 1; i2 < n2 + 1; ++i2) { + ScalarType check = + 0.25 * + (ScalarType)(Bhost(i0 + 1, i1, i2) + Bhost(i0 - 1, i1, i2) + + Bhost(i0, i1 + 1, i2) + Bhost(i0, i1 - 1, i2) + + Bhost(i0, i1, i2 + 1) + Bhost(i0, i1, i2 - 1) + + Bhost(i0, i1, i2)); + if (Kokkos::abs(Ahost(i0, i1, i2) - check) > epsilon) { + ++numErrors; + std::cerr << "Correctness error at index: " << i0 << "," << i1 + << "," << i2 << ", got " << Ahost(i0, i1, i2) + << ", expected " << check << "\n"; + } + } + } + } + } else if constexpr (ViewType::rank == 4) { + const int n0 = Ahost.extent_int(0) - 2, n1 = Ahost.extent_int(1) - 2, + n2 = Ahost.extent_int(2) - 2, n3 = Ahost.extent_int(3) - 2; + for (int i0 = 1; i0 < n0 + 1; ++i0) { + for (int i1 = 1; i1 < n1 + 1; ++i1) { + for (int i2 = 1; i2 < n2 + 1; ++i2) { + for (int i3 = 1; i3 < n3 + 1; ++i3) { + ScalarType check = 0.25 * (ScalarType)(Bhost(i0 + 1, i1, i2, i3) + + Bhost(i0 - 1, i1, i2, i3) + + Bhost(i0, i1 + 1, i2, i3) + + Bhost(i0, i1 - 1, i2, i3) + + Bhost(i0, i1, i2 + 1, i3) + + Bhost(i0, i1, i2 - 1, i3) + + Bhost(i0, i1, i2, i3 + 1) + + Bhost(i0, i1, i2, i3 - 1) + + Bhost(i0, i1, i2, i3)); + if (Kokkos::abs(Ahost(i0, i1, i2, i3) - check) > epsilon) { + ++numErrors; + std::cerr << "Correctness error at index: " << i0 << "," << i1 + << "," << i2 << "," << i3 << ", got " + << Ahost(i0, i1, i2, i3) << ", expected " << check + << "\n"; + } + } + } + } + } + if (numErrors != 0) { + std::cerr << "Detected some errors for a run with dimensions " + << Ahost.extent(0); + for (std::size_t i = 1; i < Ahost.rank(); i++) { + std::cerr << "x" << Ahost.extent(i); + } + std::cerr << std::endl; + } + } +} + +template +void bench_mdrange(benchmark::State& state, std::index_sequence) { + using execution_space = typename FunctorType::execution_space; + using view_type = typename FunctorType::view_type; + + Kokkos::Array 
dims, tiles; + for (std::size_t i = 0; i < dims.size(); i++) { + dims[i] = state.range(0); + tiles[i] = state.range(1); + } + + const auto policy = FunctorType::get_policy(dims, tiles); + + bool using_default_tiling = false; + for (std::size_t i = 0; i < tiles.size(); i++) { + state.counters[std::string("tile_") + std::to_string(i)] = tiles[i]; + using_default_tiling |= tiles[i] != state.range(1); + } + state.counters["default_tiling"] = using_default_tiling; + + view_type Atest("Atest", (dims[Idx] + 2)...); + view_type Btest("Btest", (dims[Idx] + 2)...); + + Kokkos::deep_copy(Atest, 1.0); + execution_space().fence(); + Kokkos::deep_copy(Btest, 1.0); + execution_space().fence(); + + for (auto _ : state) { + Kokkos::Timer timer; + Kokkos::parallel_for(policy, FunctorType(Atest, Btest, dims)); + execution_space().fence(); + const double dt = timer.seconds(); + state.SetIterationTime(dt); + } + // Correctness check + check_computation(Atest, Btest); +} + +template +void bench_mdrange(benchmark::State& state) { + bench_mdrange( + state, std::make_index_sequence()); +} + +template +struct add_pointer_n { + using type = typename add_pointer_n::type; +}; + +template +struct add_pointer_n { + using type = T; +}; + +template +using add_pointer_n_t = typename add_pointer_n::type; + +template +struct MDRange { + using execution_space = DeviceType; + using scalar_type = ScalarType; + using size_type = typename execution_space::size_type; + using view_type = Kokkos::View, + TestLayout, DeviceType>; + + static constexpr int dimension = Dimension; + + view_type A; + view_type B; + const Kokkos::Array ranges; + + template + MDRange(const view_type& A_, const view_type& B_, + const Kokkos::Array& dims) + : A(A_), B(B_), ranges(dims) {} + + KOKKOS_INLINE_FUNCTION + void operator()(int i0, int i1) const + requires(dimension == 2) + { + i0++; + i1++; + A(i0, i1) = 0.25 * (ScalarType)(B(i0 + 1, i1) + B(i0 - 1, i1) + + B(i0, i1 + 1) + B(i0, i1 - 1) + B(i0, i1)); + } + + 
KOKKOS_INLINE_FUNCTION + void operator()(int i0, int i1, int i2) const + requires(dimension == 3) + { + i0++; + i1++; + i2++; + A(i0, i1, i2) = 0.25 * (ScalarType)(B(i0 + 1, i1, i2) + B(i0 - 1, i1, i2) + + B(i0, i1 + 1, i2) + B(i0, i1 - 1, i2) + + B(i0, i1, i2 + 1) + B(i0, i1, i2 - 1) + + B(i0, i1, i2)); + } + + KOKKOS_INLINE_FUNCTION + void operator()(int i0, int i1, int i2, int i3) const + requires(dimension == 4) + { + i0++; + i1++; + i2++; + i3++; + A(i0, i1, i2, i3) = + 0.25 * (ScalarType)(B(i0 + 1, i1, i2, i3) + B(i0 - 1, i1, i2, i3) + + B(i0, i1 + 1, i2, i3) + B(i0, i1 - 1, i2, i3) + + B(i0, i1, i2 + 1, i3) + B(i0, i1, i2 - 1, i3) + + B(i0, i1, i2, i3 + 1) + B(i0, i1, i2, i3 - 1) + + B(i0, i1, i2, i3)); + } + + static auto get_policy(const Kokkos::Array& end, + Kokkos::Array& tile) { + constexpr Kokkos::Iterate iteration_pattern = + LayoutToIterationPattern::pattern; + const Kokkos::MDRangePolicy< + Kokkos::Rank, + execution_space> + policy(Kokkos::Array{}, end, tile); + + for (int i = 0; i < dimension; i++) { + tile[i] = policy.m_tile[i]; + } + + return policy; + } +}; + +template +struct CollapseTwo { + // RangePolicy for ND range, but will collapse only 2 dims; unroll 2 dims in + // one-dim + + using execution_space = DeviceType; + using scalar_type = ScalarType; + using size_type = typename execution_space::size_type; + using view_type = Kokkos::View, + TestLayout, DeviceType>; + + static constexpr int dimension = Dimension; + + view_type A; + view_type B; + const Kokkos::Array ranges; + + CollapseTwo(view_type& A_, const view_type& B_, + const Kokkos::Array& dims) + : A(A_), B(B_), ranges(dims) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int r) const + requires(dimension == 3) + { + if constexpr (std::is_same_v) { + const int i0 = r % ranges[0] + 1, i1 = r / ranges[0] + 1; + for (int i2 = 1; i2 < ranges[2] + 1; ++i2) { + A(i0, i1, i2) = + 0.25 * + (ScalarType)(B(i0 + 1, i1, i2) + B(i0 - 1, i1, i2) + + B(i0, i1 + 1, i2) + B(i0, i1 - 1, i2) + + 
B(i0, i1, i2 + 1) + B(i0, i1, i2 - 1) + B(i0, i1, i2)); + } + } else { + const int i2 = r % ranges[2] + 1, i1 = r / ranges[2] + 1; + for (int i0 = 1; i0 < ranges[0] + 1; ++i0) { + A(i0, i1, i2) = + 0.25 * + (ScalarType)(B(i0 + 1, i1, i2) + B(i0 - 1, i1, i2) + + B(i0, i1 + 1, i2) + B(i0, i1 - 1, i2) + + B(i0, i1, i2 + 1) + B(i0, i1, i2 - 1) + B(i0, i1, i2)); + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int r) const + requires(dimension == 4) + { + if constexpr (std::is_same_v) { + const int i0 = r % ranges[0] + 1, i12 = r / ranges[0]; + const int i1 = i12 % ranges[1] + 1, i2 = i12 / ranges[1] + 1; + for (int i3 = 1; i3 < ranges[3] + 1; ++i3) { + A(i0, i1, i2, i3) = + 0.25 * (ScalarType)(B(i0 + 1, i1, i2, i3) + B(i0 - 1, i1, i2, i3) + + B(i0, i1 + 1, i2, i3) + B(i0, i1 - 1, i2, i3) + + B(i0, i1, i2 + 1, i3) + B(i0, i1, i2 - 1, i3) + + B(i0, i1, i2, i3 + 1) + B(i0, i1, i2, i3 - 1) + + B(i0, i1, i2, i3)); + } + } else { + const int i3 = r % ranges[3] + 1, i21 = r / ranges[3]; + const int i2 = i21 % ranges[2] + 1, i1 = i21 / ranges[2] + 1; + for (int i0 = 1; i0 < ranges[0] + 1; ++i0) { + A(i0, i1, i2, i3) = + 0.25 * (ScalarType)(B(i0 + 1, i1, i2, i3) + B(i0 - 1, i1, i2, i3) + + B(i0, i1 + 1, i2, i3) + B(i0, i1 - 1, i2, i3) + + B(i0, i1, i2 + 1, i3) + B(i0, i1, i2 - 1, i3) + + B(i0, i1, i2, i3 + 1) + B(i0, i1, i2, i3 - 1) + + B(i0, i1, i2, i3)); + } + } + } + + static auto get_policy(const Kokkos::Array& dims, + const Kokkos::Array&) { + int collapse_index_rangeA = 0; + if constexpr (std::is_same_v) { + collapse_index_rangeA = std::reduce(Kokkos::begin(dims), + Kokkos::begin(dims) + (dimension - 1), + 1, std::multiplies{}); + } else if constexpr (std::is_same_v) { + collapse_index_rangeA = + std::reduce(Kokkos::begin(dims) + 1, Kokkos::end(dims), 1, + std::multiplies{}); + } else { + static_assert(!(std::is_same_v || + std::is_same_v), + "LayoutRight or LayoutLeft required"); + } + + return Kokkos::RangePolicy(0, collapse_index_rangeA); + } +}; + 
+template +struct CollapseAll { + // RangePolicy for ND range, but will collapse all dims + + using execution_space = DeviceType; + using scalar_type = ScalarType; + using size_type = typename execution_space::size_type; + using view_type = Kokkos::View, + TestLayout, DeviceType>; + + static constexpr int dimension = Dimension; + + view_type A; + view_type B; + const Kokkos::Array ranges; + + template + CollapseAll(view_type& A_, const view_type& B_, + const Kokkos::Array& dims) + : A(A_), B(B_), ranges(dims) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const int r) const + requires(dimension == 2) + { + if constexpr (std::is_same_v) { + const int i0 = r % ranges[0] + 1, i1 = r / ranges[0] + 1; + A(i0, i1) = + 0.25 * (ScalarType)(B(i0 + 1, i1) + B(i0 - 1, i1) + B(i0, i1 + 1) + + B(i0, i1 - 1) + B(i0, i1)); + } else { + const int i1 = r % ranges[1] + 1, i0 = r / ranges[1] + 1; + A(i0, i1) = + 0.25 * (ScalarType)(B(i0 + 1, i1) + B(i0 - 1, i1) + B(i0, i1 + 1) + + B(i0, i1 - 1) + B(i0, i1)); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int r) const + requires(dimension == 3) + { + if constexpr (std::is_same_v) { + const int i0 = r % ranges[0] + 1, i12 = r / ranges[0]; + const int i1 = i12 % ranges[1] + 1, i2 = i12 / ranges[1] + 1; + A(i0, i1, i2) = + 0.25 * + (ScalarType)(B(i0 + 1, i1, i2) + B(i0 - 1, i1, i2) + + B(i0, i1 + 1, i2) + B(i0, i1 - 1, i2) + + B(i0, i1, i2 + 1) + B(i0, i1, i2 - 1) + B(i0, i1, i2)); + } else { + const int i2 = r % ranges[2] + 1, i10 = r / ranges[2]; + const int i1 = i10 % ranges[1] + 1, i0 = i10 / ranges[1] + 1; + A(i0, i1, i2) = + 0.25 * + (ScalarType)(B(i0 + 1, i1, i2) + B(i0 - 1, i1, i2) + + B(i0, i1 + 1, i2) + B(i0, i1 - 1, i2) + + B(i0, i1, i2 + 1) + B(i0, i1, i2 - 1) + B(i0, i1, i2)); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const int r) const + requires(dimension == 4) + { + if constexpr (std::is_same_v) { + const int i0 = r % ranges[0] + 1, i123 = r / ranges[0]; + const int i1 = i123 % ranges[1] + 1, i23 = 
i123 / ranges[1]; + const int i2 = i23 % ranges[2] + 1, i3 = i23 / ranges[2] + 1; + A(i0, i1, i2, i3) = + 0.25 * (ScalarType)(B(i0 + 1, i1, i2, i3) + B(i0 - 1, i1, i2, i3) + + B(i0, i1 + 1, i2, i3) + B(i0, i1 - 1, i2, i3) + + B(i0, i1, i2 + 1, i3) + B(i0, i1, i2 - 1, i3) + + B(i0, i1, i2, i3 + 1) + B(i0, i1, i2, i3 - 1) + + B(i0, i1, i2, i3)); + } else { + const int i3 = r % ranges[3] + 1, i210 = r / ranges[3]; + const int i2 = i210 % ranges[2] + 1, i10 = i210 / ranges[2]; + const int i1 = i10 % ranges[1] + 1, i0 = i10 / ranges[1] + 1; + A(i0, i1, i2, i3) = + 0.25 * (ScalarType)(B(i0 + 1, i1, i2, i3) + B(i0 - 1, i1, i2, i3) + + B(i0, i1 + 1, i2, i3) + B(i0, i1 - 1, i2, i3) + + B(i0, i1, i2 + 1, i3) + B(i0, i1, i2 - 1, i3) + + B(i0, i1, i2, i3 + 1) + B(i0, i1, i2, i3 - 1) + + B(i0, i1, i2, i3)); + } + } + + static auto get_policy(const Kokkos::Array& dims, + const Kokkos::Array&) { + const int flat_index_range = std::reduce( + Kokkos::begin(dims), Kokkos::end(dims), 1, std::multiplies{}); + return Kokkos::RangePolicy(0, flat_index_range); + } +}; + +#if !defined(KOKKOS_ENABLE_BENCHMARKS_HEAVY) +#define MDRANGE_STENCIL_BENCHMARK(functor, dim, layout, sizes, ...) \ + BENCHMARK(bench_mdrange>) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond) \ + ->Name("MDRangeStencil_" #dim "D_" #functor "_" #layout) \ + ->ArgNames({"size", "tile_size"}) \ + ->ArgsProduct({sizes, __VA_ARGS__}) \ + ->Iterations(1); + +MDRANGE_STENCIL_BENCHMARK(MDRange, 2, LayoutRight, {512}, {0}) +MDRANGE_STENCIL_BENCHMARK(MDRange, 3, LayoutLeft, {128}, {0}) +MDRANGE_STENCIL_BENCHMARK(MDRange, 4, LayoutRight, {32}, {0}) +#else +#define MDRANGE_STENCIL_BENCHMARK(functor, dim, layout, sizes, ...) 
\ + BENCHMARK(bench_mdrange>) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond) \ + ->Name("MDRangeStencil_" #dim "D_" #functor "_" #layout) \ + ->ArgNames({"size", "tile_size"}) \ + ->ArgsProduct({sizes, __VA_ARGS__}); + +#define SIZES_2D \ + { 512, 1024, 2048, 4096, 8192 } +#define SIZES_3D \ + { 128, 192, 256, 512 } +#define SIZES_4D \ + { 32, 64, 96 } +MDRANGE_STENCIL_BENCHMARK(MDRange, 2, LayoutRight, SIZES_2D, {0, 1}) +MDRANGE_STENCIL_BENCHMARK(MDRange, 3, LayoutRight, SIZES_3D, {0, 1}) +MDRANGE_STENCIL_BENCHMARK(MDRange, 4, LayoutRight, SIZES_4D, {0, 1}) +MDRANGE_STENCIL_BENCHMARK(MDRange, 2, LayoutLeft, SIZES_2D, {0, 1}) +MDRANGE_STENCIL_BENCHMARK(MDRange, 3, LayoutLeft, SIZES_3D, {0, 1}) +MDRANGE_STENCIL_BENCHMARK(MDRange, 4, LayoutLeft, SIZES_4D, {0, 1}) + +MDRANGE_STENCIL_BENCHMARK(CollapseTwo, 3, LayoutRight, SIZES_3D, {-1}) +MDRANGE_STENCIL_BENCHMARK(CollapseTwo, 3, LayoutLeft, SIZES_3D, {-1}) +MDRANGE_STENCIL_BENCHMARK(CollapseTwo, 4, LayoutRight, SIZES_4D, {-1}) +MDRANGE_STENCIL_BENCHMARK(CollapseTwo, 4, LayoutLeft, SIZES_4D, {-1}) + +MDRANGE_STENCIL_BENCHMARK(CollapseAll, 2, LayoutRight, SIZES_2D, {-1}) +MDRANGE_STENCIL_BENCHMARK(CollapseAll, 2, LayoutLeft, SIZES_2D, {-1}) +MDRANGE_STENCIL_BENCHMARK(CollapseAll, 3, LayoutRight, SIZES_3D, {-1}) +MDRANGE_STENCIL_BENCHMARK(CollapseAll, 3, LayoutLeft, SIZES_3D, {-1}) +MDRANGE_STENCIL_BENCHMARK(CollapseAll, 4, LayoutRight, SIZES_4D, {-1}) +MDRANGE_STENCIL_BENCHMARK(CollapseAll, 4, LayoutLeft, SIZES_4D, {-1}) +#undef SIZES_2D +#undef SIZES_3D +#undef SIZES_4D +#endif + +#undef MDRANGE_STENCIL_BENCHMARK + +} // end namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTestMDRange_Stream.cpp b/lib/kokkos/core/perf_test/PerfTestMDRange_Stream.cpp new file mode 100644 index 00000000000..c53616b205e --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestMDRange_Stream.cpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright Contributors 
to the Kokkos project + +#include "PerfTestMDRange_Stream.hpp" + +namespace Benchmark { + +template +void MDRangePolicy_Copy(benchmark::State& state) { + int N = static_cast(state.range(0)); + + MDStreamTest stream_bench(N); + stream_bench.test_copy(state); +} + +template +void MDRangePolicy_Set(benchmark::State& state) { + int N = static_cast(state.range(0)); + + MDStreamTest stream_bench(N); + stream_bench.test_set(state); +} + +template +void MDRangePolicy_Scale(benchmark::State& state) { + int N = static_cast(state.range(0)); + + MDStreamTest stream_bench(N); + stream_bench.test_scale(state); +} + +template +void MDRangePolicy_Add(benchmark::State& state) { + int N = static_cast(state.range(0)); + + MDStreamTest stream_bench(N); + stream_bench.test_add(state); +} + +template +void MDRangePolicy_Triad(benchmark::State& state) { + int N = static_cast(state.range(0)); + + MDStreamTest stream_bench(N); + stream_bench.test_triad(state); +} + +// Small size for CPU backends, the problem size is computed as N^6 +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) +#define MDRANGE_BENCHMARK_ARG_SIZE 22 +#else +#define MDRANGE_BENCHMARK_ARG_SIZE 16 +#endif + +// Macros to generate benchmarks +#define MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, RANKS) \ + BENCHMARK_TEMPLATE(BENCH_FUNCTION, RANKS) \ + ->Arg(MDRANGE_BENCHMARK_ARG_SIZE) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +#if defined(KOKKOS_ENABLE_BENCHMARKS_HEAVY) + +// Generate benchmarks for ranks 1 to 6 +#define MDRANGE_MAKE_BENCHMARK(BENCH_FUNCTION) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 1) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 2) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 3) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 4) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 5) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 6) + +MDRANGE_MAKE_BENCHMARK(MDRangePolicy_Set) +MDRANGE_MAKE_BENCHMARK(MDRangePolicy_Copy) +MDRANGE_MAKE_BENCHMARK(MDRangePolicy_Scale) 
+MDRANGE_MAKE_BENCHMARK(MDRangePolicy_Add) +MDRANGE_MAKE_BENCHMARK(MDRangePolicy_Triad) + +#else + +// Generate benchmarks for ranks 1, 3 and 6 +#define MDRANGE_MAKE_BENCHMARK(BENCH_FUNCTION) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 1) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 3) \ + MDRANGE_BENCHMARK_ARGS(BENCH_FUNCTION, 6) + +MDRANGE_MAKE_BENCHMARK(MDRangePolicy_Set) +MDRANGE_MAKE_BENCHMARK(MDRangePolicy_Triad) + +#endif // defined(KOKKOS_ENABLE_BENCHMARKS_HEAVY) + +#undef MDRANGE_BENCHMARK_ARG_SIZE +#undef MDRANGE_BENCHMARK_ARGS +#undef MDRANGE_MAKE_BENCHMARK + +} // namespace Benchmark diff --git a/lib/kokkos/core/perf_test/PerfTestMDRange_Stream.hpp b/lib/kokkos/core/perf_test/PerfTestMDRange_Stream.hpp new file mode 100644 index 00000000000..b31316175f1 --- /dev/null +++ b/lib/kokkos/core/perf_test/PerfTestMDRange_Stream.hpp @@ -0,0 +1,242 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project + +#include + +#include + +#include "Benchmark_Context.hpp" +#include "PerfTest_Category.hpp" + +namespace Benchmark { + +struct Tag_Set {}; +struct Tag_Copy {}; +struct Tag_Scale {}; +struct Tag_Add {}; +struct Tag_Triad {}; + +template +int tag_to_data_ratio() { + if constexpr (std::is_same_v) return 1; + if constexpr (std::is_same_v) return 2; + if constexpr (std::is_same_v) return 2; + if constexpr (std::is_same_v) return 3; + if constexpr (std::is_same_v) return 3; + return 0; +} + +template +struct ViewTypeRank { + using type = void; +}; + +template +struct ViewTypeRank<1, ScalarType, Layout, MemorySpace> { + using type = Kokkos::View; +}; + +template +struct ViewTypeRank<2, ScalarType, Layout, MemorySpace> { + using type = Kokkos::View; +}; + +template +struct ViewTypeRank<3, ScalarType, Layout, MemorySpace> { + using type = Kokkos::View; +}; + +template +struct ViewTypeRank<4, ScalarType, Layout, MemorySpace> { + using type = Kokkos::View; +}; + +template +struct ViewTypeRank<5, 
ScalarType, Layout, MemorySpace> { + using type = Kokkos::View; +}; + +template +struct ViewTypeRank<6, ScalarType, Layout, MemorySpace> { + using type = Kokkos::View; +}; + +// Select between MDRangePolicy and RangePolicy +template , typename Tag = void> +struct policy_selector { + using preferred_layout = typename ExecutionSpace::array_layout; + static const Kokkos::Iterate outer_iter = + Kokkos::Impl::layout_iterate_type_selector< + preferred_layout>::outer_iteration_pattern; + static const Kokkos::Iterate inner_iter = + Kokkos::Impl::layout_iterate_type_selector< + preferred_layout>::inner_iteration_pattern; + using type = Kokkos::MDRangePolicy, + IndexType, Tag>; +}; + +// Specialization for 1D RangePolicy +template +struct policy_selector<1, ExecutionSpace, IndexType, Tag> { + using type = Kokkos::RangePolicy; +}; + +// Choose between 1d and Nd bound types +template +struct bound_type_selector { + using type = typename PolicyType::index_type; +}; + +// Specialization for MDRangePolicy +template +struct bound_type_selector> { + using type = typename Kokkos::MDRangePolicy::point_type; +}; + +// Functor for stream test (set, copy, scale, add, triad). +// The problem size is N^6, meaning that each view will have the size of N^6 +// whatever the rank is. 
+template > +struct MDStreamTest { + using scalar_type = ScalarType; + using execution_space = ExecutionSpace; + using memory_space = typename ExecutionSpace::memory_space; + using preferred_layout = typename ExecutionSpace::array_layout; + using view_type = typename ViewTypeRank::type; + using policy_init_type = + typename policy_selector::type; + using bound_type = typename bound_type_selector::type; + + view_type m_view_A; + view_type m_view_B; + view_type m_view_C; + ScalarType m_scalar; + + bound_type m_lower_bounds; + bound_type m_upper_bounds; + + // Functor for initialization + struct Init { + view_type m_tensor; + scalar_type m_value; + + Init(const view_type &tensor, const scalar_type &value) + : m_tensor(tensor), m_value(value) {} + + template + KOKKOS_INLINE_FUNCTION void operator()(Indices... indices) const { + m_tensor(indices...) = m_value; + } + }; + + MDStreamTest(const int N) { + static_assert(Rank >= 1 && Rank <= 6, + "MDStreamTest: Only ranks 1 to 6 supported"); + + m_view_A = create_test_view("MDStreamTest::view_A", N); + m_view_B = create_test_view("MDStreamTest::view_B", N); + m_view_C = create_test_view("MDStreamTest::view_C", N); + m_scalar = static_cast(2.718281828); + + if constexpr (Rank == 1) { + m_lower_bounds = 0; + m_upper_bounds = m_view_A.extent(0); + } else { + for (int i = 0; i < Rank; ++i) { + m_lower_bounds[i] = 0; + m_upper_bounds[i] = m_view_A.extent(i); + } + } + + policy_init_type init_policy(m_lower_bounds, m_upper_bounds); + Kokkos::parallel_for(init_policy, + Init(m_view_A, static_cast(1.0))); + Kokkos::parallel_for(init_policy, + Init(m_view_B, static_cast(2.0))); + Kokkos::parallel_for(init_policy, + Init(m_view_C, static_cast(3.0))); + execution_space().fence(); + } + + // Tagged operator() + template + KOKKOS_INLINE_FUNCTION void operator()(Tag_Set, Args... args) const { + m_view_A(args...) = static_cast(m_scalar); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(Tag_Copy, Args... 
args) const { + m_view_B(args...) = m_view_A(args...); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(Tag_Scale, Args... args) const { + m_view_B(args...) = m_scalar * m_view_A(args...); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(Tag_Add, Args... args) const { + m_view_C(args...) = m_view_A(args...) + m_view_B(args...); + } + + template + KOKKOS_INLINE_FUNCTION void operator()(Tag_Triad, Args... args) const { + m_view_C(args...) = m_view_A(args...) + m_scalar * m_view_B(args...); + } + + // Create test views of size N^6 + view_type create_test_view(const char *name, int dim) { + long N1 = dim; + long N2 = N1 * N1; + long N3 = N2 * N1; + long N6 = N3 * N3; + std::string view_name(name); + if constexpr (Rank == 1) { + return view_type(view_name, N6); + } else if constexpr (Rank == 2) { + return view_type(view_name, N3, N3); + } else if constexpr (Rank == 3) { + return view_type(view_name, N2, N2, N2); + } else if constexpr (Rank == 4) { + return view_type(view_name, N2, N1, N1, N2); + } else if constexpr (Rank == 5) { + return view_type(view_name, N1, N1, N2, N1, N1); + } else if constexpr (Rank == 6) { + return view_type(view_name, N1, N1, N1, N1, N1, N1); + } + } + + template + void run_test(benchmark::State &state) { + using policy_test_type = + typename policy_selector::type; + + policy_test_type compute_policy(m_lower_bounds, m_upper_bounds); + + const int data_ratio = tag_to_data_ratio(); + + for (auto _ : state) { + Kokkos::Timer timer; + Kokkos::parallel_for(compute_policy, *this); + execution_space().fence(); + KokkosBenchmark::report_results(state, m_view_A, data_ratio, + timer.seconds()); + } + } + + void test_set(benchmark::State &state) { run_test(state); } + + void test_copy(benchmark::State &state) { run_test(state); } + + void test_scale(benchmark::State &state) { run_test(state); } + + void test_add(benchmark::State &state) { run_test(state); } + + void test_triad(benchmark::State &state) { run_test(state); } +}; + +} // 
namespace Benchmark diff --git a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp index 2d0f65d2c7b..c1da1470ad1 100644 --- a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp @@ -23,65 +23,44 @@ std::pair custom_reduction_test(int N, int R) { Scalar max; + auto reduction_lambda = KOKKOS_LAMBDA( + const Kokkos::TeamPolicy<>::member_type& team, Scalar& lmax) { + Scalar team_max = Scalar(0); + for (int rr = 0; rr < R; rr++) { + int i = team.league_rank(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, 32), + [&](const int& j, Scalar& thread_max) { + Scalar t_max = Scalar(0); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, 32), + [&](const int& k, Scalar& max_) { + const Scalar val = a((i * 32 + j) * 32 + k); + if (val > max_) max_ = val; + if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5; + }, + Kokkos::Max(t_max)); + if (t_max > thread_max) thread_max = t_max; + }, + Kokkos::Max(team_max)); + } + if (team_max > lmax) lmax = team_max; + }; + int team_size = 32; - if (team_size > Kokkos::DefaultExecutionSpace().concurrency()) - team_size = Kokkos::DefaultExecutionSpace().concurrency(); + Kokkos::Max reducer(max); + // FIXME Use reducer + int const max_team_size = Kokkos::TeamPolicy<>(1, 1).team_size_max( + reduction_lambda, Kokkos::ParallelReduceTag{}); + if (team_size > max_team_size) team_size = max_team_size; // Warm up - Kokkos::parallel_reduce( - Kokkos::TeamPolicy<>(N / 1024, team_size), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team, - Scalar& lmax) { - Scalar team_max = Scalar(0); - for (int rr = 0; rr < R; rr++) { - int i = team.league_rank(); - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, 32), - [&](const int& j, Scalar& thread_max) { - Scalar t_max = Scalar(0); - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, 32), - [&](const int& k, Scalar& max_) { - const Scalar 
val = a((i * 32 + j) * 32 + k); - if (val > max_) max_ = val; - if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5; - }, - Kokkos::Max(t_max)); - if (t_max > thread_max) thread_max = t_max; - }, - Kokkos::Max(team_max)); - } - if (team_max > lmax) lmax = team_max; - }, - Kokkos::Max(max)); + Kokkos::parallel_reduce(Kokkos::TeamPolicy<>(N / 1024, team_size), + reduction_lambda, reducer); // Timing Kokkos::Timer timer; - Kokkos::parallel_reduce( - Kokkos::TeamPolicy<>(N / 1024, team_size), - KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team, - Scalar& lmax) { - Scalar team_max = Scalar(0); - for (int rr = 0; rr < R; rr++) { - int i = team.league_rank(); - Kokkos::parallel_reduce( - Kokkos::TeamThreadRange(team, 32), - [&](const int& j, Scalar& thread_max) { - Scalar t_max = Scalar(0); - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, 32), - [&](const int& k, Scalar& max_) { - const Scalar val = a((i * 32 + j) * 32 + k); - if (val > max_) max_ = val; - if ((k == 11) && (j == 17) && (i == 2)) max_ = 11.5; - }, - Kokkos::Max(t_max)); - if (t_max > thread_max) thread_max = t_max; - }, - Kokkos::Max(team_max)); - } - if (team_max > lmax) lmax = team_max; - }, - Kokkos::Max(max)); + Kokkos::parallel_reduce(Kokkos::TeamPolicy<>(N / 1024, team_size), + reduction_lambda, reducer); return std::make_pair(timer.seconds(), max); } diff --git a/lib/kokkos/core/perf_test/PerfTest_Stream.cpp b/lib/kokkos/core/perf_test/PerfTest_Stream.cpp index 346d85ca70d..204e40e41ee 100644 --- a/lib/kokkos/core/perf_test/PerfTest_Stream.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_Stream.cpp @@ -213,13 +213,17 @@ static void or_skip(benchmark::State& state) { } } -// As of May 2025, 10^8 doubles is larger than caches, but not so large as -// to be inconvenient. Also run 11^8 for a quick check of convergence. +// We choose the case of 9^8 doubles to test with (which allocates ~1 GB), as +// well as a quick test with 10^8 doubles (~2.4 GB) for checking convergence. 
As +// of January 2026, This value is higher than caches but reasonable enough +// to be safely allocated, even on low-memory devices. This needs to be +// (potentially) periodically revisited. + #define STREAM_ARGS(label) \ Name(label) \ ->ArgName("N") \ + ->Arg(9) \ ->Arg(10) \ - ->Arg(11) \ ->Unit(benchmark::kMillisecond) \ ->UseManualTime() diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index 90fd56de803..61383d86cc4 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -18,9 +18,6 @@ if(NOT desul_FOUND) set(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) endif() endif() - if(KOKKOS_ENABLE_OPENMPTARGET) - set(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP - endif() if(KOKKOS_ENABLE_OPENACC) # FIXME_OPENACC FIXME_CLACC - Below condition will be removed if Clacc can compile atomics. if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) @@ -60,11 +57,6 @@ if(KOKKOS_ENABLE_OPENMP) append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) endif() -if(KOKKOS_ENABLE_OPENMPTARGET) - append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) - append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) -endif() - if(KOKKOS_ENABLE_OPENACC) append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) @@ -193,14 +185,6 @@ if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) endif() kokkos_link_tpl(kokkoscore PUBLIC LIBQUADMATH) -# FIXME: We need a proper solution to figure out whether to enable -# libatomic -# Most compilers only require libatomic for 128-bit CAS -# I (CT) had removed 128bit CAS from desul to not need libatomic. 
-if(KOKKOS_ENABLE_OPENMPTARGET) - target_link_libraries(kokkoscore PUBLIC atomic) -endif() - if(desul_FOUND) target_link_libraries(kokkoscore PUBLIC desul_atomics) endif() diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index df693189245..2fa44c175a1 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -80,13 +80,8 @@ class Cuda { //! Tag this class as a kokkos execution space using execution_space = Cuda; -#if defined(KOKKOS_ENABLE_CUDA_UVM) - //! This execution space's preferred memory space. - using memory_space = CudaUVMSpace; -#else //! This execution space's preferred memory space. using memory_space = CudaSpace; -#endif //! This execution space preferred device_type using device_type = Kokkos::Device; @@ -105,19 +100,6 @@ class Cuda { //! \name Functions that all Kokkos devices must implement. //@{ - /// \brief True if and only if this method is being called in a - /// thread-parallel function. - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { -#if defined(__CUDA_ARCH__) - return true; -#else - return false; -#endif - } -#endif - /// \brief Wait until all dispatched functors complete. /// /// The parallel_for or parallel_reduce dispatch of a functor may @@ -130,11 +112,7 @@ class Cuda { "Kokkos::Cuda::fence(): Unnamed Instance Fence") const; /** \brief Return the maximum amount of concurrency. */ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency(); -#else int concurrency() const; -#endif //! Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; @@ -143,22 +121,20 @@ class Cuda { //-------------------------------------------------- //! 
\name Cuda space instances + KOKKOS_DEFAULTED_FUNCTION Cuda(const Cuda&) = default; + KOKKOS_FUNCTION Cuda(Cuda&& other) noexcept + : Cuda(static_cast(other)) {} + KOKKOS_DEFAULTED_FUNCTION Cuda& operator=(const Cuda&) = default; + KOKKOS_FUNCTION Cuda& operator=(Cuda&& other) noexcept { + return *this = static_cast(other); + } + ~Cuda(); Cuda(); explicit Cuda(cudaStream_t stream) : Cuda(stream, Impl::ManageStream::no) {} -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - template - KOKKOS_DEPRECATED_WITH_COMMENT( - "Cuda execution space should be constructed explicitly.") - Cuda(cudaStream_t stream) - : Cuda(stream) {} -#endif - Cuda(cudaStream_t stream, Impl::ManageStream manage_stream); - KOKKOS_DEPRECATED Cuda(cudaStream_t stream, bool manage_stream); - //-------------------------------------------------------------------------- //! Free any resources being consumed by the device. static void impl_finalize(); @@ -166,38 +142,6 @@ class Cuda { //! Initialize, telling the CUDA run-time library which device to use. static void impl_initialize(InitializationSettings const&); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - /// \brief Cuda device architecture of the selected device. - /// - /// This matches the __CUDA_ARCH__ specification. - KOKKOS_DEPRECATED static size_type device_arch() { - const cudaDeviceProp cudaProp = Cuda().cuda_device_prop(); - return cudaProp.major * 100 + cudaProp.minor; - } - - //! Query device count. - KOKKOS_DEPRECATED static size_type detect_device_count() { - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - return count; - } - - /** \brief Detect the available devices and their architecture - * as defined by the __CUDA_ARCH__ specification. 
- */ - KOKKOS_DEPRECATED static std::vector detect_device_arch() { - int count; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceCount(&count)); - std::vector out; - for (int i = 0; i < count; ++i) { - cudaDeviceProp prop; - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaGetDeviceProperties(&prop, i)); - out.push_back(prop.major * 100 + prop.minor); - } - return out; - } -#endif - cudaStream_t cuda_stream() const; int cuda_device() const; const cudaDeviceProp& cuda_device_prop() const; @@ -249,24 +193,6 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - enum : bool { assignable = false }; - enum : bool { accessible = true }; - enum : bool { deepcopy = false }; -}; - -#endif - } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 24f24ae6d11..bd2f104517f 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -34,25 +34,18 @@ cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() { static cudaStream_t s = nullptr; if (s == nullptr) { KOKKOS_IMPL_CUDA_SAFE_CALL( - (CudaInternal::singleton().cuda_stream_create_wrapper(&s))); + (CudaInternal::default_instance->cuda_stream_create_wrapper(&s))); } return s; } -const std::unique_ptr &Kokkos::Impl::cuda_get_deep_copy_space( - bool initialize) { - static std::unique_ptr space = nullptr; - if (!space && initialize) - space = std::make_unique(Kokkos::Impl::cuda_get_deep_copy_stream()); - return space; -} - namespace Kokkos { namespace Impl { void DeepCopyCuda(void *dst, const void *src, size_t n) { - KOKKOS_IMPL_CUDA_SAFE_CALL((CudaInternal::singleton().cuda_memcpy_wrapper( - dst, src, n, cudaMemcpyDefault))); + KOKKOS_IMPL_CUDA_SAFE_CALL( + (CudaInternal::default_instance->cuda_memcpy_wrapper(dst, src, n, + cudaMemcpyDefault))); } void DeepCopyAsyncCuda(const Cuda &instance, void *dst, const void *src, @@ -81,12 +74,6 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { 
namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -bool CudaUVMSpace::available() { return true; } -#endif - -/*--------------------------------------------------------------------------*/ - #ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST // The purpose of the following variable is to allow a state-based choice // for pinning UVM allocations to the CPU. For now this is considered @@ -440,23 +427,28 @@ void CudaHostPinnedSpace::impl_deallocate( namespace Kokkos { namespace Impl { -void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, +void cuda_prefetch_pointer(cudaStream_t stream, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; cudaPointerAttributes attr; - KOKKOS_IMPL_CUDA_SAFE_CALL(( - space.impl_internal_space_instance()->cuda_pointer_get_attributes_wrapper( - &attr, ptr))); + KOKKOS_IMPL_CUDA_SAFE_CALL((cudaPointerGetAttributes(&attr, ptr))); // I measured this and it turns out prefetching towards the host slows // DualView syncs down. Probably because the latency is not too bad in the - // first place for the pull down. If we want to change that provde + // first place for the pull down. 
If we want to change that provide // cudaCpuDeviceId as the device if to_device is false bool is_managed = attr.type == cudaMemoryTypeManaged; + Cuda default_instance; if (to_device && is_managed && - space.cuda_device_prop().concurrentManagedAccess) { + default_instance.cuda_device_prop().concurrentManagedAccess) { + const int dstDevice = default_instance.cuda_device(); +#if CUDART_VERSION >= 13000 + cudaMemLocation loc = {cudaMemLocationTypeDevice, dstDevice}; KOKKOS_IMPL_CUDA_SAFE_CALL( - (space.impl_internal_space_instance()->cuda_mem_prefetch_async_wrapper( - ptr, bytes, space.cuda_device()))); + (cudaMemPrefetchAsync(ptr, bytes, loc, /*flags=*/0, stream))); +#else + KOKKOS_IMPL_CUDA_SAFE_CALL( + (cudaMemPrefetchAsync(ptr, bytes, dstDevice, stream))); +#endif } } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index 25237140c34..9172644c613 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -60,12 +60,6 @@ class CudaSpace { CudaSpace(int device_id, cudaStream_t stream); public: - CudaSpace(CudaSpace&& rhs) = default; - CudaSpace(const CudaSpace& rhs) = default; - CudaSpace& operator=(CudaSpace&& rhs) = default; - CudaSpace& operator=(const CudaSpace& rhs) = default; - ~CudaSpace() = default; - /**\brief Allocate untracked memory in the cuda space */ void* allocate(const Cuda& exec_space, const size_t arg_alloc_size) const; void* allocate(const Cuda& exec_space, const char* arg_label, @@ -146,11 +140,6 @@ class CudaUVMSpace { using device_type = Kokkos::Device; using size_type = unsigned int; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - /** \brief If UVM capability is available */ - KOKKOS_DEPRECATED static bool available(); -#endif - /*--------------------------------*/ /*--------------------------------*/ @@ -161,12 +150,6 @@ class CudaUVMSpace { CudaUVMSpace(int device_id, cudaStream_t stream); public: - CudaUVMSpace(CudaUVMSpace&& rhs) = 
default; - CudaUVMSpace(const CudaUVMSpace& rhs) = default; - CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; - CudaUVMSpace& operator=(const CudaUVMSpace& rhs) = default; - ~CudaUVMSpace() = default; - /**\brief Allocate untracked memory in the cuda space */ template void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { @@ -321,9 +304,6 @@ namespace Impl { cudaStream_t cuda_get_deep_copy_stream(); -const std::unique_ptr& cuda_get_deep_copy_space( - bool initialize = true); - static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); static_assert(Kokkos::Impl::MemorySpaceAccess< diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index fb533d830c1..eb03825e176 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -30,7 +30,9 @@ struct GraphImpl { using execution_space = Kokkos::Cuda; private: - execution_space m_execution_space; + using device_handle_t = Kokkos::Impl::DeviceHandle; + + device_handle_t m_device_handle; cudaGraph_t m_graph = nullptr; cudaGraphExec_t m_graph_exec = nullptr; @@ -46,10 +48,9 @@ struct GraphImpl { void instantiate() { KOKKOS_EXPECTS(!m_graph_exec); KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() + (m_device_handle.m_exec.impl_internal_space_instance() ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph))); KOKKOS_ENSURES(m_graph_exec); - // TODO @graphs print out errors } using root_node_impl_t = @@ -71,31 +72,30 @@ struct GraphImpl { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); + m_device_handle.m_exec.fence( + "Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(bool(m_graph)) if (bool(m_graph_exec)) { KOKKOS_IMPL_CUDA_SAFE_CALL( - 
(m_execution_space.impl_internal_space_instance() + (m_device_handle.m_exec.impl_internal_space_instance() ->cuda_graph_exec_destroy_wrapper(m_graph_exec))); } if (m_graph_owning) { KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() + (m_device_handle.m_exec.impl_internal_space_instance() ->cuda_graph_destroy_wrapper(m_graph))); } } - explicit GraphImpl(Kokkos::Cuda arg_instance) - : m_execution_space(std::move(arg_instance)), m_graph_owning(true) { + explicit GraphImpl(const device_handle_t& device_handle) + : m_device_handle(device_handle), m_graph_owning(true) { KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() + (m_device_handle.m_exec.impl_internal_space_instance() ->cuda_graph_create_wrapper(&m_graph, cuda_graph_flags_t{0}))); } - explicit GraphImpl(Kokkos::Cuda arg_instance, cudaGraph_t graph) - : m_execution_space(std::move(arg_instance)), - m_graph(graph), - m_graph_owning(false) { + explicit GraphImpl(const device_handle_t& device_handle, cudaGraph_t graph) + : m_device_handle(device_handle), m_graph(graph), m_graph_owning(false) { KOKKOS_EXPECTS(graph != nullptr); } @@ -103,7 +103,7 @@ struct GraphImpl { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() + (m_device_handle.m_exec.impl_internal_space_instance() ->cuda_graph_add_empty_node_wrapper( &(arg_node_ptr->node_details_t::node), m_graph, /* dependencies = */ nullptr, @@ -182,7 +182,7 @@ struct GraphImpl { KOKKOS_EXPECTS(bool(cuda_node)) KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() + (m_device_handle.m_exec.impl_internal_space_instance() ->cuda_graph_add_dependencies_wrapper(m_graph, &pred_cuda_node, &cuda_node, 1))); } @@ -196,17 +196,17 @@ struct GraphImpl { m_graph_exec))); } - execution_space const& get_execution_space() const noexcept { - return m_execution_space; + 
device_handle_t const& get_device_handle() const noexcept { + return m_device_handle; } auto create_root_node_ptr() { KOKKOS_EXPECTS(bool(m_graph)) KOKKOS_EXPECTS(!bool(m_graph_exec)) auto rv = std::make_shared( - get_execution_space(), _graph_node_is_root_ctor_tag{}); + m_device_handle, _graph_node_is_root_ctor_tag{}); KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() + (m_device_handle.m_exec.impl_internal_space_instance() ->cuda_graph_add_empty_node_wrapper(&(rv->node_details_t::node), m_graph, /* dependencies = */ nullptr, @@ -223,7 +223,7 @@ struct GraphImpl { // each predecessor ref, so all we need to do here is create the (trivial) // aggregate node. return std::make_shared( - m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_impl_t{}); + m_device_handle, _graph_node_kernel_ctor_tag{}, aggregate_impl_t{}); } cudaGraph_t cuda_graph() { return m_graph; } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_MathematicalFunctions.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_MathematicalFunctions.hpp index d3e3e480bf1..64f7dee1f1a 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_MathematicalFunctions.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half_MathematicalFunctions.hpp @@ -76,6 +76,7 @@ KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(abs, __habs) KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(fabs, __habs) // fmod // remainder +// remquo #if KOKKOS_IMPL_ARCH_NVIDIA_GPU >= 80 KOKKOS_CUDA_HALF_AND_BHALF_BINARY_FUNCTION_IMPL(fmax, __hmax) KOKKOS_CUDA_HALF_AND_BHALF_BINARY_FUNCTION_IMPL(fmin, __hmin) @@ -121,7 +122,17 @@ KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(ceil, hceil) KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(floor, hfloor) KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(trunc, htrunc) // round -KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(nearbyint, hrint) +KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(rint, hrint) +// NOTE Cuda does not provide these functions, but we can exclude domain 
errors, +// as the range of int is enough for any value half_t can take. +// Thus we just cast to the required return type here. +// We are still missing the bhalf_t versions +KOKKOS_INLINE_FUNCTION long impl_lrint(Kokkos::Experimental::half_t x) { + return static_cast(impl_rint(x)); +} +KOKKOS_INLINE_FUNCTION long long impl_llrint(Kokkos::Experimental::half_t x) { + return static_cast(impl_rint(x)); +} // logb // nextafter // copysign @@ -133,6 +144,9 @@ KOKKOS_CUDA_HALF_AND_BHALF_UNARY_PREDICATE_IMPL(isinf, __hisinf) #endif KOKKOS_CUDA_HALF_AND_BHALF_UNARY_PREDICATE_IMPL(isnan, __hisnan) // signbit +// Non-standard functions +KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(rsqrt, hrsqrt) +KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(rcp, hrcp) #undef KOKKOS_CUDA_HALF_AND_BHALF_UNARY_FUNCTION_IMPL #undef KOKKOS_CUDA_HALF_AND_BHALF_BINARY_FUNCTION_IMPL diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index eeeb142c980..3d4c4673efe 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -25,6 +25,7 @@ import kokkos.core; #include #include #include +#include #include #include @@ -222,22 +223,30 @@ void CudaInternal::print_configuration(std::ostream &s) const { //---------------------------------------------------------------------------- CudaInternal::~CudaInternal() { - if (m_scratchSpace || m_scratchFlags || m_scratchUnified) { - std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" - << std::endl; + fence("Kokkos::CudaInternal: fence on destruction"); + + auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); + if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { + auto host_mem_space = + Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); + cuda_mem_space.deallocate(m_scratchFlags, + m_scratchFlagsCount * sizeScratchGrain); + cuda_mem_space.deallocate(m_scratchSpace, + 
m_scratchSpaceCount * sizeScratchGrain); + host_mem_space.deallocate(m_scratchUnified, + m_scratchUnifiedCount * sizeScratchGrain); + if (m_scratchFunctorSize > 0) { + cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); + } } - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchUnifiedCount = 0; - m_scratchSpace = nullptr; - m_scratchFlags = nullptr; - m_scratchUnified = nullptr; - m_stream = nullptr; for (int i = 0; i < m_n_team_scratch; ++i) { - m_team_scratch_current_size[i] = 0; - m_team_scratch_ptr[i] = nullptr; + if (m_team_scratch_current_size[i] > 0) + cuda_mem_space.deallocate(m_team_scratch_ptr[i], + m_team_scratch_current_size[i]); } + + KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_free_wrapper(m_scratch_locks))); } int CudaInternal::verify_is_initialized(const char *const label) const { @@ -249,10 +258,6 @@ int CudaInternal::verify_is_initialized(const char *const label) const { return 0 <= m_cudaDev; } uint32_t CudaInternal::impl_get_instance_id() const { return m_instance_id; } -CudaInternal &CudaInternal::singleton() { - static CudaInternal self; - return self; -} void CudaInternal::fence(const std::string &name) const { Impl::cuda_stream_synchronize(m_stream, this, name); } @@ -260,13 +265,7 @@ void CudaInternal::fence() const { fence("Kokkos::CudaInternal::fence(): Unnamed Instance Fence"); } -void CudaInternal::initialize(cudaStream_t stream) { - KOKKOS_EXPECTS(!is_initialized()); - - if (was_finalized) - Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n"); - was_initialized = true; - +CudaInternal::CudaInternal(cudaStream_t stream) : m_stream(stream) { // Check that the device associated with the stream matches cuda_device CUcontext context; KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuStreamGetCtx(stream, &context))); @@ -274,7 +273,6 @@ void CudaInternal::initialize(cudaStream_t stream) { KOKKOS_IMPL_CUDA_SAFE_CALL(cudaError_t(cuCtxGetDevice(&m_cudaDev))); KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_cudaDev)); 
- m_stream = stream; CudaInternal::cuda_devices.insert(m_cudaDev); // Allocate a staging buffer for constant mem in pinned host memory @@ -309,11 +307,6 @@ void CudaInternal::initialize(cudaStream_t stream) { (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); } - for (int i = 0; i < m_n_team_scratch; ++i) { - m_team_scratch_current_size[i] = 0; - m_team_scratch_ptr[i] = nullptr; - } - m_num_scratch_locks = concurrency(); void *scratch_locks_void_ptr = nullptr; KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_malloc_wrapper( @@ -454,52 +447,6 @@ void CudaInternal::release_team_scratch_space(int scratch_pool_id) { //---------------------------------------------------------------------------- -void CudaInternal::finalize() { - // skip if finalize() has already been called - if (was_finalized) return; - - was_finalized = true; - - auto cuda_mem_space = Kokkos::CudaSpace::impl_create(m_cudaDev, m_stream); - if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { - auto host_mem_space = - Kokkos::CudaHostPinnedSpace::impl_create(m_cudaDev, m_stream); - cuda_mem_space.deallocate(m_scratchFlags, - m_scratchFlagsCount * sizeScratchGrain); - cuda_mem_space.deallocate(m_scratchSpace, - m_scratchSpaceCount * sizeScratchGrain); - host_mem_space.deallocate(m_scratchUnified, - m_scratchUnifiedCount * sizeScratchGrain); - if (m_scratchFunctorSize > 0) { - cuda_mem_space.deallocate(m_scratchFunctor, m_scratchFunctorSize); - } - } - - for (int i = 0; i < m_n_team_scratch; ++i) { - if (m_team_scratch_current_size[i] > 0) - cuda_mem_space.deallocate(m_team_scratch_ptr[i], - m_team_scratch_current_size[i]); - } - - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchUnifiedCount = 0; - m_scratchSpace = nullptr; - m_scratchFlags = nullptr; - m_scratchUnified = nullptr; - for (int i = 0; i < m_n_team_scratch; ++i) { - m_team_scratch_current_size[i] = 0; - m_team_scratch_ptr[i] = nullptr; - } - - KOKKOS_IMPL_CUDA_SAFE_CALL((cuda_free_wrapper(m_scratch_locks))); - 
m_scratch_locks = nullptr; - m_num_scratch_locks = 0; - m_cudaDev = -1; -} - -//---------------------------------------------------------------------------- - Cuda::size_type *cuda_internal_scratch_space(const Cuda &instance, const std::size_t size) { return instance.impl_internal_space_instance()->scratch_space(size); @@ -522,13 +469,7 @@ Cuda::size_type *cuda_internal_scratch_unified(const Cuda &instance, namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int Cuda::concurrency() { -#else -int Cuda::concurrency() const { -#endif - return Impl::CudaInternal::concurrency(); -} +int Cuda::concurrency() const { return Impl::CudaInternal::concurrency(); } void Cuda::impl_initialize(InitializationSettings const &settings) { const std::vector &visible_devices = Impl::get_visible_devices(); @@ -578,33 +519,6 @@ void Cuda::impl_initialize(InitializationSettings const &settings) { //---------------------------------- -#ifdef KOKKOS_ENABLE_CUDA_UVM - const char *env_force_device_alloc = - getenv("CUDA_MANAGED_FORCE_DEVICE_ALLOC"); - bool force_device_alloc; - if (env_force_device_alloc == nullptr) - force_device_alloc = false; - else - force_device_alloc = std::stoi(env_force_device_alloc) != 0; - - const char *env_visible_devices = getenv("CUDA_VISIBLE_DEVICES"); - bool visible_devices_one = true; - if (env_visible_devices == nullptr) visible_devices_one = false; - - if (Kokkos::show_warnings() && - (!visible_devices_one && !force_device_alloc)) { - std::cerr << R"warning( -Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default - without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or - setting CUDA_VISIBLE_DEVICES. 
- This could on multi GPU systems lead to severe performance" - penalties.)warning" - << std::endl; - } -#endif - - //---------------------------------- - #ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY // Check if unified memory is available int cuda_result; @@ -621,14 +535,20 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default //---------------------------------- - cudaStream_t singleton_stream; + cudaStream_t stream; KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id)); - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&stream)); // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME - Impl::CudaInternal::singleton().initialize(singleton_stream); + // Create the default instance. + Impl::CudaInternal::default_instance = Impl::HostSharedPtr( + new Impl::CudaInternal(stream), [](Impl::CudaInternal *ptr) { + cudaStream_t s = ptr->m_stream; + delete ptr; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(s)); + }); } void Cuda::impl_finalize() { @@ -644,41 +564,32 @@ void Cuda::impl_finalize() { Kokkos::Impl::CudaInternal::constantMemReusablePerDevice[cuda_device])); } - auto &deep_copy_space = Impl::cuda_get_deep_copy_space(/*initialize*/ false); - if (deep_copy_space) - deep_copy_space->impl_internal_space_instance()->finalize(); KOKKOS_IMPL_CUDA_SAFE_CALL( cudaStreamDestroy(Impl::cuda_get_deep_copy_stream())); - Impl::CudaInternal::singleton().finalize(); - KOKKOS_IMPL_CUDA_SAFE_CALL( - cudaStreamDestroy(Impl::CudaInternal::singleton().m_stream)); + // Destroy the default instance. 
+ Impl::CudaInternal::default_instance = nullptr; } -Cuda::Cuda() - : m_space_instance(&Impl::CudaInternal::singleton(), - [](Impl::CudaInternal *) {}) { - Impl::CudaInternal::singleton().verify_is_initialized( - "Cuda instance constructor"); -} +Cuda::~Cuda() { Impl::check_execution_space_destructor_precondition(name()); } -KOKKOS_DEPRECATED Cuda::Cuda(cudaStream_t stream, bool manage_stream) - : Cuda(stream, - manage_stream ? Impl::ManageStream::yes : Impl::ManageStream::no) {} +Cuda::Cuda() + : m_space_instance( + (Impl::check_execution_space_constructor_precondition(name()), + Impl::CudaInternal::default_instance)) {} Cuda::Cuda(cudaStream_t stream, Impl::ManageStream manage_stream) : m_space_instance( - new Impl::CudaInternal, [manage_stream](Impl::CudaInternal *ptr) { - ptr->finalize(); - if (static_cast(manage_stream)) { - KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamDestroy(ptr->m_stream)); - } - delete ptr; - }) { - Impl::CudaInternal::singleton().verify_is_initialized( - "Cuda instance constructor"); - m_space_instance->initialize(stream); -} + (Impl::check_execution_space_constructor_precondition(name()), + static_cast(manage_stream) + ? 
Impl::HostSharedPtr(new Impl::CudaInternal(stream), + [](Impl::CudaInternal *ptr) { + cudaStream_t s = ptr->m_stream; + delete ptr; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaStreamDestroy(s)); + }) + : Impl::HostSharedPtr(new Impl::CudaInternal(stream)))) {} void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "Device Execution Space:\n"; @@ -690,12 +601,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "yes\n"; #else os << "no\n"; -#endif - os << " KOKKOS_ENABLE_CUDA_UVM: "; -#ifdef KOKKOS_ENABLE_CUDA_UVM - os << "yes\n"; -#else - os << "no\n"; #endif os << " KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC: "; #ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC @@ -739,6 +644,7 @@ int g_cuda_space_factory_initialized = int CudaInternal::m_cudaArch = -1; KOKKOS_IMPL_EXPORT cudaDeviceProp CudaInternal::m_deviceProp; +HostSharedPtr CudaInternal::default_instance; std::set CudaInternal::cuda_devices = {}; KOKKOS_IMPL_EXPORT std::map CudaInternal::constantMemHostStagingPerDevice = {}; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 4e4b13093cc..b55612c4433 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -5,6 +5,7 @@ #define KOKKOS_CUDA_INSTANCE_HPP_ #include +#include #include #include #include @@ -74,9 +75,6 @@ namespace Kokkos { namespace Impl { class CudaInternal { - private: - CudaInternal(const CudaInternal&); - CudaInternal& operator=(const CudaInternal&); #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION static bool kokkos_impl_cuda_use_serial_execution_v; #endif @@ -90,31 +88,32 @@ class CudaInternal { static int m_cudaArch; static int concurrency(); + static HostSharedPtr default_instance; + KOKKOS_IMPL_EXPORT static cudaDeviceProp m_deviceProp; // Scratch Spaces for Reductions - mutable std::size_t m_scratchSpaceCount; - mutable std::size_t m_scratchFlagsCount; - mutable 
std::size_t m_scratchUnifiedCount; - mutable std::size_t m_scratchFunctorSize; - - mutable size_type* m_scratchSpace; - mutable size_type* m_scratchFlags; - mutable size_type* m_scratchUnified; - mutable size_type* m_scratchFunctor; - cudaStream_t m_stream; - uint32_t m_instance_id; + mutable std::size_t m_scratchSpaceCount = 0; + mutable std::size_t m_scratchFlagsCount = 0; + mutable std::size_t m_scratchUnifiedCount = 0; + mutable std::size_t m_scratchFunctorSize = 0; + + mutable size_type* m_scratchSpace = nullptr; + mutable size_type* m_scratchFlags = nullptr; + mutable size_type* m_scratchUnified = nullptr; + mutable size_type* m_scratchFunctor = nullptr; + cudaStream_t m_stream = nullptr; + uint32_t m_instance_id = + Kokkos::Tools::Experimental::Impl::idForInstance( + reinterpret_cast(this)); // Team Scratch Level 1 Space - int m_n_team_scratch = 10; - mutable int64_t m_team_scratch_current_size[10]; - mutable void* m_team_scratch_ptr[10]; - mutable std::atomic_int m_team_scratch_pool[10]; - int32_t* m_scratch_locks; - size_t m_num_scratch_locks; - - bool was_initialized = false; - bool was_finalized = false; + int m_n_team_scratch = 10; + mutable int64_t m_team_scratch_current_size[10] = {}; + mutable void* m_team_scratch_ptr[10] = {}; + mutable std::atomic_int m_team_scratch_pool[10] = {}; + int32_t* m_scratch_locks = nullptr; + size_t m_num_scratch_locks = 0; static std::set cuda_devices; KOKKOS_IMPL_EXPORT static std::map @@ -123,16 +122,12 @@ class CudaInternal { constantMemReusablePerDevice; KOKKOS_IMPL_EXPORT static std::map constantMemMutexPerDevice; - static CudaInternal& singleton(); - int verify_is_initialized(const char* const label) const; - int is_initialized() const { - return nullptr != m_scratchSpace && nullptr != m_scratchFlags; - } - - void initialize(cudaStream_t stream); - void finalize(); + CudaInternal(cudaStream_t stream); + ~CudaInternal(); + CudaInternal(const CudaInternal&) = delete; + CudaInternal& operator=(const CudaInternal&) = 
delete; void print_configuration(std::ostream&) const; @@ -144,28 +139,6 @@ class CudaInternal { void fence(const std::string&) const; void fence() const; - ~CudaInternal(); - - CudaInternal() - : m_scratchSpaceCount(0), - m_scratchFlagsCount(0), - m_scratchUnifiedCount(0), - m_scratchFunctorSize(0), - m_scratchSpace(nullptr), - m_scratchFlags(nullptr), - m_scratchUnified(nullptr), - m_scratchFunctor(nullptr), - m_stream(nullptr), - m_instance_id( - Kokkos::Tools::Experimental::Impl::idForInstance( - reinterpret_cast(this))) { - for (int i = 0; i < m_n_team_scratch; ++i) { - m_team_scratch_current_size[i] = 0; - m_team_scratch_ptr[i] = nullptr; - m_team_scratch_pool[i] = 0; - } - } - // Using CUDA API function/objects will be w.r.t. device 0 unless // cudaSetDevice(device_id) is called with the correct device_id. // The correct device_id is stored in the variable @@ -275,17 +248,6 @@ class CudaInternal { return cudaMallocHost(ptr, size); } - cudaError_t cuda_mem_prefetch_async_wrapper(const void* devPtr, size_t count, - int dstDevice) const { - set_cuda_device(); -#if CUDART_VERSION >= 13000 - cudaMemLocation loc = {cudaMemLocationTypeDevice, dstDevice}; - return cudaMemPrefetchAsync(devPtr, count, loc, 0, m_stream); -#else - return cudaMemPrefetchAsync(devPtr, count, dstDevice, m_stream); -#endif - } - cudaError_t cuda_memcpy_wrapper(void* dst, const void* src, size_t count, cudaMemcpyKind kind) const { set_cuda_device(); @@ -318,12 +280,6 @@ class CudaInternal { return cudaMemsetAsync(devPtr, value, count, m_stream); } - cudaError_t cuda_pointer_get_attributes_wrapper( - cudaPointerAttributes* attributes, const void* ptr) const { - set_cuda_device(); - return cudaPointerGetAttributes(attributes, ptr); - } - cudaError_t cuda_stream_create_wrapper(cudaStream_t* pStream) const { set_cuda_device(); return cudaStreamCreate(pStream); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 
1182ba92ca0..77006a24afa 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -529,7 +529,8 @@ struct CudaParallelLaunchKernelInvokercuda_memcpy_async_wrapper( driver_ptr, &driver, sizeof(DriverType), cudaMemcpyDefault))); - void const* args[] = {&driver_ptr}; + // NOLINTNEXTLINE(bugprone-multi-level-implicit-pointer-conversion) + void* args[] = {&driver_ptr}; cudaKernelNodeParams params = {}; @@ -538,7 +539,7 @@ struct CudaParallelLaunchKernelInvoker(base_t::get_kernel_func()); - params.kernelParams = const_cast(args); + params.kernelParams = args; params.extra = nullptr; KOKKOS_IMPL_CUDA_SAFE_CALL( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp index 9d9346063f8..07b3e9594a9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -22,16 +22,63 @@ struct default_inner_direction { namespace Impl { +template <> +struct TileSizeRecommended { + template + static auto get(Policy const&) { + constexpr auto InnerDirection = Policy::inner_direction; + constexpr int Rank = Policy::rank; + + using tile_type = typename Policy::tile_type; + + if constexpr (InnerDirection == Iterate::Left) { + if constexpr (Rank == 2) { + return tile_type{64, 4}; + } else if constexpr (Rank == 3) { + return tile_type{32, 2, 4}; + } else if constexpr (Rank == 4) { + return tile_type{16, 2, 2, 4}; + } else if constexpr (Rank == 5) { + return tile_type{16, 2, 4, 2, 1}; + } else if constexpr (Rank == 6) { + return tile_type{8, 4, 2, 2, 2, 1}; + } + tile_type tile_sizes{}; + for (int i = 0; i < Rank; ++i) { + tile_sizes[i] = 2; + } + tile_sizes[0] = 16; + return tile_sizes; + } else { + if constexpr (Rank == 2) { + return tile_type{4, 64}; + } else if constexpr (Rank == 3) { + return tile_type{4, 2, 32}; + } else if constexpr (Rank == 4) { + return tile_type{4, 2, 2, 16}; 
+ } else if constexpr (Rank == 5) { + return tile_type{1, 2, 4, 2, 16}; + } else if constexpr (Rank == 6) { + return tile_type{1, 2, 2, 2, 4, 8}; + } + tile_type tile_sizes{}; + for (int i = 0; i < Rank; ++i) { + tile_sizes[i] = 2; + } + tile_sizes[Rank - 1] = 16; + return tile_sizes; + } + } +}; + // Settings for MDRangePolicy template <> inline TileSizeProperties get_tile_size_properties( const Kokkos::Cuda& space) { TileSizeProperties properties; - const auto& device_prop = space.cuda_device_prop(); - properties.max_threads = device_prop.maxThreadsPerMultiProcessor; - properties.default_largest_tile_size = 16; - properties.default_tile_size = 2; - properties.max_total_tile_size = 512; + const auto& device_prop = space.cuda_device_prop(); + properties.max_threads = device_prop.maxThreadsPerMultiProcessor; + properties.max_total_tile_size = 512; properties.max_threads_dimensions[0] = device_prop.maxThreadsDim[0]; properties.max_threads_dimensions[1] = device_prop.maxThreadsDim[1]; properties.max_threads_dimensions[2] = device_prop.maxThreadsDim[2]; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 11eac4e17cd..43e799eff19 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -8,7 +8,6 @@ #if defined(KOKKOS_ENABLE_CUDA) #include -#include #include @@ -16,9 +15,6 @@ #include #include -#include -#include - #include #include @@ -60,40 +56,47 @@ class ParallelFor, Kokkos::Cuda> { using functor_type = FunctorType; private: - using RP = Policy; using array_index_type = typename Policy::array_index_type; using index_type = typename Policy::index_type; using LaunchBounds = typename Policy::launch_bounds; using MaxGridSize = Kokkos::Array; + using array_type = typename Policy::point_type; const FunctorType m_functor; - const Policy m_rp; + const Policy m_policy; const MaxGridSize m_max_grid_size; + 
array_type m_lower; + array_type m_upper; + array_type m_extent; // tile_size * num_tiles + public: template static int max_tile_size_product(const Policy& pol, const Functor&) { return max_tile_size_product_helper(pol, LaunchBounds{}); } - Policy const& get_policy() const { return m_rp; } + + Policy const& get_policy() const { return m_policy; } + inline __device__ void operator()() const { - Kokkos::Impl::DeviceIterateTile( - m_rp, m_functor, m_max_grid_size) + Kokkos::Impl::DeviceIterate(m_lower, m_upper, + m_extent, m_functor) .exec_range(); } inline void execute() const { - if (m_rp.m_num_tiles == 0) return; + if (m_policy.m_num_tiles == 0) return; // maximum number of threads in each dimension of the block as fetched by // the API [[maybe_unused]] const auto max_threads_dim = - m_rp.space().cuda_device_prop().maxThreadsDim; + m_policy.space().cuda_device_prop().maxThreadsDim; // maximum total number of threads per block as fetched by the API [[maybe_unused]] const auto max_threads_per_block = - m_rp.space().cuda_device_prop().maxThreadsPerBlock; + m_policy.space().cuda_device_prop().maxThreadsPerBlock; // make sure the block dimensions don't exceed the max number of threads // allowed @@ -119,95 +122,44 @@ class ParallelFor, Kokkos::Cuda> { grid.z <= static_cast(m_max_grid_size[2])); }; - dim3 grid(1, 1, 1); - dim3 block(1, 1, 1); - if constexpr (RP::rank == 2) { - // id0 to threadIdx.x; id1 to threadIdx.y - block = dim3(m_rp.m_tile[0], m_rp.m_tile[1], 1); - grid = - dim3(std::min( - (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, - m_max_grid_size[0]), - std::min( - (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, - m_max_grid_size[1]), - 1); - } else if constexpr (RP::rank == 3) { - // id0 to threadIdx.x; id1 to threadIdx.y; id2 to threadIdx.z - block = dim3(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - grid = - dim3(std::min( - (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, - m_max_grid_size[0]), - std::min( - 
(m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, - m_max_grid_size[1]), - std::min( - (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, - m_max_grid_size[2])); - } else if constexpr (RP::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - block = - dim3(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], m_rp.m_tile[3]); - grid = - dim3(std::min( - m_rp.m_tile_end[0] * m_rp.m_tile_end[1], m_max_grid_size[0]), - std::min( - (m_rp.m_upper[2] - m_rp.m_lower[2] + block.y - 1) / block.y, - m_max_grid_size[1]), - std::min( - (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, - m_max_grid_size[2])); - } else if constexpr (RP::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to - // threadIdx.z - block = dim3(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - grid = - dim3(std::min( - m_rp.m_tile_end[0] * m_rp.m_tile_end[1], m_max_grid_size[0]), - std::min( - m_rp.m_tile_end[2] * m_rp.m_tile_end[3], m_max_grid_size[1]), - std::min( - (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, - m_max_grid_size[2])); - } else if constexpr (RP::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4,id5 to - // threadIdx.z - block = - dim3(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); - grid = dim3( - std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], - m_max_grid_size[0]), - std::min(m_rp.m_tile_end[2] * m_rp.m_tile_end[3], - m_max_grid_size[1]), - std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], - m_max_grid_size[2])); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with Cuda\n"); - } + const auto [grid, block] = + Kokkos::Impl::compute_device_launch_params(m_policy, m_max_grid_size); + // ensure we don't exceed the capability of the device check_grid_sizes(grid); check_block_sizes(block); // launch the kernel 
CudaParallelLaunch( - *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); + *this, grid, block, 0, m_policy.space().impl_internal_space_instance()); } // end execute // inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) : m_functor(arg_functor), - m_rp(arg_policy), + m_policy(arg_policy), m_max_grid_size({ static_cast( - m_rp.space().cuda_device_prop().maxGridSize[0]), + m_policy.space().cuda_device_prop().maxGridSize[0]), static_cast( - m_rp.space().cuda_device_prop().maxGridSize[1]), + m_policy.space().cuda_device_prop().maxGridSize[1]), static_cast( - m_rp.space().cuda_device_prop().maxGridSize[2]), - }) {} + m_policy.space().cuda_device_prop().maxGridSize[2]), + }) { + // Initialize begins and ends based on layout + // Swap the fastest indexes to x dimension + for (array_index_type i = 0; i < Policy::rank; ++i) { + if constexpr (Policy::inner_direction == Iterate::Left) { + m_lower[i] = m_policy.m_lower[i]; + m_upper[i] = m_policy.m_upper[i]; + m_extent[i] = m_policy.m_tile[i] * m_policy.m_tile_end[i]; + } else { + m_lower[i] = m_policy.m_lower[Policy::rank - 1 - i]; + m_upper[i] = m_policy.m_upper[Policy::rank - 1 - i]; + m_extent[i] = m_policy.m_tile[Policy::rank - 1 - i] * + m_policy.m_tile_end[Policy::rank - 1 - i]; + } + } + } }; template diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 425de94f66d..8c18284c667 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -51,9 +51,7 @@ class ParallelFor, Kokkos::Cuda> { public: using functor_type = FunctorType; - ParallelFor() = delete; - ParallelFor(const ParallelFor&) = default; - ParallelFor& operator=(const ParallelFor&) = delete; + ParallelFor() = delete; Policy const& get_policy() const { return m_policy; } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp 
b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 9befb1702cd..44bcd5c0b73 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -108,9 +108,19 @@ class TeamPolicyInternal return internal_team_size_max(f); } - template - inline int team_size_max(const FunctorType& f, const ReducerType& /*r*/, - const ParallelReduceTag&) const { + template + inline int team_size_max(const FunctorType& f, const ReducerType& reducer, + const ParallelReduceTag& tag) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + return team_size_max_internal( + f, typename functor_analysis_type::Reducer{reducer}, tag); + } + + template + inline int team_size_max_internal(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { using closure_type = Impl::ParallelReduce, TeamPolicy, Kokkos::Cuda>; @@ -147,9 +157,19 @@ class TeamPolicyInternal return internal_team_size_recommended(f); } - template - int team_size_recommended(const FunctorType& f, const ReducerType&, - const ParallelReduceTag&) const { + template + int team_size_recommended(const FunctorType& f, const ReducerType& reducer, + const ParallelReduceTag& tag) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + return team_size_recommended_internal( + f, typename functor_analysis_type::Reducer{reducer}, tag); + } + + template + int team_size_recommended_internal(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { using closure_type = Impl::ParallelReduce, TeamPolicy, Kokkos::Cuda>; @@ -241,10 +261,13 @@ class TeamPolicyInternal // Make sure total block size is permissible if (m_team_size * m_vector_length > - int(Impl::CudaTraits::MaxHierarchicalParallelism)) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< Cuda > the team size is too large. 
" - "Team size x vector length must be smaller than 1024.")); + static_cast(Impl::CudaTraits::MaxHierarchicalParallelism)) { + std::stringstream error; + error << "Kokkos::TeamPolicy: Requested too large team size. " + "Requested: " + << m_team_size << ", Maximum: " + << Impl::CudaTraits::MaxHierarchicalParallelism / m_vector_length; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } } @@ -510,6 +533,12 @@ class ParallelFor, *this, grid, block, shmem_size_total, m_policy.space() .impl_internal_space_instance()); // copy to device and execute + + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } } ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) @@ -557,22 +586,30 @@ class ParallelFor, m_policy.space().cuda_device_prop().sharedMemPerBlock; const int shmem_size_total = m_shmem_begin + m_shmem_size; if (maxShmemPerBlock < shmem_size_total) { - printf("%i %i\n", maxShmemPerBlock, shmem_size_total); - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory")); + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch memory " + "on level 0. Requested: " + << m_shmem_size + << ", Maximum: " << maxShmemPerBlock - m_shmem_begin; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } - if (m_team_size > arg_policy.team_size_max(arg_functor, ParallelForTag())) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< Cuda > requested too large team size.")); + if (m_scratch_size[1] > static_cast(m_policy.scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch memory " + "on level 1. 
Requested: " + << m_scratch_size[1] + << ", Maximum: " << m_policy.scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } - } - ~ParallelFor() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); + if (m_team_size > arg_policy.team_size_max(arg_functor, ParallelForTag())) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too large team size. " + "Requested: " + << m_team_size << ", Maximum: " + << arg_policy.team_size_max(arg_functor, ParallelForTag()); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } } }; @@ -855,6 +892,12 @@ class ParallelReduce= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } } template @@ -883,7 +926,7 @@ class ParallelReduce requested too much " - "L0 scratch memory")); + std::stringstream error; + error + << "Kokkos::parallel_reduce: Requested too much scratch memory " + "on level 0. Requested: " + << m_shmem_size + << ", Maximum: " << maxShmemPerBlock - m_shmem_begin - m_team_begin; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } - if (int(m_team_size) > - arg_policy.team_size_max(m_functor_reducer.get_functor(), - m_functor_reducer.get_reducer(), - ParallelReduceTag())) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too " - "large team size.")); + if (m_scratch_size[1] > static_cast(m_policy.scratch_size_max(1))) { + std::stringstream error; + error + << "Kokkos::parallel_reduce: Requested too much scratch memory " + "on level 1. 
Requested: " + << m_scratch_size[1] << ", Maximum: " << m_policy.scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } - } - ~ParallelReduce() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); + if (m_team_size > + arg_policy.team_size_max_internal(m_functor_reducer.get_functor(), + m_functor_reducer.get_reducer(), + ParallelReduceTag())) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too large team size. " + "Requested: " + << m_team_size << ", Maximum: " + << arg_policy.team_size_max_internal( + m_functor_reducer.get_functor(), + m_functor_reducer.get_reducer(), ParallelReduceTag()); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } } }; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index fd86216105c..217a013d453 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -788,7 +788,7 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< * less than N) and a scan operation is performed. The last call to closure has * final == true. */ -// This is the same code as in HIP and largely the same as in OpenMPTarget +// This is the same code as in HIP. 
template KOKKOS_INLINE_FUNCTION void parallel_scan( const Impl::TeamThreadRangeBoundariesStruct& diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index a7e13528574..6c3185c40ae 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -46,18 +46,6 @@ class UniqueToken { } public: - KOKKOS_DEFAULTED_FUNCTION - UniqueToken(const UniqueToken&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken(UniqueToken&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken& operator=(const UniqueToken&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken& operator=(UniqueToken&&) = default; - /// \brief upper bound for acquired values, i.e. 0 <= value < size() KOKKOS_INLINE_FUNCTION size_type size() const noexcept { return m_locks.extent(0); } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp index 0485599bc65..4aa143fc5e3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -15,6 +15,7 @@ import kokkos.core; #include #include +#include #include #include @@ -22,15 +23,21 @@ import kokkos.core; #include +namespace { + +struct { + void operator()(Kokkos::Impl::HIPInternal* ptr) const { + hipStream_t stream = ptr->m_stream; + delete ptr; + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(stream)); + } +} customDeleterManagesStream; + +} // namespace + namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int HIP::concurrency() { -#else -int HIP::concurrency() const { -#endif - return Impl::HIPInternal::concurrency(); -} +int HIP::concurrency() const { return Impl::HIPInternal::concurrency(); } void HIP::impl_initialize(InitializationSettings const& settings) { const std::vector& visible_devices = Impl::get_visible_devices(); @@ -94,13 +101,11 @@ Kokkos::HIP::initialize WARNING: Could not determine that xnack is enabled. 
// Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME - // Set singleton device id - Impl::HIPInternal::singleton().m_hipDev = hip_device_id; - - // Create the singleton stream and initialize singleton instance. - hipStream_t singleton_stream; - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&singleton_stream)); - Impl::HIPInternal::singleton().initialize(singleton_stream); + // Create the default instance. + hipStream_t stream; + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamCreate(&stream)); + Impl::HIPInternal::default_instance = Impl::HostSharedPtr( + new Impl::HIPInternal(stream), customDeleterManagesStream); } void HIP::impl_finalize() { @@ -118,36 +123,24 @@ void HIP::impl_finalize() { lock.finalize(); } - Impl::HIPInternal::singleton().finalize(); - - KOKKOS_IMPL_HIP_SAFE_CALL( - hipStreamDestroy(Impl::HIPInternal::singleton().m_stream)); + // Destroy the default instance. + Impl::HIPInternal::default_instance = nullptr; } +HIP::~HIP() { Impl::check_execution_space_destructor_precondition(name()); } + HIP::HIP() - : m_space_instance(&Impl::HIPInternal::singleton(), - [](Impl::HIPInternal*) {}) { - Impl::HIPInternal::singleton().verify_is_initialized( - "HIP instance constructor"); -} + : m_space_instance( + (Impl::check_execution_space_constructor_precondition(name()), + Impl::HIPInternal::default_instance)) {} HIP::HIP(hipStream_t const stream, Impl::ManageStream manage_stream) : m_space_instance( - new Impl::HIPInternal, [manage_stream](Impl::HIPInternal* ptr) { - ptr->finalize(); - if (static_cast(manage_stream)) { - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamDestroy(ptr->m_stream)); - } - delete ptr; - }) { - Impl::HIPInternal::singleton().verify_is_initialized( - "HIP instance constructor"); - m_space_instance->initialize(stream); -} - -KOKKOS_DEPRECATED HIP::HIP(hipStream_t const stream, bool manage_stream) - : HIP(stream, - manage_stream ? 
Impl::ManageStream::yes : Impl::ManageStream::no) {} + (Impl::check_execution_space_constructor_precondition(name()), + static_cast(manage_stream) + ? Impl::HostSharedPtr(new Impl::HIPInternal(stream), + customDeleterManagesStream) + : Impl::HostSharedPtr(new Impl::HIPInternal(stream)))) {} void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Device Execution Space:\n"; @@ -196,7 +189,7 @@ hipStream_t HIP::hip_stream() const { return m_space_instance->m_stream; } int HIP::hip_device() const { return impl_internal_space_instance()->m_hipDev; } hipDeviceProp_t const& HIP::hip_device_prop() { - return Impl::HIPInternal::singleton().m_deviceProp; + return Impl::HIPInternal::default_instance->m_deviceProp; } const char* HIP::name() { return "HIP"; } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp index 80ce8625984..7d1a8eb99e8 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP.hpp @@ -34,37 +34,25 @@ class HIP { using scratch_memory_space = ScratchMemorySpace; + KOKKOS_DEFAULTED_FUNCTION HIP(const HIP&) = default; + KOKKOS_FUNCTION HIP(HIP&& other) noexcept + : HIP(static_cast(other)) {} + KOKKOS_DEFAULTED_FUNCTION HIP& operator=(const HIP&) = default; + KOKKOS_FUNCTION HIP& operator=(HIP&& other) noexcept { + return *this = static_cast(other); + } + ~HIP(); HIP(); explicit HIP(hipStream_t stream) : HIP(stream, Impl::ManageStream::no) {} -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - template - KOKKOS_DEPRECATED_WITH_COMMENT( - "HIP execution space should be constructed explicitly.") - HIP(hipStream_t stream) - : HIP(stream) {} -#endif - HIP(hipStream_t stream, Impl::ManageStream manage_stream); - KOKKOS_DEPRECATED HIP(hipStream_t stream, bool manage_stream); - //@} //------------------------------------ //! \name Functions that all Kokkos devices must implement. 
//@{ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { -#if defined(__HIP_DEVICE_COMPILE__) - return true; -#else - return false; -#endif - } -#endif - /** \brief Wait until all dispatched functors complete. * * The parallel_for or parallel_reduce dispatch of a functor may return @@ -92,19 +80,8 @@ class HIP { static void impl_initialize(InitializationSettings const&); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static size_type detect_device_count() { - int count; - KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); - return count; - } -#endif - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency(); -#else int concurrency() const; -#endif + static const char* name(); inline Impl::HIPInternal* impl_internal_space_instance() const { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index 27e347f71b7..5eab26595bb 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -28,6 +28,8 @@ class GraphImpl { GraphNodeImpl; + using device_handle_t = Kokkos::Impl::DeviceHandle; + // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. 
GraphImpl() = delete; @@ -38,9 +40,9 @@ class GraphImpl { ~GraphImpl(); - explicit GraphImpl(Kokkos::HIP instance); + explicit GraphImpl(const device_handle_t& device_handle); - GraphImpl(Kokkos::HIP instance, hipGraph_t graph); + GraphImpl(const device_handle_t& device_handle, hipGraph_t graph); void add_node(std::shared_ptr const& arg_node_ptr); @@ -64,7 +66,7 @@ class GraphImpl { void submit(const Kokkos::HIP& exec); - Kokkos::HIP const& get_execution_space() const noexcept; + auto get_device_handle() const noexcept -> device_handle_t const&; auto create_root_node_ptr(); @@ -74,7 +76,7 @@ class GraphImpl { void instantiate() { KOKKOS_EXPECTS(!m_graph_exec); KOKKOS_IMPL_HIP_SAFE_CALL( - m_execution_space.impl_internal_space_instance() + m_device_handle.m_exec.impl_internal_space_instance() ->hip_graph_instantiate_wrapper(&m_graph_exec, m_graph, nullptr, nullptr, 0)); KOKKOS_ENSURES(m_graph_exec); @@ -84,7 +86,7 @@ class GraphImpl { hipGraphExec_t hip_graph_exec() { return m_graph_exec; } private: - Kokkos::HIP m_execution_space; + device_handle_t m_device_handle; hipGraph_t m_graph = nullptr; hipGraphExec_t m_graph_exec = nullptr; @@ -94,29 +96,31 @@ class GraphImpl { }; inline GraphImpl::~GraphImpl() { - m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); + m_device_handle.m_exec.fence( + "Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); KOKKOS_EXPECTS(m_graph); if (m_graph_exec) { KOKKOS_IMPL_HIP_SAFE_CALL( - m_execution_space.impl_internal_space_instance() + m_device_handle.m_exec.impl_internal_space_instance() ->hip_graph_exec_destroy_wrapper(m_graph_exec)); } if (m_graph_owning) { - KOKKOS_IMPL_HIP_SAFE_CALL(m_execution_space.impl_internal_space_instance() - ->hip_graph_destroy_wrapper(m_graph)); + KOKKOS_IMPL_HIP_SAFE_CALL( + m_device_handle.m_exec.impl_internal_space_instance() + ->hip_graph_destroy_wrapper(m_graph)); } } -inline GraphImpl::GraphImpl(Kokkos::HIP instance) - : m_execution_space(std::move(instance)), 
m_graph_owning(true) { - KOKKOS_IMPL_HIP_SAFE_CALL(m_execution_space.impl_internal_space_instance() - ->hip_graph_create_wrapper(&m_graph, 0)); +inline GraphImpl::GraphImpl(const device_handle_t& device_handle) + : m_device_handle(device_handle), m_graph_owning(true) { + KOKKOS_IMPL_HIP_SAFE_CALL( + m_device_handle.m_exec.impl_internal_space_instance() + ->hip_graph_create_wrapper(&m_graph, 0)); } -inline GraphImpl::GraphImpl(Kokkos::HIP instance, hipGraph_t graph) - : m_execution_space(std::move(instance)), - m_graph(graph), - m_graph_owning(false) { +inline GraphImpl::GraphImpl(const device_handle_t& device_handle, + hipGraph_t graph) + : m_device_handle(device_handle), m_graph(graph), m_graph_owning(false) { KOKKOS_EXPECTS(graph != nullptr); } @@ -124,12 +128,12 @@ inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // All of the predecessors are just added as normal, so all we need to // do here is add an empty node - KOKKOS_IMPL_HIP_SAFE_CALL(m_execution_space.impl_internal_space_instance() - ->hip_graph_add_empty_node_wrapper( - &(arg_node_ptr->node_details_t::node), - m_graph, - /* dependencies = */ nullptr, - /* numDependencies = */ 0)); + KOKKOS_IMPL_HIP_SAFE_CALL( + m_device_handle.m_exec.impl_internal_space_instance() + ->hip_graph_add_empty_node_wrapper( + &(arg_node_ptr->node_details_t::node), m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); } template @@ -196,7 +200,7 @@ inline void GraphImpl::add_predecessor( KOKKOS_EXPECTS(node); KOKKOS_IMPL_HIP_SAFE_CALL( - m_execution_space.impl_internal_space_instance() + m_device_handle.m_exec.impl_internal_space_instance() ->hip_graph_add_dependencies_wrapper(m_graph, &pred_node, &node, 1)); } @@ -209,21 +213,22 @@ inline void GraphImpl::submit(const Kokkos::HIP& exec) { m_graph_exec)); } -inline Kokkos::HIP const& GraphImpl::get_execution_space() - const noexcept { - return m_execution_space; +inline auto GraphImpl::get_device_handle() const noexcept + -> 
device_handle_t const& { + return m_device_handle; } inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(m_graph); KOKKOS_EXPECTS(!m_graph_exec); - auto rv = std::make_shared(get_execution_space(), + auto rv = std::make_shared(m_device_handle, _graph_node_is_root_ctor_tag{}); - KOKKOS_IMPL_HIP_SAFE_CALL(m_execution_space.impl_internal_space_instance() - ->hip_graph_add_empty_node_wrapper( - &(rv->node_details_t::node), m_graph, - /* dependencies = */ nullptr, - /* numDependencies = */ 0)); + KOKKOS_IMPL_HIP_SAFE_CALL( + m_device_handle.m_exec.impl_internal_space_instance() + ->hip_graph_add_empty_node_wrapper(&(rv->node_details_t::node), + m_graph, + /* dependencies = */ nullptr, + /* numDependencies = */ 0)); KOKKOS_ENSURES(rv->node_details_t::node); return rv; } @@ -235,7 +240,7 @@ inline auto GraphImpl::create_aggregate_ptr(PredecessorRefs&&...) { // each predecessor ref, so all we need to do here is create the (trivial) // aggregate node. return std::make_shared( - m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_impl_t{}); + m_device_handle, _graph_node_kernel_ctor_tag{}, aggregate_impl_t{}); } } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_Conversion.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_Conversion.hpp index 5a236d52ec9..7a578f35eec 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_Conversion.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_Conversion.hpp @@ -178,6 +178,121 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t, T> cast_from_half(half_t val) { return static_cast(cast_from_half(val)); } + +/************************** bhalf conversions *********************************/ +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(bhalf_t val) { return val; } + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(float val) { return bhalf_t::impl_type(val); } + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(bool val) { return bhalf_t::impl_type(val); } + +KOKKOS_INLINE_FUNCTION +bhalf_t 
cast_to_bhalf(double val) { return bhalf_t::impl_type(val); } + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(short val) { return bhalf_t::impl_type(val); } + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned short val) { return bhalf_t::impl_type(val); } + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(int val) { return bhalf_t::impl_type(val); } + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned int val) { return bhalf_t::impl_type(val); } + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(long long val) { + // FIXME_HIP + return bhalf_t::impl_type(static_cast(val)); +} + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned long long val) { + // FIXME_HIP + return bhalf_t::impl_type(static_cast(val)); +} + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(long val) { + return cast_to_bhalf(static_cast(val)); +} + +KOKKOS_INLINE_FUNCTION +bhalf_t cast_to_bhalf(unsigned long val) { + return cast_to_bhalf(static_cast(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> 
+cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION + std::enable_if_t, T> + cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> +cast_from_bhalf(bhalf_t val) { + return static_cast(bhalf_t::impl_type(val)); +} + } // namespace Kokkos::Experimental #endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_Impl_Type.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_Impl_Type.hpp index 319f99872ca..a447efaf645 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_Impl_Type.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_Impl_Type.hpp @@ -5,11 +5,11 @@ #define KOKKOS_HIP_HALF_IMPL_TYPE_HPP_ #include +#include #ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED // Make sure no one else tries to define half_t #define KOKKOS_IMPL_HALF_TYPE_DEFINED -#define KOKKOS_IMPL_HIP_HALF_TYPE_DEFINED namespace Kokkos { namespace Impl { @@ -19,4 +19,17 @@ struct half_impl_t { } // namespace Impl } // namespace Kokkos #endif // KOKKOS_IMPL_HALF_TYPE_DEFINED + +#ifndef KOKKOS_IMPL_BHALF_TYPE_DEFINED +// Make sure no one else tries to define bhalf_t +#define KOKKOS_IMPL_BHALF_TYPE_DEFINED +namespace Kokkos::Impl { +struct bhalf_impl_t { + using type = __hip_bfloat16; +}; + +} // namespace Kokkos::Impl + +#endif // KOKKOS_IMPL_BHALF_TYPE_DEFINED + #endif // KOKKOS_ENABLE_HIP diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_MathematicalFunctions.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_MathematicalFunctions.hpp new file mode 100644 index 00000000000..c652c5a640a --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Half_MathematicalFunctions.hpp @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project + +#ifndef 
KOKKOS_HIP_HALF_MATHEMATICAL_FUNCTIONS_HPP_ +#define KOKKOS_HIP_HALF_MATHEMATICAL_FUNCTIONS_HPP_ + +#include + +namespace Kokkos { +namespace Impl { + +// Mathematical functions are only available on the device +#if defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && defined(__HIP_DEVICE_COMPILE__) +#define KOKKOS_HIP_HALF_UNARY_FUNCTION(OP, HIP_NAME, HALF_TYPE) \ + KOKKOS_INLINE_FUNCTION HALF_TYPE impl_##OP(HALF_TYPE x) { \ + return HIP_NAME(HALF_TYPE::impl_type(x)); \ + } + +#define KOKKOS_HIP_HALF_BINARY_FUNCTION(OP, HIP_NAME, HALF_TYPE) \ + KOKKOS_INLINE_FUNCTION HALF_TYPE impl_##OP(HALF_TYPE x, HALF_TYPE y) { \ + return HIP_NAME(HALF_TYPE::impl_type(x), HALF_TYPE::impl_type(y)); \ + } + +#define KOKKOS_HIP_HALF_UNARY_PREDICATE(OP, HIP_NAME, HALF_TYPE) \ + KOKKOS_INLINE_FUNCTION bool impl_##OP(HALF_TYPE x) { \ + return HIP_NAME(HALF_TYPE::impl_type(x)); \ + } + +#define KOKKOS_HIP_HALF_UNARY_FUNCTION_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_UNARY_FUNCTION(OP, HIP_NAME, Kokkos::Experimental::half_t) +#define KOKKOS_HIP_HALF_BINARY_FUNCTION_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_BINARY_FUNCTION(OP, HIP_NAME, Kokkos::Experimental::half_t) +#define KOKKOS_HIP_HALF_UNARY_PREDICATE_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_UNARY_PREDICATE(OP, HIP_NAME, Kokkos::Experimental::half_t) + +KOKKOS_INLINE_FUNCTION Kokkos::Experimental::half_t impl_test_fallback_half( + Kokkos::Experimental::half_t) { + return Kokkos::Experimental::half_t(0.f); +} + +#define KOKKOS_HIP_BHALF_UNARY_FUNCTION_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_UNARY_FUNCTION(OP, HIP_NAME, Kokkos::Experimental::bhalf_t) +#define KOKKOS_HIP_BHALF_BINARY_FUNCTION_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_BINARY_FUNCTION(OP, HIP_NAME, Kokkos::Experimental::bhalf_t) +#define KOKKOS_HIP_BHALF_UNARY_PREDICATE_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_UNARY_PREDICATE(OP, HIP_NAME, Kokkos::Experimental::bhalf_t) + +KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t impl_test_fallback_bhalf( + Kokkos::Experimental::bhalf_t) { + 
return Kokkos::Experimental::bhalf_t(0.f); +} + +#define KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_UNARY_FUNCTION_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_BHALF_UNARY_FUNCTION_IMPL(OP, HIP_NAME) + +#define KOKKOS_HIP_HALF_AND_BHALF_BINARY_FUNCTION_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_BINARY_FUNCTION_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_BHALF_BINARY_FUNCTION_IMPL(OP, HIP_NAME) + +#define KOKKOS_HIP_HALF_AND_BHALF_UNARY_PREDICATE_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_HALF_UNARY_PREDICATE_IMPL(OP, HIP_NAME) \ + KOKKOS_HIP_BHALF_UNARY_PREDICATE_IMPL(OP, HIP_NAME) + +// Basic operations +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(abs, __habs) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(fabs, __habs) +// fmod +// remainder +// remquo +KOKKOS_HIP_HALF_AND_BHALF_BINARY_FUNCTION_IMPL(fmax, __hmax) +KOKKOS_HIP_HALF_AND_BHALF_BINARY_FUNCTION_IMPL(fmin, __hmin) +// fdim +// Exponential functions +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(exp, hexp) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(exp2, hexp2) +// expm1 +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(log, hlog) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(log10, hlog10) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(log2, hlog2) +// log1p +// Power functions +// pow +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(sqrt, hsqrt) +// cbrt +// hypot +// Trigonometric functions +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(sin, hsin) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(cos, hcos) +// tan +// asin +// acos +// atan +// atan2 +// Hyperbolic functions +// sinh +// cosh +// tanh +// asinh +// acosh +// atanh +// Error and gamma functions +// erf +// erfc +// tgamma +// lgamma +// Nearest integer floating point functions +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(ceil, hceil) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(floor, hfloor) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(trunc, htrunc) +// round +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(rint, hrint) +// 
NOTE HIP does not provide these functions, but we can exclude domain errors, +// as the range of int is enough for any value half_t can take. +// Thus we just cast to the required return type +// We are still missing the bhalf_t versions +KOKKOS_INLINE_FUNCTION long impl_lrint(Kokkos::Experimental::half_t x) { + return static_cast(impl_rint(x)); +} +KOKKOS_INLINE_FUNCTION long long impl_llrint(Kokkos::Experimental::half_t x) { + return static_cast(impl_rint(x)); +} +// logb +// nextafter +// copysign +// isfinite +KOKKOS_HIP_HALF_AND_BHALF_UNARY_PREDICATE_IMPL(isinf, __hisinf) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_PREDICATE_IMPL(isnan, __hisnan) +// signbit +// Non-standard functions +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(rsqrt, hrsqrt) +KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL(rcp, hrcp) + +#undef KOKKOS_HIP_HALF_AND_BHALF_UNARY_FUNCTION_IMPL +#undef KOKKOS_HIP_HALF_AND_BHALF_BINARY_FUNCTION_IMPL +#undef KOKKOS_HIP_HALF_AND_BHALF_UNARY_PREDICATE_IMPL + +#undef KOKKOS_HIP_BHALF_UNARY_FUNCTION_IMPL +#undef KOKKOS_HIP_BHALF_BINARY_FUNCTION_IMPL +#undef KOKKOS_HIP_BHALF_UNARY_PREDICATE_IMPL + +#undef KOKKOS_HIP_HALF_UNARY_FUNCTION_IMPL +#undef KOKKOS_HIP_HALF_BINARY_FUNCTION_IMPL +#undef KOKKOS_HIP_HALF_UNARY_PREDICATE_IMPL + +#undef KOKKOS_HIP_HALF_UNARY_FUNCTION +#undef KOKKOS_HIP_HALF_BINARY_FUNCTION +#undef KOKKOS_HIP_HALF_UNARY_PREDICATE + +#endif // defined(KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH) && + // defined(__HIP_DEVICE_COMPILE__) + +} // namespace Impl +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index f1c2119a2e0..aefd9a0362f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -127,21 +127,6 @@ void HIPInternal::print_configuration(std::ostream &s) const { //---------------------------------------------------------------------------- -HIPInternal::~HIPInternal() { - if (m_scratchSpace || 
m_scratchFlags) { - std::cerr << "Kokkos::HIP ERROR: Failed to call " - "Kokkos::HIP::finalize()" - << std::endl; - std::cerr.flush(); - } - - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchSpace = nullptr; - m_scratchFlags = nullptr; - m_stream = nullptr; -} - int HIPInternal::verify_is_initialized(const char *const label) const { if (m_hipDev < 0) { Kokkos::abort((std::string("Kokkos::HIP::") + label + @@ -154,13 +139,6 @@ int HIPInternal::verify_is_initialized(const char *const label) const { uint32_t HIPInternal::impl_get_instance_id() const noexcept { return m_instance_id; } -HIPInternal &HIPInternal::singleton() { - static HIPInternal *self = nullptr; - if (!self) { - self = new HIPInternal(); - } - return *self; -} void HIPInternal::fence() const { fence("Kokkos::HIPInternal::fence: Unnamed Internal Fence"); @@ -173,18 +151,11 @@ void HIPInternal::fence(const std::string &name) const { [&]() { KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); }); } -void HIPInternal::initialize(hipStream_t stream) { - KOKKOS_EXPECTS(!is_initialized()); - - if (was_finalized) - Kokkos::abort("Calling HIP::initialize after HIP::finalize is illegal\n"); - - KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamGetDevice(stream, &m_hipDev)); +HIPInternal::HIPInternal(hipStream_t stream) : m_stream(stream) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamGetDevice(m_stream, &m_hipDev)); KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(m_hipDev)); hip_devices.insert(m_hipDev); - m_stream = stream; - // Allocate a staging buffer for constant mem in pinned host memory. if (!constantMemHostStaging[m_hipDev]) { void *constant_mem_void_ptr = nullptr; @@ -353,7 +324,7 @@ void HIPInternal::release_team_scratch_space(int scratch_pool_id) { //---------------------------------------------------------------------------- -void HIPInternal::finalize() { +HIPInternal::~HIPInternal() { // First, lock the shared resource locking helper. 
// Then, fence the stream and check if it was involved in the last constant // memory launch. @@ -361,12 +332,10 @@ void HIPInternal::finalize() { // thread from launching another kernel in-between the fence // and the 'check_if_involved_and_unlock'. auto lock = HIPInternal::constantMemReusable[m_hipDev].lock(); - this->fence("Kokkos::HIPInternal::finalize: fence on finalization"); + this->fence("Kokkos::HIPInternal::finalize: fence on destruction"); HIPInternal::constantMemReusable[m_hipDev].check_if_involved_and_unlock( std::move(lock), m_stream); - was_finalized = true; - auto device_mem_space = Kokkos::HIPSpace::impl_create(m_hipDev, m_stream); if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { device_mem_space.deallocate(m_scratchFlags, @@ -388,19 +357,7 @@ void HIPInternal::finalize() { m_team_scratch_current_size[i]); } - m_scratchSpaceCount = 0; - m_scratchFlagsCount = 0; - m_scratchSpace = nullptr; - m_scratchFlags = nullptr; - for (int i = 0; i < m_n_team_scratch; ++i) { - m_team_scratch_current_size[i] = 0; - m_team_scratch_ptr[i] = nullptr; - } - KOKKOS_IMPL_HIP_SAFE_CALL(hip_free_wrapper(m_scratch_locks)); - m_scratch_locks = nullptr; - m_num_scratch_locks = 0; - m_hipDev = -1; } int HIPInternal::m_maxThreadsPerSM = 0; @@ -409,6 +366,8 @@ hipDeviceProp_t HIPInternal::m_deviceProp; std::mutex HIPInternal::scratchFunctorMutex; +HostSharedPtr HIPInternal::default_instance; + std::set HIPInternal::hip_devices = {}; std::map HIPInternal::constantMemHostStaging = {}; std::map HIPInternal::constantMemReusable = {}; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index db321021127..bcbfd36a775 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -8,6 +8,7 @@ #include #include +#include #include @@ -20,9 +21,10 @@ namespace Kokkos { namespace Impl { struct HIPTraits { -#if defined(KOKKOS_ARCH_AMD_GFX906) || 
defined(KOKKOS_ARCH_AMD_GFX908) || \ - defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ - defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) +#if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ + defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ + defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) || \ + defined(KOKKOS_ARCH_AMD_GFX950) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ @@ -31,6 +33,8 @@ struct HIPTraits { static constexpr int WarpSize = 32; static constexpr int WarpIndexMask = 0x001f; /* hexadecimal for 31 */ static constexpr int WarpIndexShift = 5; /* WarpSize == 1 << WarpShift*/ +#else +#error "Unexpected AMD GFX architecture!" #endif static constexpr int ConservativeThreadsPerBlock = 256; // conservative fallback blocksize in case of spills @@ -105,6 +109,7 @@ struct SharedResourceLock { SharedResourceLock(SharedResourceLock &&other) = delete; SharedResourceLock &operator=(SharedResourceLock const &other) = delete; SharedResourceLock &operator=(SharedResourceLock &&other) = delete; + ~SharedResourceLock() = default; // Acquire the right to use the shared resource. The instance is locked first. 
[[nodiscard]] auto acquire() { @@ -141,16 +146,14 @@ struct SharedResourceLock { }; class HIPInternal { - private: - HIPInternal(const HIPInternal &); - HIPInternal &operator=(const HIPInternal &); - public: using size_type = ::Kokkos::HIP::size_type; int m_hipDev = -1; static int m_maxThreadsPerSM; + static HostSharedPtr default_instance; + static hipDeviceProp_t m_deviceProp; static int concurrency(); @@ -179,32 +182,22 @@ class HIPInternal { int32_t *m_scratch_locks = nullptr; size_t m_num_scratch_locks = 0; - bool was_finalized = false; - static std::set hip_devices; static std::map constantMemHostStaging; static std::map constantMemReusable; - static HIPInternal &singleton(); - int verify_is_initialized(const char *const label) const; - int is_initialized() const { - return nullptr != m_scratchSpace && nullptr != m_scratchFlags; - } - - void initialize(hipStream_t stream); - void finalize(); + HIPInternal(hipStream_t stream); + ~HIPInternal(); + HIPInternal(const HIPInternal &) = delete; + HIPInternal &operator=(const HIPInternal &) = delete; void print_configuration(std::ostream &) const; void fence() const; void fence(const std::string &) const; - ~HIPInternal(); - - HIPInternal() = default; - // Using HIP API function/objects will be w.r.t. device 0 unless // hipSetDevice(device_id) is called with the correct device_id. 
// The correct device_id is stored in the variable diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_IsXnack.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_IsXnack.hpp index 4f04a80e7d7..c9836d2c1a8 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_IsXnack.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_IsXnack.hpp @@ -37,8 +37,9 @@ bool xnack_boot_config_has_hmm_mirror(); // Returns true iff the architecture of the gpu supports accessing system // allocated memory constexpr bool gpu_arch_can_access_system_allocations() { -#if defined(KOKKOS_ARCH_AMD_GFX908) || defined(KOKKOS_ARCH_AMD_GFX90A) || \ - defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) +#if defined(KOKKOS_ARCH_AMD_GFX908) || defined(KOKKOS_ARCH_AMD_GFX90A) || \ + defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) || \ + defined(KOKKOS_ARCH_AMD_GFX950) return true; #elif defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX1103) || \ defined(KOKKOS_ARCH_AMD_GFX1100) || defined(KOKKOS_ARCH_AMD_GFX1030) || \ diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 92c8b6fbf4e..d7fe82336ec 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -451,7 +451,7 @@ struct HIPParallelLaunchKernelInvoker(base_t::get_kernel_func()); - params.kernelParams = const_cast(args); + params.kernelParams = args; params.extra = nullptr; KOKKOS_IMPL_HIP_SAFE_CALL(hip_instance->hip_graph_add_kernel_node_wrapper( diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp index 01ecf3ee2e7..f7a2b832a7f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp @@ -22,14 +22,61 @@ struct default_inner_direction { namespace Impl { +template <> +struct TileSizeRecommended { + template + static auto get(Policy const&) { + constexpr 
auto InnerDirection = Policy::inner_direction; + constexpr int Rank = Policy::rank; + + using tile_type = typename Policy::tile_type; + + if constexpr (InnerDirection == Iterate::Left) { + if constexpr (Rank == 2) { + return tile_type{64, 4}; + } else if constexpr (Rank == 3) { + return tile_type{32, 2, 4}; + } else if constexpr (Rank == 4) { + return tile_type{16, 4, 2, 2}; + } else if constexpr (Rank == 5) { + return tile_type{16, 4, 2, 2, 1}; + } else if constexpr (Rank == 6) { + return tile_type{8, 4, 2, 2, 2, 1}; + } + tile_type tile_sizes{}; + for (int i = 0; i < Rank; ++i) { + tile_sizes[i] = 2; + } + tile_sizes[0] = 16; + return tile_sizes; + } else { + if constexpr (Rank == 2) { + return tile_type{4, 64}; + } else if constexpr (Rank == 3) { + return tile_type{4, 2, 32}; + } else if constexpr (Rank == 4) { + return tile_type{2, 2, 4, 16}; + } else if constexpr (Rank == 5) { + return tile_type{1, 2, 2, 4, 16}; + } else if constexpr (Rank == 6) { + return tile_type{1, 2, 2, 2, 4, 8}; + } + tile_type tile_sizes{}; + for (int i = 0; i < Rank; ++i) { + tile_sizes[i] = 2; + } + tile_sizes[Rank - 1] = 16; + return tile_sizes; + } + } +}; + // Settings for MDRangePolicy template <> inline TileSizeProperties get_tile_size_properties(const HIP& space) { TileSizeProperties properties; const auto& device_prop = space.hip_device_prop(); properties.max_threads = device_prop.maxThreadsPerBlock; - properties.default_largest_tile_size = 16; - properties.default_tile_size = 4; properties.max_total_tile_size = HIPTraits::MaxThreadsPerBlock; properties.max_threads_dimensions[0] = device_prop.maxThreadsDim[0]; properties.max_threads_dimensions[1] = device_prop.maxThreadsDim[1]; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp index 2f8148188ca..e06e7977e49 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_MDRange.hpp @@ 
-26,20 +26,24 @@ class ParallelFor, HIP> { using index_type = typename Policy::index_type; using LaunchBounds = typename Policy::launch_bounds; using MaxGridSize = Kokkos::Array; + using array_type = typename Policy::point_type; const FunctorType m_functor; const Policy m_policy; const MaxGridSize m_max_grid_size; + array_type m_lower; + array_type m_upper; + array_type m_extent; // tile_size * num_tiles + public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; + ParallelFor() = delete; inline __device__ void operator()() const { - Kokkos::Impl::DeviceIterateTile( - m_policy, m_functor, m_max_grid_size) + Kokkos::Impl::DeviceIterate(m_lower, m_upper, + m_extent, m_functor) .exec_range(); } @@ -47,114 +51,12 @@ class ParallelFor, HIP> { using ClosureType = ParallelFor; if (m_policy.m_num_tiles == 0) return; - if (Policy::rank == 2) { - // id0 to threadIdx.x; id1 to threadIdx.y - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); - - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - m_max_grid_size[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - m_max_grid_size[1]), - 1); - - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 3) { - // id0 to threadIdx.x; id1 to threadIdx.y; id2 to threadIdx.z - dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], - m_policy.m_tile[2]); - - dim3 const grid( - std::min( - (m_policy.m_upper[0] - m_policy.m_lower[0] + block.x - 1) / - block.x, - m_max_grid_size[0]), - std::min( - (m_policy.m_upper[1] - m_policy.m_lower[1] + block.y - 1) / - block.y, - m_max_grid_size[1]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.z - 1) / - block.z, - m_max_grid_size[2])); - - hip_parallel_launch( - *this, grid, block, 0, - 
m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 4) { - // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to - // threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2], m_policy.m_tile[3]); - - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], - m_max_grid_size[0]), - std::min( - (m_policy.m_upper[2] - m_policy.m_lower[2] + block.y - 1) / - block.y, - m_max_grid_size[1]), - std::min( - (m_policy.m_upper[3] - m_policy.m_lower[3] + block.z - 1) / - block.z, - m_max_grid_size[2])); - - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 5) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 - // to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4]); - - dim3 const grid( - std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], - m_max_grid_size[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], - m_max_grid_size[1]), - std::min( - (m_policy.m_upper[4] - m_policy.m_lower[4] + block.z - 1) / - block.z, - m_max_grid_size[2])); - - hip_parallel_launch( - *this, grid, block, 0, - m_policy.space().impl_internal_space_instance(), false); - } else if (Policy::rank == 6) { - // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; - // id4,id5 to threadIdx.z - dim3 const block(m_policy.m_tile[0] * m_policy.m_tile[1], - m_policy.m_tile[2] * m_policy.m_tile[3], - m_policy.m_tile[4] * m_policy.m_tile[5]); - - dim3 const grid(std::min( - m_policy.m_tile_end[0] * m_policy.m_tile_end[1], - m_max_grid_size[0]), - std::min( - m_policy.m_tile_end[2] * m_policy.m_tile_end[3], - m_max_grid_size[1]), - std::min( - m_policy.m_tile_end[4] * m_policy.m_tile_end[5], - m_max_grid_size[2])); - - hip_parallel_launch( - *this, grid, block, 0, - 
m_policy.space().impl_internal_space_instance(), false); - } else { - Kokkos::abort("Kokkos::MDRange Error: Exceeded rank bounds with HIP\n"); - } + const auto [grid, block] = + Kokkos::Impl::compute_device_launch_params(m_policy, m_max_grid_size); + hip_parallel_launch( + *this, grid, block, 0, m_policy.space().impl_internal_space_instance(), + false); } // end execute ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -167,7 +69,22 @@ class ParallelFor, HIP> { m_policy.space().hip_device_prop().maxGridSize[1]), static_cast( m_policy.space().hip_device_prop().maxGridSize[2]), - }) {} + }) { + // Initialize begins and ends based on layout + // Swap the fastest indexes to x dimension + for (array_index_type i = 0; i < Policy::rank; ++i) { + if constexpr (Policy::inner_direction == Iterate::Left) { + m_lower[i] = m_policy.m_lower[i]; + m_upper[i] = m_policy.m_upper[i]; + m_extent[i] = m_policy.m_tile[i] * m_policy.m_tile_end[i]; + } else { + m_lower[i] = m_policy.m_lower[Policy::rank - 1 - i]; + m_upper[i] = m_policy.m_upper[Policy::rank - 1 - i]; + m_extent[i] = m_policy.m_tile[Policy::rank - 1 - i] * + m_policy.m_tile_end[Policy::rank - 1 - i]; + } + } + } template static int max_tile_size_product(const Policy&, const Functor&) { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp index 0d55a2cda9c..7ebdc92e1d6 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -18,9 +18,10 @@ class ParallelFor, Kokkos::HIP> { using Policy = Kokkos::RangePolicy; private: - using Member = typename Policy::member_type; - using WorkTag = typename Policy::work_tag; - using LaunchBounds = typename Policy::launch_bounds; + using Member = typename Policy::member_type; + using WorkTag = typename Policy::work_tag; + using LaunchBounds = typename Policy::launch_bounds; + using StaticBatchSize = typename 
Policy::static_batch_size; const FunctorType m_functor; const Policy m_policy; @@ -40,26 +41,36 @@ class ParallelFor, Kokkos::HIP> { public: using functor_type = FunctorType; - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; + ParallelFor() = delete; inline __device__ void operator()() const { - const auto work_stride = Member(blockDim.y) * gridDim.x; - const Member work_end = m_policy.end(); + constexpr auto batch_size = Member(StaticBatchSize::batch_size); + const auto work_stride = Member(blockDim.y) * gridDim.x; + const Member work_end = m_policy.end(); for (Member iwork = m_policy.begin() + threadIdx.y + static_cast(blockDim.y) * blockIdx.x; iwork < work_end; - iwork = iwork < static_cast(work_end - work_stride) - ? iwork + work_stride - : work_end) { - this->template exec_range(iwork); + iwork = + iwork < static_cast(work_end - work_stride * batch_size) + ? iwork + work_stride * batch_size + : work_end) { + for (Member i = 0; i < static_cast(work_stride * batch_size) && + i < work_end - iwork; + i = (i < static_cast(work_end - work_stride - iwork)) + ? i + work_stride + : work_end - iwork) { + this->template exec_range(iwork + i); + } } } inline void execute() const { - const typename Policy::index_type nwork = m_policy.end() - m_policy.begin(); + constexpr typename Policy::index_type batch_size = + StaticBatchSize::batch_size; + const typename Policy::index_type nwork = + (m_policy.end() - m_policy.begin()) / batch_size + + ((m_policy.end() - m_policy.begin()) % batch_size == 0 ? 
0 : 1); using DriverType = ParallelFor; const int block_size = diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp index 7607ec89f57..7cc6b56d022 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -58,9 +58,7 @@ class ParallelFor, HIP> { } public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; - ParallelFor& operator=(ParallelFor const&) = delete; + ParallelFor() = delete; __device__ inline void operator()() const { // Iterate this block through the league @@ -99,6 +97,12 @@ class ParallelFor, HIP> { *this, grid, block, shmem_size_total, m_policy.space().impl_internal_space_instance(), true); // copy to device and execute + + if (m_scratch_pool_id >= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } } ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) @@ -143,24 +147,34 @@ class ParallelFor, HIP> { } unsigned int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_deviceProp.sharedMemPerBlock < - shmem_size_total) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); + + auto maxShmemPerBlock = + internal_space_instance->m_deviceProp.sharedMemPerBlock; + if (maxShmemPerBlock < shmem_size_total) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch memory " + "on level 0. 
Requested: " + << m_shmem_size + << ", Maximum: " << maxShmemPerBlock - m_shmem_begin; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } - size_t max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::ParallelFor< HIP > requested too large team size.")); + if (m_scratch_size[1] > static_cast(m_policy.scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch memory " + "on level 1. Requested: " + << m_scratch_size[1] + << ", Maximum: " << m_policy.scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } - } - ~ParallelFor() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); + int max_size = arg_policy.team_size_max(arg_functor, ParallelForTag()); + if (m_team_size > max_size) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too large team size. 
" + "Requested: " + << m_team_size << ", Maximum: " << max_size; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } } }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp index 45963b5e35d..f46c8b2fcd3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -287,6 +287,12 @@ class ParallelReduce= 0) { + m_policy.space() + .impl_internal_space_instance() + ->release_team_scratch_space(m_scratch_pool_id); + } } template @@ -313,7 +319,7 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_deviceProp.sharedMemPerBlock < - shmem_size_total) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " - "L0 scratch memory")); + auto maxShmemPerBlock = + internal_space_instance->m_deviceProp.sharedMemPerBlock; + if (maxShmemPerBlock < shmem_size_total) { + std::stringstream error; + error + << "Kokkos::parallel_reduce: Requested too much scratch memory " + "on level 0. Requested: " + << m_shmem_size + << ", Maximum: " << maxShmemPerBlock - m_shmem_begin - m_team_begin; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } - size_t max_size = arg_policy.team_size_max( - arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), - ParallelReduceTag()); - if (static_cast(m_team_size) > static_cast(max_size)) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::ParallelReduce< HIP > requested too " - "large team size.")); + if (m_scratch_size[1] > static_cast(m_policy.scratch_size_max(1))) { + std::stringstream error; + error + << "Kokkos::parallel_reduce: Requested too much scratch memory " + "on level 1. 
Requested: " + << m_scratch_size[1] << ", Maximum: " << m_policy.scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } - } - ~ParallelReduce() { - if (m_scratch_pool_id >= 0) { - m_policy.space() - .impl_internal_space_instance() - ->release_team_scratch_space(m_scratch_pool_id); + if (m_team_size > + arg_policy.team_size_max_internal(m_functor_reducer.get_functor(), + m_functor_reducer.get_reducer(), + ParallelReduceTag())) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too large team size. " + "Requested: " + << m_team_size << ", Maximum: " + << arg_policy.team_size_max_internal( + m_functor_reducer.get_functor(), + m_functor_reducer.get_reducer(), ParallelReduceTag()); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } } }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp index ce9b35b0d3b..665e4f3524a 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelScan_Range.hpp @@ -149,6 +149,9 @@ class ParallelScanHIPBase { } else if (0 == threadIdx.y) { final_reducer.init(reinterpret_cast(shared_accum)); } + // FIXME_HIP below __syncthreads() is added to handle MI300A. + // Likely compiler optimization bug. + __syncthreads(); const WorkRange range(m_policy, blockIdx.x, gridDim.x); @@ -156,8 +159,10 @@ class ParallelScanHIPBase { iwork_base < range.end(); iwork_base += blockDim.y) { const typename Policy::member_type iwork = iwork_base + threadIdx.y; - __syncthreads(); // Don't overwrite previous iteration values until they - // are used + // FIXME_HIP: we encountered something believed to be a compiler bug on + // MI300A: instead of syncing here, we need to sync before the loop + // and at the very end of the loop. 
+ //__syncthreads(); final_reducer.init( reinterpret_cast(shared_prefix + word_count.value)); @@ -205,6 +210,9 @@ class ParallelScanHIPBase { if (iwork + 1 == m_policy.end() && m_policy.end() == range.end() && m_result_ptr_device_accessible) *m_result_ptr = *reinterpret_cast(shared_prefix); + // FIXME_HIP below __syncthreads() is moved from the beginning of this + // loop to here to handle issues on MI300A. Likely compiler bug. + __syncthreads(); } } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index 42947e993a8..66e8d4d2372 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -45,11 +45,6 @@ class HIPSpace { /*--------------------------------*/ HIPSpace(); - HIPSpace(HIPSpace&& rhs) = default; - HIPSpace(const HIPSpace& rhs) = default; - HIPSpace& operator=(HIPSpace&& rhs) = default; - HIPSpace& operator=(const HIPSpace& rhs) = default; - ~HIPSpace() = default; private: HIPSpace(int device_id, hipStream_t stream); @@ -131,11 +126,6 @@ class HIPHostPinnedSpace { /*--------------------------------*/ HIPHostPinnedSpace(); - HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; - HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; - HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; - HIPHostPinnedSpace& operator=(const HIPHostPinnedSpace& rhs) = default; - ~HIPHostPinnedSpace() = default; private: HIPHostPinnedSpace(int device_id, hipStream_t stream); @@ -216,11 +206,6 @@ class HIPManagedSpace { /*--------------------------------*/ HIPManagedSpace(); - HIPManagedSpace(HIPManagedSpace&& rhs) = default; - HIPManagedSpace(const HIPManagedSpace& rhs) = default; - HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; - HIPManagedSpace& operator=(const HIPManagedSpace& rhs) = default; - ~HIPManagedSpace() = default; private: HIPManagedSpace(int device_id, hipStream_t stream); diff --git 
a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index aa12297b103..b730c2dc9d4 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -567,7 +567,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< * less than N) and a scan operation is performed. The last call to closure has * final == true. */ -// This is the same code as in CUDA and largely the same as in OpenMPTarget +// This is the same code as in CUDA. template KOKKOS_INLINE_FUNCTION void parallel_scan( const Impl::TeamThreadRangeBoundariesStruct& diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp index b7ac3da485f..eb1f4557d59 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -74,8 +74,18 @@ class TeamPolicyInternal } template - inline int team_size_max(const FunctorType& f, const ReducerType&, - const ParallelReduceTag&) const { + inline int team_size_max(const FunctorType& f, const ReducerType& reducer, + const ParallelReduceTag& tag) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + return team_size_max_internal( + f, typename functor_analysis_type::Reducer{reducer}, tag); + } + + template + inline int team_size_max_internal(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { using closure_type = Impl::ParallelReduce, TeamPolicy, Kokkos::HIP>; @@ -108,8 +118,18 @@ class TeamPolicyInternal } template - int team_size_recommended(FunctorType const& f, ReducerType const&, - ParallelReduceTag const&) const { + int team_size_recommended(const FunctorType& f, const ReducerType& reducer, + const ParallelReduceTag& tag) const { + using functor_analysis_type = + Impl::FunctorAnalysis; + return team_size_recommended_internal( + f, typename functor_analysis_type::Reducer{reducer}, tag); + } + + 
template + int team_size_recommended_internal(const FunctorType& f, const ReducerType&, + const ParallelReduceTag&) const { using closure_type = Impl::ParallelReduce, TeamPolicy, Kokkos::HIP>; @@ -202,10 +222,14 @@ class TeamPolicyInternal "space."); // Make sure total block size is permissible - if (m_team_size * m_vector_length > HIPTraits::MaxThreadsPerBlock) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy< HIP > the team size is too large. " - "Team size x vector length must be smaller than 1024.")); + if (m_team_size * m_vector_length > + static_cast(HIPTraits::MaxThreadsPerBlock)) { + std::stringstream error; + error << "Kokkos::TeamPolicy: Requested too large team size. " + "Requested: " + << m_team_size + << ", Maximum: " << HIPTraits::MaxThreadsPerBlock / m_vector_length; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp index d110c39589c..57e082ee292 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_UniqueToken.hpp @@ -44,18 +44,6 @@ class UniqueToken { } public: - KOKKOS_DEFAULTED_FUNCTION - UniqueToken(const UniqueToken&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken(UniqueToken&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken& operator=(const UniqueToken&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken& operator=(UniqueToken&&) = default; - /// \brief upper bound for acquired values, i.e. 
0 <= value < size() KOKKOS_INLINE_FUNCTION size_type size() const noexcept { return m_locks.extent(0); } diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp index 031d7f1a437..ff0c0c15873 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.cpp @@ -95,33 +95,6 @@ void HPX::print_configuration(std::ostream &os, const bool) const { os << hpx::configuration_string() << '\n'; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -bool &HPX::impl_get_in_parallel() noexcept { - static thread_local bool in_parallel = false; - return in_parallel; -} - -HPX::impl_in_parallel_scope::impl_in_parallel_scope() noexcept { - KOKKOS_EXPECTS(!impl_get_in_parallel()); - impl_get_in_parallel() = true; -} - -HPX::impl_in_parallel_scope::~impl_in_parallel_scope() noexcept { - KOKKOS_EXPECTS(impl_get_in_parallel()); - impl_get_in_parallel() = false; -} - -HPX::impl_not_in_parallel_scope::impl_not_in_parallel_scope() noexcept { - KOKKOS_EXPECTS(impl_get_in_parallel()); - impl_get_in_parallel() = false; -} - -HPX::impl_not_in_parallel_scope::~impl_not_in_parallel_scope() noexcept { - KOKKOS_EXPECTS(!impl_get_in_parallel()); - impl_get_in_parallel() = true; -} -#endif - void HPX::impl_decrement_active_parallel_region_count() { std::unique_lock l(m_active_parallel_region_count_mutex); if (--m_active_parallel_region_count == 0) { @@ -136,23 +109,11 @@ void HPX::impl_increment_active_parallel_region_count() { } void HPX::impl_instance_fence_locked(const std::string &name) const { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::HPX>( - name, - Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{ - impl_instance_id()}, - [&]() { - auto &s = impl_get_sender(); - - hpx::this_thread::experimental::sync_wait(std::move(s)); - s = hpx::execution::experimental::unique_any_sender<>( - hpx::execution::experimental::just()); - }); + impl_get_instance_data().fence_locked(name); } void 
HPX::impl_instance_fence(const std::string &name) const { - std::lock_guard l(impl_get_sender_mutex()); - impl_instance_fence_locked(name); + impl_get_instance_data().fence(name); } void HPX::impl_static_fence(const std::string &name) { @@ -181,11 +142,7 @@ void HPX::impl_static_fence(const std::string &name) { }); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int HPX::concurrency() { -#else int HPX::concurrency() const { -#endif hpx::runtime *rt = hpx::get_runtime_ptr(); if (rt == nullptr) { return hpx::threads::hardware_concurrency(); diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp index 030e3042433..6efdec58b24 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -20,11 +20,12 @@ static_assert(false, #include #include #include +#include #include #include #include -#include #include +#include #include @@ -41,6 +42,7 @@ static_assert(false, #include #include #include +#include #include #include @@ -119,8 +121,11 @@ class HPX { static hpx::condition_variable_any m_active_parallel_region_count_cond; struct instance_data { - instance_data() = default; - ~instance_data() = default; + instance_data() = default; + // NOLINTNEXTLINE(bugprone-exception-escape) + ~instance_data() { + fence("Kokkos::Experimental::HPX: fence on destruction"); + } instance_data(uint32_t instance_id) : m_instance_id(instance_id) {} instance_data(uint32_t instance_id, hpx::execution::experimental::unique_any_sender<> &&sender) @@ -131,6 +136,25 @@ class HPX { instance_data &operator=(const instance_data &) = delete; instance_data &operator=(instance_data) = delete; + void fence(const std::string &name) { + std::lock_guard l(m_sender_mutex); + fence_locked(name); + } + + void fence_locked(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event< + Kokkos::Experimental::HPX>( + name, + Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{m_instance_id}, + [&]() { + auto &s = 
m_sender; + + hpx::this_thread::experimental::sync_wait(std::move(s)); + s = hpx::execution::experimental::unique_any_sender<>( + hpx::execution::experimental::just()); + }); + } + uint32_t m_instance_id{HPX::impl_default_instance_id()}; hpx::execution::experimental::unique_any_sender<> m_sender{ hpx::execution::experimental::just()}; @@ -156,42 +180,40 @@ class HPX { #pragma GCC diagnostic ignored "-Wuninitialized" HPX() - : m_instance_data(Kokkos::Impl::HostSharedPtr( - &m_default_instance_data, &default_instance_deleter)) {} + : m_instance_data( + (Kokkos::Impl::check_execution_space_constructor_precondition( + name()), + Kokkos::Impl::HostSharedPtr( + &m_default_instance_data, &default_instance_deleter))) {} #pragma GCC diagnostic pop - ~HPX() = default; + ~HPX() { + Kokkos::Impl::check_execution_space_destructor_precondition(name()); + } explicit HPX(instance_mode mode) : m_instance_data( - mode == instance_mode::independent - ? (Kokkos::Impl::HostSharedPtr( - new instance_data(m_next_instance_id++))) - : Kokkos::Impl::HostSharedPtr( - &m_default_instance_data, &default_instance_deleter)) {} + (Kokkos::Impl::check_execution_space_constructor_precondition( + name()), + mode == instance_mode::independent + ? 
(Kokkos::Impl::HostSharedPtr( + new instance_data(m_next_instance_id++))) + : Kokkos::Impl::HostSharedPtr( + &m_default_instance_data, &default_instance_deleter))) {} explicit HPX(hpx::execution::experimental::unique_any_sender<> &&sender) - : m_instance_data(Kokkos::Impl::HostSharedPtr( - new instance_data(m_next_instance_id++, std::move(sender)))) {} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - template - KOKKOS_DEPRECATED_WITH_COMMENT( - "HPX execution space should be constructed explicitly.") - HPX(instance_mode mode) - : HPX(mode) {} - - template - KOKKOS_DEPRECATED_WITH_COMMENT( - "HPX execution space should be constructed explicitly.") - HPX(hpx::execution::experimental::unique_any_sender<> &&sender) - : HPX(std::move(sender)) {} -#endif - - HPX(HPX &&other) = default; - HPX(const HPX &other) = default; + : m_instance_data( + (Kokkos::Impl::check_execution_space_constructor_precondition( + name()), + Kokkos::Impl::HostSharedPtr(new instance_data( + m_next_instance_id++, std::move(sender))))) {} - HPX &operator=(HPX &&) = default; - HPX &operator=(const HPX &) = default; + KOKKOS_DEFAULTED_FUNCTION HPX(const HPX &) = default; + KOKKOS_FUNCTION HPX(HPX &&other) noexcept + : HPX(static_cast(other)) {} + KOKKOS_DEFAULTED_FUNCTION HPX &operator=(const HPX &) = default; + KOKKOS_FUNCTION HPX &operator=(HPX &&other) noexcept { + return *this = static_cast(other); + } void print_configuration(std::ostream &os, bool /*verbose*/ = false) const; instance_data &impl_get_instance_data() const noexcept { @@ -202,34 +224,6 @@ class HPX { return impl_get_instance_data().m_instance_id; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static bool &impl_get_in_parallel() noexcept; - - struct impl_in_parallel_scope { - impl_in_parallel_scope() noexcept; - ~impl_in_parallel_scope() noexcept; - impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; - impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; - impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = 
delete; - impl_in_parallel_scope &operator=(impl_in_parallel_scope const &) = delete; - }; - - struct impl_not_in_parallel_scope { - impl_not_in_parallel_scope() noexcept; - ~impl_not_in_parallel_scope() noexcept; - impl_not_in_parallel_scope(impl_not_in_parallel_scope &&) = delete; - impl_not_in_parallel_scope(impl_not_in_parallel_scope const &) = delete; - impl_not_in_parallel_scope &operator=(impl_not_in_parallel_scope &&) = - delete; - impl_not_in_parallel_scope &operator=(impl_not_in_parallel_scope const &) = - delete; - }; - - KOKKOS_DEPRECATED static bool in_parallel(HPX const & = HPX()) noexcept { - return impl_get_in_parallel(); - } -#endif - static void impl_decrement_active_parallel_region_count(); static void impl_increment_active_parallel_region_count(); @@ -240,24 +234,10 @@ class HPX { void fence( const std::string &name = "Kokkos::Experimental::HPX::fence: Unnamed Instance Fence") const { - impl_instance_fence(name); - } - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static bool is_asynchronous(HPX const & = HPX()) noexcept { -#if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - return true; -#else - return false; -#endif + impl_get_instance_data().fence(name); } -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency(); -#else int concurrency() const; -#endif static void impl_initialize(InitializationSettings const &); static void impl_finalize(); static int impl_thread_pool_size() noexcept; @@ -346,12 +326,8 @@ class HPX { hpx::threads::thread_stacksize stacksize = hpx::threads::thread_stacksize::default_) const { impl_bulk_plain_erased(force_synchronous, is_light_weight_policy, - {[functor](Index i) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - impl_in_parallel_scope p; -#endif - functor.execute_range(i); - }}, + // NOLINTNEXTLINE(bugprone-exception-escape) + {[functor](Index i) { functor.execute_range(i); }}, n, stacksize); } @@ -409,26 +385,11 @@ class HPX { Functor const &functor, Index const n, 
hpx::threads::thread_stacksize stacksize = hpx::threads::thread_stacksize::default_) const { - impl_bulk_setup_finalize_erased(force_synchronous, is_light_weight_policy, - {[functor](Index i) { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - impl_in_parallel_scope p; -#endif - functor.execute_range(i); - }}, - {[functor]() { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - impl_in_parallel_scope p; -#endif - functor.setup(); - }}, - {[functor]() { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - impl_in_parallel_scope p; -#endif - functor.finalize(); - }}, - n, stacksize); + impl_bulk_setup_finalize_erased( + force_synchronous, is_light_weight_policy, + {[functor](Index i) { functor.execute_range(i); }}, + {[functor]() { functor.setup(); }}, + {[functor]() { functor.finalize(); }}, n, stacksize); } static constexpr const char *name() noexcept { return "HPX"; } @@ -792,8 +753,16 @@ class TeamPolicyInternal m_league_size = league_size_request; const int max_team_size = 1; // TODO: Can't use team_size_max(...) because // it requires a functor as argument. - m_team_size = - team_size_request > max_team_size ? max_team_size : team_size_request; + + if (team_size_request > max_team_size) { + std::stringstream error; + error << "Kokkos::TeamPolicy: Requested too large team size. " + "Requested: " + << team_size_request << ", Maximum: " << max_team_size; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + + m_team_size = team_size_request; if (m_chunk_size > 0 && !Kokkos::has_single_bit(static_cast(m_chunk_size))) { @@ -1322,17 +1291,7 @@ class ParallelScan, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - { - // Since arrive_and_wait may yield and resume on another worker thread we - // set in_parallel = false on the current thread before suspending and set - // it again to true when we resume. 
- Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; - barrier.arrive_and_wait(); - } -#else barrier.arrive_and_wait(); -#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1354,17 +1313,7 @@ class ParallelScan, } } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - { - // Since arrive_and_wait may yield and resume on another worker thread we - // set in_parallel = false on the current thread before suspending and set - // it again to true when we resume. - Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; - barrier.arrive_and_wait(); - } -#else barrier.arrive_and_wait(); -#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( @@ -1445,17 +1394,7 @@ class ParallelScanWithTotal, const WorkRange range(m_policy, t, num_worker_threads); execute_chunk(range.begin(), range.end(), update_sum, false); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - { - // Since arrive_and_wait may yield and resume on another worker thread we - // set in_parallel = false on the current thread before suspending and set - // it again to true when we resume. - Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; - barrier.arrive_and_wait(); - } -#else barrier.arrive_and_wait(); -#endif if (t == 0) { final_reducer.init(reinterpret_cast( @@ -1477,17 +1416,7 @@ class ParallelScanWithTotal, } } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - { - // Since arrive_and_wait may yield and resume on another worker thread we - // set in_parallel = false on the current thread before suspending and set - // it again to true when we resume. 
- Kokkos::Experimental::HPX::impl_not_in_parallel_scope p; - barrier.arrive_and_wait(); - } -#else barrier.arrive_and_wait(); -#endif reference_type update_base = Analysis::Reducer::reference(reinterpret_cast( @@ -1591,7 +1520,32 @@ class ParallelFor, m_league(m_policy.league_size()), m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + FunctorTeamShmemSize::value( - m_functor, m_policy.team_size())) {} + m_functor, m_policy.team_size())) { + if ((m_policy.scratch_size(0) + FunctorTeamShmemSize::value( + m_functor, m_policy.team_size())) > + static_cast( + TeamPolicy::scratch_size_max(0))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch memory " + "on level 0. Requested: " + << m_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + m_functor, m_policy.team_size()) + << ", Maximum: " + << TeamPolicy::scratch_size_max(0); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + if (m_policy.scratch_size(1) > + static_cast( + TeamPolicy::scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch memory " + "on level 1. Requested: " + << m_policy.scratch_size(1) << ", Maximum: " + << TeamPolicy::scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + } }; template @@ -1716,6 +1670,32 @@ class ParallelReduce::accessible, "HPX reduce result must be a View accessible from HostSpace"); + if ((arg_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), arg_policy.team_size())) > + static_cast( + TeamPolicy::scratch_size_max(0))) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too much scratch " + "memory on level 0. 
Requested: " + << arg_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + arg_functor_reducer.get_functor(), + arg_policy.team_size()) + << ", Maximum: " + << TeamPolicy::scratch_size_max(0); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + if (arg_policy.scratch_size(1) > + static_cast( + TeamPolicy::scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too much scratch " + "memory on level 1. Requested: " + << arg_policy.scratch_size(1) << ", Maximum: " + << TeamPolicy::scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } } }; } // namespace Impl diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 0e1e1d3dd25..f43f61d0e25 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -22,17 +22,6 @@ static_assert(false, namespace Kokkos { -// ------------------------------------------------------------------ // -// Moved to Kokkos_Layout.hpp for more general accessibility -/* -enum class Iterate -{ - Default, // Default for the device - Left, // Left indices stride fastest - Right, // Right indices stride fastest -}; -*/ - template struct default_outer_direction { using type = Iterate; @@ -46,6 +35,7 @@ struct default_inner_direction { }; namespace Impl { + // NOTE the comparison below is encapsulated to silent warnings about pointless // comparison of unsigned integer with zero template @@ -124,8 +114,6 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( struct TileSizeProperties { int max_threads; // (per SM, CU) - int default_largest_tile_size; - int default_tile_size; int max_total_tile_size; // For GPU backends: hardware limits for block dimensions std::array max_threads_dimensions; @@ -136,15 +124,58 @@ TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { // Host settings TileSizeProperties 
properties; properties.max_threads = std::numeric_limits::max(); - properties.default_largest_tile_size = 0; - properties.default_tile_size = 2; properties.max_total_tile_size = std::numeric_limits::max(); - for (int i = 0; i < 3; ++i) { - properties.max_threads_dimensions[i] = std::numeric_limits::max(); - } + properties.max_threads_dimensions[0] = std::numeric_limits::max(); + properties.max_threads_dimensions[1] = std::numeric_limits::max(); + properties.max_threads_dimensions[2] = std::numeric_limits::max(); return properties; } +// Default tile size recommended (for MDRangePolicy) +template +struct TileSizeRecommended { + template + static auto get(Policy const& policy); +}; + +// Recommend tile sizes for each rank of MDRangePolicy. +// Each rank is tiled with a default size of 2, except the innermost rank which +// is set to its full work range length. +template +template +auto TileSizeRecommended::get(Policy const& policy) { + constexpr auto InnerDirection = Policy::inner_direction; + constexpr int Rank = Policy::rank; + + using tile_type = Kokkos::Array; + + tile_type recommended_tile_sizes{}; + int default_tile_size = 2; + int max_total_tile_size = policy.max_total_tile_size(); + + int inner_rank = (InnerDirection == Iterate::Right) ? Rank - 1 : 0; + int outer_bound = (InnerDirection == Iterate::Right) ? -1 : Rank; + int iter_step = (InnerDirection == Iterate::Right) ? 
-1 : 1; + auto inner_work_range = + policy.m_upper[inner_rank] - policy.m_lower[inner_rank]; + + int prod_tile_size = 1; + for (int i = inner_rank; i != outer_bound; i += iter_step) { + int rank_tile_size = 1; + if (prod_tile_size * default_tile_size <= max_total_tile_size) { + rank_tile_size = default_tile_size; + } else { + rank_tile_size = 1; + } + if (i == inner_rank) { + rank_tile_size = std::max(inner_work_range, 1); + } + prod_tile_size *= rank_tile_size; + recommended_tile_sizes[i] = rank_tile_size; + } + return recommended_tile_sizes; +} + } // namespace Impl // multi-dimensional iteration pattern @@ -161,14 +192,13 @@ struct MDRangePolicy; template struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { - using traits = Kokkos::Impl::PolicyTraits; - using range_policy = RangePolicy; - - typename traits::execution_space m_space; + using traits = Kokkos::Impl::PolicyTraits; + using execution_space = typename traits::execution_space; + using range_policy = RangePolicy; using impl_range_policy = - RangePolicy; + RangePolicy; using execution_policy = MDRangePolicy; // needed for is_execution_policy @@ -203,13 +233,17 @@ struct MDRangePolicy // as template parameter to the MDRangePolicy or static_cast the individual // values - point_type m_lower = {}; - point_type m_upper = {}; - tile_type m_tile = {}; - point_type m_tile_end = {}; - index_type m_num_tiles = 1; - index_type m_prod_tile_dims = 1; - bool m_tune_tile_size = false; + execution_space m_space; + + point_type m_lower = {}; + point_type m_upper = {}; + tile_type m_tile = {}; + point_type m_tile_end = {}; + index_type m_num_tiles = 1; + index_type m_prod_tile_dims = 1; + bool m_tune_tile_size = false; + index_type m_max_total_tile_size = 1; + std::array m_max_threads_dimensions = {1, 1, 1}; static constexpr auto outer_direction = (iteration_pattern::outer_direction != Iterate::Default) @@ -281,7 +315,7 @@ struct MDRangePolicy point_type const& lower, point_type const& upper, tile_type const& tile = 
tile_type{}) : m_space(work_space), m_lower(lower), m_upper(upper), m_tile(tile) { - init_helper(Impl::get_tile_size_properties(work_space)); + update_tiling_properties(); } template typename traits::execution_space space) : MDRangePolicy(other) { this->m_space = std::move(space); + // Reset auto-tuned tiles if the execution space changes since the computed + // tile size may be different + if (this->m_tune_tile_size) { + this->m_tile = {}; + } + update_tiling_properties(); } template @@ -322,104 +362,116 @@ struct MDRangePolicy m_tile_end(p.m_tile_end), m_num_tiles(p.m_num_tiles), m_prod_tile_dims(p.m_prod_tile_dims), - m_tune_tile_size(p.m_tune_tile_size) {} + m_tune_tile_size(p.m_tune_tile_size), + m_max_total_tile_size(p.m_max_total_tile_size), + m_max_threads_dimensions(p.m_max_threads_dimensions) {} void impl_change_tile_size(const point_type& tile) { - m_tile = tile; - init_helper(Impl::get_tile_size_properties(m_space)); + this->m_tile = tile; + this->update_tiling_properties(); } + bool impl_tune_tile_size() const { return m_tune_tile_size; } tile_type tile_size_recommended() const { - tile_type rec_tile_sizes = {}; - - for (std::size_t i = 0; i < rec_tile_sizes.size(); ++i) { - rec_tile_sizes[i] = tile_size_recommended(i); - } - return rec_tile_sizes; + return Kokkos::Impl::TileSizeRecommended::get(*this); } - int max_total_tile_size() const { - return Impl::get_tile_size_properties(m_space).max_total_tile_size; - } + index_type max_total_tile_size() const { return m_max_total_tile_size; } private: - int tile_size_recommended(const int tile_rank) const { - auto properties = Impl::get_tile_size_properties(m_space); - int last_rank = (inner_direction == Iterate::Right) ? rank - 1 : 0; - int rank_acc = - (inner_direction == Iterate::Right) ? tile_rank + 1 : tile_rank - 1; - int rec_tile_size = (std::pow(properties.default_tile_size, rank_acc) < - properties.max_total_tile_size) - ? 
properties.default_tile_size - : 1; - - if (tile_rank == last_rank) { - rec_tile_size = tile_size_last_rank( - properties, m_upper[last_rank] - m_lower[last_rank]); + void update_tiling_properties() { + auto properties = Impl::get_tile_size_properties(m_space); + this->m_num_tiles = 1; + this->m_prod_tile_dims = 1; + this->m_max_total_tile_size = + static_cast(properties.max_total_tile_size); + this->m_max_threads_dimensions = properties.max_threads_dimensions; + + index_type effective_max_tile_size = this->m_max_total_tile_size; + + constexpr bool enforce_launch_bounds = +#if defined(KOKKOS_ENABLE_CUDA) + std::is_same_v; +#elif defined(KOKKOS_ENABLE_HIP) + std::is_same_v; +#else + false; +#endif + + if constexpr (enforce_launch_bounds && launch_bounds::maxTperB != 0) { + effective_max_tile_size = + std::min(effective_max_tile_size, + static_cast(launch_bounds::maxTperB)); } - return rec_tile_size; - } - int tile_size_last_rank(const Impl::TileSizeProperties properties, - const index_type length) const { - return properties.default_largest_tile_size == 0 - ? std::max(length, 1) - : properties.default_largest_tile_size; - } + tile_type default_tile = this->tile_size_recommended(); - void init_helper(Impl::TileSizeProperties properties) { - m_prod_tile_dims = 1; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if (inner_direction == Iterate::Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } + int inner_rank = (inner_direction == Iterate::Right) ? rank - 1 : 0; + int outer_bound = (inner_direction == Iterate::Right) ? -1 : rank; + int iter_step = (inner_direction == Iterate::Right) ? 
-1 : 1; - for (int i = rank_start; i != rank_end; i += increment) { - const index_type length = m_upper[i] - m_lower[i]; + for (int i = inner_rank; i != outer_bound; i += iter_step) { + const index_type length = this->m_upper[i] - this->m_lower[i]; - if (m_upper[i] < m_lower[i]) { + if (this->m_upper[i] < this->m_lower[i]) { std::string msg = "Kokkos::MDRangePolicy bounds error: The lower bound (" + - std::to_string(m_lower[i]) + ") is greater than its upper bound (" + - std::to_string(m_upper[i]) + ") in dimension " + std::to_string(i) + - ".\n"; -#if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) + std::to_string(this->m_lower[i]) + + ") is greater than its upper bound (" + + std::to_string(this->m_upper[i]) + ") in dimension " + + std::to_string(i) + ".\n"; Kokkos::abort(msg.c_str()); -#elif defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) - Kokkos::Impl::log_warning(msg); -#endif } - if (m_tile[i] <= 0) { - m_tune_tile_size = true; - if ((inner_direction == Iterate::Right && (i < rank - 1)) || - (inner_direction == Iterate::Left && (i > 0))) { - if (m_prod_tile_dims * properties.default_tile_size < - static_cast(properties.max_total_tile_size)) { - m_tile[i] = properties.default_tile_size; - } else { - m_tile[i] = 1; - } + // If tile size is not specified or <= 0 set to recommended tile size + if (this->m_tile[i] <= 0) { + this->m_tune_tile_size = true; + // Set to recommended tile size if it fits within effective limit + if (this->m_prod_tile_dims * default_tile[i] <= + effective_max_tile_size) { + this->m_tile[i] = default_tile[i]; } else { - m_tile[i] = tile_size_last_rank(properties, length); + // Try to fit within effective limit by reducing tile size + while (default_tile[i] > 1 && + this->m_prod_tile_dims * default_tile[i] > + effective_max_tile_size) { + default_tile[i] >>= 1; + } + this->m_tile[i] = (default_tile[i] > 1) ? 
default_tile[i] : 1; } } - m_tile_end[i] = - static_cast((length + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; + + this->m_tile_end[i] = static_cast( + (length + this->m_tile[i] - 1) / this->m_tile[i]); + this->m_num_tiles *= this->m_tile_end[i]; + this->m_prod_tile_dims *= this->m_tile[i]; + } + + if constexpr (enforce_launch_bounds && launch_bounds::maxTperB != 0) { + if (static_cast(launch_bounds::maxTperB) < + this->m_prod_tile_dims) { + std::string msg = + "Kokkos::MDRangePolicy tile dimensions error: Product of tile " + "dimensions (" + + std::to_string(static_cast(this->m_prod_tile_dims)) + + ") is greater than the maximum specified via LaunchBounds (" + + std::to_string(launch_bounds::maxTperB) + + ") - choose smaller tile dims\n"; + Kokkos::abort(msg.c_str()); + } } - if (m_prod_tile_dims > static_cast(properties.max_threads)) { - printf(" Product of tile dimensions exceed maximum limit: %d\n", - static_cast(properties.max_threads)); - Kokkos::abort( - "ExecSpace Error: MDRange tile dims exceed maximum number " - "of threads per block - choose smaller tile dims"); + + if (this->m_prod_tile_dims > + static_cast(this->m_max_total_tile_size)) { + std::string msg = + "Kokkos::MDRangePolicy tile dimensions error: Product of tile " + "dimensions (" + + std::to_string(static_cast(this->m_prod_tile_dims)) + + ") is greater than the maximum total tile size (" + + std::to_string(static_cast(this->m_max_total_tile_size)) + + ") - choose smaller tile dims\n"; + Kokkos::abort(msg.c_str()); } } }; diff --git a/lib/kokkos/core/src/Kokkos_Abort.hpp b/lib/kokkos/core/src/Kokkos_Abort.hpp index c1cd3958baf..61c8087a154 100644 --- a/lib/kokkos/core/src/Kokkos_Abort.hpp +++ b/lib/kokkos/core/src/Kokkos_Abort.hpp @@ -42,7 +42,7 @@ namespace Impl { #elif defined(KOKKOS_ENABLE_SYCL) && defined(__SYCL_DEVICE_ONLY__) // FIXME_SYCL SYCL doesn't abort #define KOKKOS_IMPL_ABORT_NORETURN -#elif !defined(KOKKOS_ENABLE_OPENMPTARGET) && 
!defined(KOKKOS_ENABLE_OPENACC) +#elif !defined(KOKKOS_ENABLE_OPENACC) // Host aborts #define KOKKOS_IMPL_ABORT_NORETURN [[noreturn]] #else @@ -59,9 +59,8 @@ namespace Impl { #define KOKKOS_IMPL_ABORT_NORETURN_DEVICE KOKKOS_IMPL_ABORT_NORETURN #endif -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) || \ - defined(KOKKOS_ENABLE_OPENACC) +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ + defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENACC) KOKKOS_IMPL_ABORT_NORETURN_DEVICE inline KOKKOS_IMPL_DEVICE_FUNCTION void device_abort(const char *const msg) { #if defined(KOKKOS_ENABLE_CUDA) @@ -70,8 +69,8 @@ device_abort(const char *const msg) { ::Kokkos::Impl::hip_abort(msg); #elif defined(KOKKOS_ENABLE_SYCL) ::Kokkos::Impl::sycl_abort(msg); -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_OPENACC) - printf("%s", msg); // FIXME_OPENMPTARGET FIXME_OPENACC +#elif defined(KOKKOS_ENABLE_OPENACC) + printf("%s", msg); // FIXME_OPENACC #else #error faulty logic #endif diff --git a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp index c259c63fb38..32bd33423c3 100644 --- a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -28,12 +28,7 @@ class AnonymousSpace { using device_type = Kokkos::Device; /**\brief Default memory space instance */ - AnonymousSpace() = default; - AnonymousSpace(AnonymousSpace &&rhs) = default; - AnonymousSpace(const AnonymousSpace &rhs) = default; - AnonymousSpace &operator=(AnonymousSpace &&) = default; - AnonymousSpace &operator=(const AnonymousSpace &) = default; - ~AnonymousSpace() = default; + AnonymousSpace() = default; /**\brief Return Name of the MemorySpace */ static constexpr const char *name() { return "Anonymous"; } diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp index 
a24a3d628aa..968510b1798 100644 --- a/lib/kokkos/core/src/Kokkos_Array.hpp +++ b/lib/kokkos/core/src/Kokkos_Array.hpp @@ -22,28 +22,11 @@ namespace Kokkos { #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK namespace Impl { -template > -struct ArrayBoundsCheck; -template -struct ArrayBoundsCheck { +struct ArrayBoundsCheck { KOKKOS_INLINE_FUNCTION - constexpr ArrayBoundsCheck(Integral i, size_t N) { - if (i < 0) { - char err[128] = "Kokkos::Array: index "; - to_chars_i(err + strlen(err), err + 128, i); - strcat(err, " < 0"); - Kokkos::abort(err); - } - ArrayBoundsCheck(i, N); - } -}; - -template -struct ArrayBoundsCheck { - KOKKOS_INLINE_FUNCTION - constexpr ArrayBoundsCheck(Integral i, size_t N) { - if (size_t(i) >= N) { + constexpr ArrayBoundsCheck(size_t i, size_t N) { + if (i >= N) { char err[128] = "Kokkos::Array: index "; to_chars_i(err + strlen(err), err + 128, i); strcat(err, " >= "); @@ -54,8 +37,7 @@ struct ArrayBoundsCheck { }; } // end namespace Impl -#define KOKKOS_ARRAY_BOUNDS_CHECK(i, N) \ - Kokkos::Impl::ArrayBoundsCheck(i, N) +#define KOKKOS_ARRAY_BOUNDS_CHECK(i, N) Kokkos::Impl::ArrayBoundsCheck(i, N) #else // !defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) @@ -66,11 +48,7 @@ struct ArrayBoundsCheck { /**\brief Derived from the C++17 'std::array'. * Dropping the iterator interface. 
*/ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -#else template -#endif struct Array { public: /** @@ -100,19 +78,13 @@ struct Array { return N; } - template - KOKKOS_INLINE_FUNCTION constexpr reference operator[](const iType& i) { - static_assert((std::is_integral_v || std::is_enum_v), - "Must be integral argument"); + KOKKOS_INLINE_FUNCTION constexpr reference operator[](size_type i) { KOKKOS_ARRAY_BOUNDS_CHECK(i, N); return m_internal_implementation_private_member_data[i]; } - template KOKKOS_INLINE_FUNCTION constexpr const_reference operator[]( - const iType& i) const { - static_assert((std::is_integral_v || std::is_enum_v), - "Must be integral argument"); + size_type i) const { KOKKOS_ARRAY_BOUNDS_CHECK(i, N); return m_internal_implementation_private_member_data[i]; } @@ -168,13 +140,8 @@ struct Array { } }; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -struct Array { -#else template struct Array { -#endif public: using reference = T&; using const_reference = std::add_const_t&; @@ -247,156 +214,6 @@ struct Array { Array&, Array&) noexcept {} }; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace Impl { -struct KokkosArrayContiguous {}; -struct KokkosArrayStrided {}; -} // namespace Impl - -template <> -struct KOKKOS_DEPRECATED Array { - using contiguous = Impl::KokkosArrayContiguous; - using strided = Impl::KokkosArrayStrided; -}; - -template -struct KOKKOS_DEPRECATED - Array { - private: - T* m_elem; - size_t m_size; - - public: - using reference = T&; - using const_reference = std::add_const_t&; - using size_type = size_t; - using difference_type = ptrdiff_t; - using value_type = T; - using pointer = T*; - using const_pointer = std::add_const_t*; - - KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size; } - KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 == m_size; } - KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size; } - - template - KOKKOS_INLINE_FUNCTION reference operator[](const 
iType& i) { - static_assert((std::is_integral_v || std::is_enum_v), - "Must be integral argument"); - KOKKOS_ARRAY_BOUNDS_CHECK(i, m_size); - return m_elem[i]; - } - - template - KOKKOS_INLINE_FUNCTION const_reference operator[](const iType& i) const { - static_assert((std::is_integral_v || std::is_enum_v), - "Must be integral argument"); - KOKKOS_ARRAY_BOUNDS_CHECK(i, m_size); - return m_elem[i]; - } - - KOKKOS_INLINE_FUNCTION pointer data() { return m_elem; } - KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem; } - - KOKKOS_DEFAULTED_FUNCTION ~Array() = default; - KOKKOS_INLINE_FUNCTION_DELETED Array() = delete; - KOKKOS_INLINE_FUNCTION_DELETED Array(const Array& rhs) = delete; - - // Some supported compilers are not sufficiently C++11 compliant - // for default move constructor and move assignment operator. - // Array( Array && rhs ) = default ; - // Array & operator = ( Array && rhs ) = delete ; - - KOKKOS_INLINE_FUNCTION - Array& operator=(const Array& rhs) { - if (&rhs == this) return *this; - const size_t n = size() < rhs.size() ? size() : rhs.size(); - for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i]; - return *this; - } - - template - KOKKOS_INLINE_FUNCTION Array& operator=(const Array& rhs) { - const size_t n = size() < rhs.size() ? 
size() : rhs.size(); - for (size_t i = 0; i < n; ++i) m_elem[i] = rhs[i]; - return *this; - } - - KOKKOS_INLINE_FUNCTION constexpr Array(pointer arg_ptr, size_type arg_size, - size_type = 0) - : m_elem(arg_ptr), m_size(arg_size) {} -}; - -template -struct KOKKOS_DEPRECATED - Array { - private: - T* m_elem; - size_t m_size; - size_t m_stride; - - public: - using reference = T&; - using const_reference = std::add_const_t&; - using size_type = size_t; - using difference_type = ptrdiff_t; - using value_type = T; - using pointer = T*; - using const_pointer = std::add_const_t*; - - KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size; } - KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 == m_size; } - KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size; } - - template - KOKKOS_INLINE_FUNCTION reference operator[](const iType& i) { - static_assert((std::is_integral_v || std::is_enum_v), - "Must be integral argument"); - KOKKOS_ARRAY_BOUNDS_CHECK(i, m_size); - return m_elem[i * m_stride]; - } - - template - KOKKOS_INLINE_FUNCTION const_reference operator[](const iType& i) const { - static_assert((std::is_integral_v || std::is_enum_v), - "Must be integral argument"); - KOKKOS_ARRAY_BOUNDS_CHECK(i, m_size); - return m_elem[i * m_stride]; - } - - KOKKOS_INLINE_FUNCTION pointer data() { return m_elem; } - KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem; } - - KOKKOS_DEFAULTED_FUNCTION ~Array() = default; - KOKKOS_INLINE_FUNCTION_DELETED Array() = delete; - KOKKOS_INLINE_FUNCTION_DELETED Array(const Array&) = delete; - - // Some supported compilers are not sufficiently C++11 compliant - // for default move constructor and move assignment operator. - // Array( Array && rhs ) = default ; - // Array & operator = ( Array && rhs ) = delete ; - - KOKKOS_INLINE_FUNCTION - Array& operator=(const Array& rhs) { - if (&rhs == this) return *this; - const size_t n = size() < rhs.size() ? 
size() : rhs.size(); - for (size_t i = 0; i < n; ++i) m_elem[i * m_stride] = rhs[i]; - return *this; - } - - template - KOKKOS_INLINE_FUNCTION Array& operator=(const Array& rhs) { - const size_t n = size() < rhs.size() ? size() : rhs.size(); - for (size_t i = 0; i < n; ++i) m_elem[i * m_stride] = rhs[i]; - return *this; - } - - KOKKOS_INLINE_FUNCTION constexpr Array(pointer arg_ptr, size_type arg_size, - size_type arg_stride) - : m_elem(arg_ptr), m_size(arg_size), m_stride(arg_stride) {} -}; -#endif - template Array(T, Us...) -> Array; diff --git a/lib/kokkos/core/src/Kokkos_Assert.hpp b/lib/kokkos/core/src/Kokkos_Assert.hpp index 134c7053723..e420e7516cf 100644 --- a/lib/kokkos/core/src/Kokkos_Assert.hpp +++ b/lib/kokkos/core/src/Kokkos_Assert.hpp @@ -45,9 +45,7 @@ #else // not debug mode #define KOKKOS_EXPECTS(...) #define KOKKOS_ENSURES(...) -#ifndef KOKKOS_ASSERT #define KOKKOS_ASSERT(...) -#endif // ifndef KOKKOS_ASSERT #endif // end debug mode ifdefs #endif /* #ifndef KOKKOS_ASSERT_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index 5499641ed63..28562ab8b9b 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -18,12 +18,6 @@ static_assert(false, namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -KOKKOS_DEPRECATED inline const char* atomic_query_version() { - return "KOKKOS_DESUL_ATOMICS"; -} -#endif - #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ !defined(__CUDA_ARCH__) @@ -67,9 +61,6 @@ KOKKOS_INLINE_FUNCTION void store_fence() { desul::atomic_thread_fence(desul::M // load/store template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_load (T const* ptr) { return desul::atomic_load (const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_store(T* ptr, 
Impl::not_deduced_atomic_t val) { return desul::atomic_store(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_store() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_assign(T* ptr, Impl::not_deduced_atomic_t val) { atomic_store(ptr, val); } -#endif // atomic_fetch_op template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_add(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } @@ -112,26 +103,19 @@ template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max(T template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or (T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_FUNCTION 
Impl::enable_if_atomic_t atomic_xor(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or (T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_nand(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift(T* ptr, Impl::not_deduced_atomic_t val) { 
desul::atomic_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc(T* ptr) { desul::atomic_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec(T* ptr) { desul::atomic_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_inc() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_increment(T* ptr) { atomic_inc(ptr); } -template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_dec() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_decrement(T* ptr) { atomic_dec(ptr); } -#endif // exchange template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_exchange (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_exchange (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return desul::atomic_compare_exchange(const_cast*>(ptr), expected, desired, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_compare_exchange() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange_strong(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return expected == atomic_compare_exchange(ptr, expected, desired); } -#endif // clang-format on } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp 
b/lib/kokkos/core/src/Kokkos_Complex.hpp index f265ee67a84..85c57c506b3 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -49,13 +49,6 @@ class KOKKOS_DEFAULTED_FUNCTION complex() = default; - //! Copy constructor. - KOKKOS_DEFAULTED_FUNCTION - complex(const complex&) noexcept = default; - - KOKKOS_DEFAULTED_FUNCTION - complex& operator=(const complex&) noexcept = default; - /// \brief Conversion constructor from compatible RType template , int> = 0> @@ -250,171 +243,6 @@ class template friend constexpr const RT&& get(const complex&&) noexcept; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - //! Copy constructor from volatile. - template , int> = 0> - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION - complex(const volatile complex& src) noexcept - // Intentionally do the conversions implicitly here so that users don't - // get any warnings about narrowing, etc., that they would expect to get - // otherwise. - : re_(src.re_), im_(src.im_) {} - - /// \brief Assignment operator, for volatile *this and - /// nonvolatile input. - /// - /// \param src [in] Input; right-hand side of the assignment. - /// - /// This operator returns \c void instead of volatile - /// complex& . See Kokkos Issue #177 for the - /// explanation. In practice, this means that you should not chain - /// assignments with volatile lvalues. - // - // Templated, so as not to be a copy assignment operator (Kokkos issue #2577) - // Intended to behave as - // void operator=(const complex&) volatile noexcept - // - // Use cases: - // complex r; - // const complex cr; - // volatile complex vl; - // vl = r; - // vl = cr; - template , int> = 0> - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator=( - const Complex& src) volatile noexcept { - re_ = src.re_; - im_ = src.im_; - // We deliberately do not return anything here. See explanation - // in public documentation above. - } - - //! 
Assignment operator, volatile LHS and volatile RHS - // TODO Should this return void like the other volatile assignment operators? - // - // Templated, so as not to be a copy assignment operator (Kokkos issue #2577) - // Intended to behave as - // volatile complex& operator=(const volatile complex&) volatile noexcept - // - // Use cases: - // volatile complex vr; - // const volatile complex cvr; - // volatile complex vl; - // vl = vr; - // vl = cvr; - template , int> = 0> - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile complex& operator=( - const volatile Complex& src) volatile noexcept { - re_ = src.re_; - im_ = src.im_; - return *this; - } - - //! Assignment operator, volatile RHS and non-volatile LHS - // - // Templated, so as not to be a copy assignment operator (Kokkos issue #2577) - // Intended to behave as - // complex& operator=(const volatile complex&) noexcept - // - // Use cases: - // volatile complex vr; - // const volatile complex cvr; - // complex l; - // l = vr; - // l = cvr; - // - template , int> = 0> - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( - const volatile Complex& src) noexcept { - re_ = src.re_; - im_ = src.im_; - return *this; - } - - // Mirroring the behavior of the assignment operators from complex RHS in the - // RealType RHS versions. - - //! Assignment operator (from a volatile real number). - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator=( - const volatile RealType& val) noexcept { - re_ = val; - im_ = RealType(0); - // We deliberately do not return anything here. See explanation - // in public documentation above. - } - - //! Assignment operator volatile LHS and non-volatile RHS - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( - const RealType& val) volatile noexcept { - re_ = val; - im_ = RealType(0); - return *this; - } - - //! Assignment operator volatile LHS and volatile RHS - // TODO Should this return void like the other volatile assignment operators? 
- KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( - const volatile RealType& val) volatile noexcept { - re_ = val; - im_ = RealType(0); - return *this; - } - - //! The imaginary part of this complex number (volatile overload). - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile RealType& - imag() volatile noexcept { - return im_; - } - - //! The real part of this complex number (volatile overload). - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile RealType& - real() volatile noexcept { - return re_; - } - - //! The imaginary part of this complex number (volatile overload). - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION RealType imag() const - volatile noexcept { - return im_; - } - - //! The real part of this complex number (volatile overload). - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION RealType real() const - volatile noexcept { - return re_; - } - - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator+=( - const volatile complex& src) volatile noexcept { - re_ += src.re_; - im_ += src.im_; - } - - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator+=( - const volatile RealType& src) volatile noexcept { - re_ += src; - } - - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator*=( - const volatile complex& src) volatile noexcept { - const RealType realPart = re_ * src.re_ - im_ * src.im_; - const RealType imagPart = re_ * src.im_ + im_ * src.re_; - - re_ = realPart; - im_ = imagPart; - } - - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator*=( - const volatile RealType& src) volatile noexcept { - re_ *= src; - im_ *= src; - } -#endif // KOKKOS_ENABLE_DEPRECATED_CODE_4 }; } // namespace Kokkos @@ -699,7 +527,8 @@ operator*(const complex& y, const RealType2& x) noexcept { //! Imaginary part of a complex number. 
template -KOKKOS_INLINE_FUNCTION RealType imag(const complex& x) noexcept { +KOKKOS_INLINE_FUNCTION constexpr RealType imag( + const complex& x) noexcept { return x.imag(); } @@ -711,7 +540,8 @@ KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t imag( //! Real part of a complex number. template -KOKKOS_INLINE_FUNCTION RealType real(const complex& x) noexcept { +KOKKOS_INLINE_FUNCTION constexpr RealType real( + const complex& x) noexcept { return x.real(); } @@ -796,9 +626,9 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex sqrt( //! Conjugate of a complex number. template -KOKKOS_INLINE_FUNCTION complex conj( +KOKKOS_INLINE_FUNCTION constexpr complex conj( const complex& x) noexcept { - return complex(real(x), -imag(x)); + return {real(x), -imag(x)}; } template @@ -808,6 +638,18 @@ KOKKOS_INLINE_FUNCTION constexpr complex> conj( return complex(x, -type()); } +//! Norm of a complex number. +template +KOKKOS_INLINE_FUNCTION constexpr RealType norm(const complex& x) { + return x.real() * x.real() + x.imag() * x.imag(); +} + +template +KOKKOS_INLINE_FUNCTION constexpr Impl::promote_t norm( + ArithmeticType x) { + return static_cast>(x) * x; +} + //! Exponential of a complex number. 
template KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index 32c3c6abb98..f1380561d53 100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -114,14 +114,14 @@ struct LaunchBounds { namespace Kokkos { -#define KOKKOS_IMPL_IS_CONCEPT(CONCEPT) \ +#define KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(TYPEDEF) \ template \ - struct is_##CONCEPT { \ + struct is_##TYPEDEF { \ private: \ template \ - using have_t = typename U::CONCEPT; \ + using have_t = typename U::TYPEDEF; \ template \ - using have_type_t = typename U::CONCEPT##_type; \ + using have_type_t = typename U::TYPEDEF##_type; \ \ public: \ static constexpr bool value = \ @@ -130,33 +130,42 @@ namespace Kokkos { constexpr operator bool() const noexcept { return value; } \ }; \ template \ - inline constexpr bool is_##CONCEPT##_v = is_##CONCEPT::value; + inline constexpr bool is_##TYPEDEF##_v = is_##TYPEDEF::value; + +#define KOKKOS_IMPL_DEFINE_CONCEPT_AND_TRAIT_FROM_TYPEDEF(TYPEDEF, \ + CXX20_CONCEPT) \ + KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(TYPEDEF) \ + template \ + concept CXX20_CONCEPT = is_##TYPEDEF##_v; // Public concept: -KOKKOS_IMPL_IS_CONCEPT(memory_space) -KOKKOS_IMPL_IS_CONCEPT(memory_traits) -KOKKOS_IMPL_IS_CONCEPT(execution_space) -KOKKOS_IMPL_IS_CONCEPT(execution_policy) -KOKKOS_IMPL_IS_CONCEPT(array_layout) -KOKKOS_IMPL_IS_CONCEPT(reducer) -KOKKOS_IMPL_IS_CONCEPT(team_handle) +KOKKOS_IMPL_DEFINE_CONCEPT_AND_TRAIT_FROM_TYPEDEF(memory_space, MemorySpace) +KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(memory_traits) +KOKKOS_IMPL_DEFINE_CONCEPT_AND_TRAIT_FROM_TYPEDEF(execution_space, + ExecutionSpace) +KOKKOS_IMPL_DEFINE_CONCEPT_AND_TRAIT_FROM_TYPEDEF(execution_policy, + ExecutionPolicy) +KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(array_layout) +KOKKOS_IMPL_DEFINE_CONCEPT_AND_TRAIT_FROM_TYPEDEF(reducer, Reducer) 
+KOKKOS_IMPL_DEFINE_CONCEPT_AND_TRAIT_FROM_TYPEDEF(team_handle, TeamHandle) namespace Experimental { -KOKKOS_IMPL_IS_CONCEPT(work_item_property) -KOKKOS_IMPL_IS_CONCEPT(hooks_policy) +KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(work_item_property) +KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(hooks_policy) } // namespace Experimental namespace Impl { // Implementation concept: -KOKKOS_IMPL_IS_CONCEPT(thread_team_member) -KOKKOS_IMPL_IS_CONCEPT(host_thread_team_member) -KOKKOS_IMPL_IS_CONCEPT(graph_kernel) +KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(thread_team_member) +KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(host_thread_team_member) +KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF(graph_kernel) } // namespace Impl -#undef KOKKOS_IMPL_IS_CONCEPT +#undef KOKKOS_IMPL_DEFINE_TRAIT_FROM_TYPEDEF +#undef KOKKOS_IMPL_DEFINE_CONCEPT_AND_TRAIT_FROM_TYPEDEF } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index ed14ec03674..c2014814e58 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -66,11 +66,34 @@ struct ViewFill { void operator()(const iType&) const { a() = val; } }; +// Increasing the number of elements per thread improves throughput for +// configurations that support StaticBatchSize. The values were found +// empirically. +template +struct ViewFillStaticBatchSize { + static constexpr int value = 1; +}; +#ifdef KOKKOS_ENABLE_CUDA +template +struct ViewFillStaticBatchSize { + static constexpr int value = 16; +}; +#endif +#ifdef KOKKOS_ENABLE_HIP +template +struct ViewFillStaticBatchSize { + static constexpr int value = size < 4 ? 
8 : 4; +}; +#endif + template struct ViewFill { ViewType a; typename ViewType::const_value_type val; - using policy_type = Kokkos::RangePolicy>; + using policy_type = Kokkos::RangePolicy< + ExecSpace, Kokkos::IndexType, + Kokkos::Experimental::StaticBatchSize::value>>; ViewFill(const ViewType& a_, typename ViewType::const_value_type& val_, const ExecSpace& space) @@ -3118,10 +3141,6 @@ inline auto create_mirror(const Kokkos::View& src, return dst_type(prop_copy, src.layout()); #endif } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } // namespace Impl @@ -3228,11 +3247,6 @@ inline auto choose_create_mirror( return create_mirror(arg_prop, src); } } - -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } // create a mirror view @@ -3266,10 +3280,6 @@ inline auto create_mirror_view( return Kokkos::Impl::choose_create_mirror(src, arg_prop); } } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } } // namespace Impl @@ -3375,10 +3385,6 @@ auto create_mirror_view_and_copy( deep_copy(mirror, src); return mirror; } -#if defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ - !defined(KOKKOS_COMPILER_MSVC) - __builtin_unreachable(); -#endif } // Previously when using auto here, the intel compiler 19.3 would diff --git a/lib/kokkos/core/src/Kokkos_Core.cppm b/lib/kokkos/core/src/Kokkos_Core.cppm index f5bbc645333..794c39a8be0 100644 --- a/lib/kokkos/core/src/Kokkos_Core.cppm +++ b/lib/kokkos/core/src/Kokkos_Core.cppm @@ -47,9 +47,6 @@ export { #ifdef KOKKOS_ENABLE_HPX using ::Kokkos::Experimental::HPX; #endif -#ifdef KOKKOS_ENABLE_OPENMPTARGET - using ::Kokkos::Experimental::OpenMPTarget; -#endif #ifdef KOKKOS_ENABLE_OPENACC using ::Kokkos::Experimental::OpenACC; #endif @@ -62,6 +59,7 @@ export 
{ using ::Kokkos::DefaultHostExecutionSpace; using ::Kokkos::Device; using ::Kokkos::device_id; + using ::Kokkos::ExecutionSpace; using ::Kokkos::has_shared_host_pinned_space; using ::Kokkos::has_shared_space; using ::Kokkos::HostSpace; @@ -72,6 +70,7 @@ export { using ::Kokkos::is_memory_space; using ::Kokkos::is_memory_space_v; using ::Kokkos::is_space; + using ::Kokkos::MemorySpace; using ::Kokkos::ScratchMemorySpace; using ::Kokkos::ScratchRequest; #ifdef KOKKOS_HAS_SHARED_SPACE @@ -146,6 +145,7 @@ export { using ::Kokkos::default_inner_direction; using ::Kokkos::default_outer_direction; using ::Kokkos::Dynamic; + using ::Kokkos::ExecutionPolicy; using ::Kokkos::IndexType; using ::Kokkos::is_execution_policy; using ::Kokkos::is_execution_policy_v; @@ -166,7 +166,7 @@ export { using ::Kokkos::Schedule; using ::Kokkos::single; using ::Kokkos::Static; - using ::Kokkos::team_policy_check_valid_storage_level_argument; + using ::Kokkos::TeamHandle; using ::Kokkos::TeamPolicy; using ::Kokkos::TeamThreadMDRange; using ::Kokkos::TeamThreadRange; @@ -264,6 +264,7 @@ export { using ::Kokkos::MinMaxLocScalar; using ::Kokkos::MinMaxScalar; using ::Kokkos::Prod; + using ::Kokkos::Reducer; using ::Kokkos::reduction_identity; using ::Kokkos::StdIsPartitioned; // FIXME Move to algorithms using ::Kokkos::StdIsPartScalar; // FIXME Move to algorithms @@ -483,15 +484,38 @@ export { using ::Kokkos::fmod; using ::Kokkos::fmodf; using ::Kokkos::fmodl; + using ::Kokkos::fpclassify; + using ::Kokkos::frexp; + using ::Kokkos::frexpf; + using ::Kokkos::frexpl; using ::Kokkos::hypot; using ::Kokkos::hypotf; using ::Kokkos::hypotl; + using ::Kokkos::ilogb; + using ::Kokkos::ilogbf; + using ::Kokkos::ilogbl; using ::Kokkos::isfinite; + using ::Kokkos::isgreater; + using ::Kokkos::isgreaterequal; using ::Kokkos::isinf; + using ::Kokkos::isless; + using ::Kokkos::islessequal; + using ::Kokkos::islessgreater; using ::Kokkos::isnan; + using ::Kokkos::isnormal; + using ::Kokkos::isunordered; + 
using ::Kokkos::ldexp; + using ::Kokkos::ldexpf; + using ::Kokkos::ldexpl; using ::Kokkos::lgamma; using ::Kokkos::lgammaf; using ::Kokkos::lgammal; + using ::Kokkos::llrint; + using ::Kokkos::llrintf; + using ::Kokkos::llrintl; + using ::Kokkos::llround; + using ::Kokkos::llroundf; + using ::Kokkos::llroundl; using ::Kokkos::log; using ::Kokkos::log10; using ::Kokkos::log10f; @@ -507,6 +531,15 @@ export { using ::Kokkos::logbl; using ::Kokkos::logf; using ::Kokkos::logl; + using ::Kokkos::lrint; + using ::Kokkos::lrintf; + using ::Kokkos::lrintl; + using ::Kokkos::lround; + using ::Kokkos::lroundf; + using ::Kokkos::lroundl; + using ::Kokkos::modf; + using ::Kokkos::modff; + using ::Kokkos::modfl; using ::Kokkos::nan; using ::Kokkos::nanf; using ::Kokkos::nanl; @@ -516,18 +549,37 @@ export { using ::Kokkos::nextafter; using ::Kokkos::nextafterf; using ::Kokkos::nextafterl; + using ::Kokkos::nexttoward; + using ::Kokkos::nexttowardf; + using ::Kokkos::nexttowardl; + using ::Kokkos::norm; using ::Kokkos::pow; using ::Kokkos::powf; using ::Kokkos::powl; + using ::Kokkos::rcp; + using ::Kokkos::rcpf; + using ::Kokkos::rcpl; using ::Kokkos::remainder; using ::Kokkos::remainderf; using ::Kokkos::remainderl; + using ::Kokkos::remquo; + using ::Kokkos::remquof; + using ::Kokkos::remquol; + using ::Kokkos::rint; + using ::Kokkos::rintf; + using ::Kokkos::rintl; using ::Kokkos::round; using ::Kokkos::roundf; using ::Kokkos::roundl; using ::Kokkos::rsqrt; using ::Kokkos::rsqrtf; using ::Kokkos::rsqrtl; + using ::Kokkos::scalbln; + using ::Kokkos::scalblnf; + using ::Kokkos::scalblnl; + using ::Kokkos::scalbn; + using ::Kokkos::scalbnf; + using ::Kokkos::scalbnl; using ::Kokkos::signbit; using ::Kokkos::sin; using ::Kokkos::sinf; diff --git a/lib/kokkos/core/src/Kokkos_Core_Impl.cppm b/lib/kokkos/core/src/Kokkos_Core_Impl.cppm index 69763084222..2bb041c6277 100644 --- a/lib/kokkos/core/src/Kokkos_Core_Impl.cppm +++ b/lib/kokkos/core/src/Kokkos_Core_Impl.cppm @@ -76,7 +76,6 @@ 
export { // execution policies namespace Impl { - using ::Kokkos::Impl::get_tile_size_properties; using ::Kokkos::Impl::ParallelConstructName; using ::Kokkos::Impl::PolicyTraits; using ::Kokkos::Impl::PolicyUpdate; @@ -86,6 +85,7 @@ export { // miscellaneous namespace Impl { using ::Kokkos::Impl::FunctorAnalysis; + using ::Kokkos::Impl::integral_constant; using ::Kokkos::Impl::python_view_type_impl_t; using ::Kokkos::Impl::throw_runtime_exception; } // namespace Impl diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 4c46d45b211..12eeb4005d0 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -68,7 +68,7 @@ class InitializationSettings; /// Define Kokkos::DefaultExecutionSpace as per configuration option /// or chosen from the enabled execution spaces in the following order: -/// Kokkos::Cuda, Kokkos::Experimental::OpenMPTarget, Kokkos::OpenMP, +/// Kokkos::Cuda, Kokkos::OpenMP, /// Kokkos::Threads, Kokkos::Serial #if defined(__clang_analyzer__) @@ -85,9 +85,6 @@ namespace Kokkos { #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Cuda; -#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET) -using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = - Experimental::OpenMPTarget; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = HIP; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) @@ -106,7 +103,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial; #else #error \ - "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, 
Kokkos::Experimental::HPX, or Kokkos::Serial." + "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::SYCL, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." #endif #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) @@ -149,7 +146,7 @@ using SharedSpace = HIPManagedSpace; using SharedSpace = SYCLSharedUSMSpace; #define KOKKOS_HAS_SHARED_SPACE // if only host compile point to HostSpace -#elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) +#elif !defined(KOKKOS_ENABLE_OPENACC) using SharedSpace = HostSpace; #define KOKKOS_HAS_SHARED_SPACE #endif @@ -170,7 +167,7 @@ using SharedHostPinnedSpace = HIPHostPinnedSpace; #elif defined(KOKKOS_ENABLE_SYCL) using SharedHostPinnedSpace = SYCLHostUSMSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE -#elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) +#elif !defined(KOKKOS_ENABLE_OPENACC) using SharedHostPinnedSpace = HostSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #endif diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index b2e53ae228c..eeea0a848b7 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -79,12 +79,7 @@ class Crs { /* * Default Constructors, operators and destructor */ - KOKKOS_DEFAULTED_FUNCTION Crs() = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; - KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs() = default; /** \brief Assign to a view of the rhs array. 
* If the old view is the last view diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index a0d62e24e8a..d560d73728b 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -20,6 +20,7 @@ static_assert(false, #include #endif #include +#include #include //---------------------------------------------------------------------------- @@ -33,11 +34,6 @@ struct ParallelReduceTag {}; struct ChunkSize { int value; explicit ChunkSize(int value_) : value(value_) {} -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - template - KOKKOS_DEPRECATED_WITH_COMMENT("ChunkSize should be constructed explicitly.") - ChunkSize(int value_) : value(value_) {} -#endif }; namespace Impl { @@ -179,15 +175,6 @@ class RangePolicy : public Impl::PolicyTraits { this->m_space = std::move(space); } - public: -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead") - inline void set(ChunkSize chunksize) { - m_granularity = chunksize.value; - m_granularity_mask = m_granularity - 1; - } -#endif - public: /** \brief return chunk_size */ inline member_type chunk_size() const { return m_granularity; } @@ -345,8 +332,6 @@ class RangePolicy : public Impl::PolicyTraits { private: member_type m_begin; member_type m_end; - WorkRange(); - WorkRange& operator=(const WorkRange&); }; }; @@ -553,6 +538,9 @@ inline std::enable_if_t, int> extract_vector_length( return 1; } +// Causes abnormal program termination if level is not `0` or `1` +void team_policy_check_valid_storage_level_argument(int level); + } // namespace Impl Impl::PerTeamValue PerTeam(const size_t& arg); @@ -594,9 +582,6 @@ struct ScratchRequest { } }; -// Causes abnormal program termination if level is not `0` or `1` -void team_policy_check_valid_storage_level_argument(int level); - /** \brief Execution policy for parallel work over a league of teams of * threads. 
* @@ -635,6 +620,49 @@ class TeamPolicy template friend class TeamPolicy; + static int validate_league_size_argument(int league_size) { + if (league_size < 0) { + std::stringstream err; + err << "Kokkos::TeamPolicy error: league_size (" << league_size + << ") must be greater than or equal to 0"; + Kokkos::abort(err.str().c_str()); + } + return league_size; + } + static int validate_team_size_argument(int team_size) { + if (team_size < 1) { + std::stringstream err; + err << "Kokkos::TeamPolicy error: team_size (" << team_size + << ") must be greater than or equal to 1"; + Kokkos::abort(err.str().c_str()); + } + return team_size; + } + static int validate_vector_length_argument(int vector_length) { + if (vector_length < 1) { + std::stringstream err; + err << "Kokkos::TeamPolicy error: vector_length (" << vector_length + << ") must be greater than or equal to 1"; + Kokkos::abort(err.str().c_str()); + } +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_5 + int const vector_length_max = internal_policy::vector_length_max(); + if (vector_length > vector_length_max) { + std::stringstream err; + err << "Kokkos::TeamPolicy error: vector_length (" << vector_length + << ") exceeds the maximum allowed (" << vector_length_max << ")"; + Kokkos::abort(err.str().c_str()); + } + if (!Kokkos::has_single_bit(static_cast(vector_length))) { + std::stringstream err; + err << "Kokkos::TeamPolicy error: vector_length (" << vector_length + << ") must be a power of 2"; + Kokkos::abort(err.str().c_str()); + } +#endif + return vector_length; + } + public: using traits = Impl::PolicyTraits; @@ -643,47 +671,46 @@ class TeamPolicy TeamPolicy() : internal_policy(0, AUTO) {} /** \brief Construct policy with the given instance of the execution space */ - TeamPolicy(const typename traits::execution_space& space_, - int league_size_request, int team_size_request, - int vector_length_request = 1) - : internal_policy(space_, league_size_request, team_size_request, - vector_length_request) {} - - TeamPolicy(const 
typename traits::execution_space& space_, - int league_size_request, const Kokkos::AUTO_t&, - int vector_length_request = 1) - : internal_policy(space_, league_size_request, Kokkos::AUTO(), - vector_length_request) {} - - TeamPolicy(const typename traits::execution_space& space_, - int league_size_request, const Kokkos::AUTO_t&, - const Kokkos::AUTO_t&) - : internal_policy(space_, league_size_request, Kokkos::AUTO(), - Kokkos::AUTO()) {} - TeamPolicy(const typename traits::execution_space& space_, - int league_size_request, const int team_size_request, - const Kokkos::AUTO_t&) - : internal_policy(space_, league_size_request, team_size_request, - Kokkos::AUTO()) {} + TeamPolicy(const typename traits::execution_space& space_, int league_size, + int team_size, int vector_length = 1) + : internal_policy(space_, validate_league_size_argument(league_size), + validate_team_size_argument(team_size), + validate_vector_length_argument(vector_length)) {} + + TeamPolicy(const typename traits::execution_space& space_, int league_size, + Kokkos::AUTO_t, int vector_length = 1) + : internal_policy(space_, validate_league_size_argument(league_size), + Kokkos::AUTO, + validate_vector_length_argument(vector_length)) {} + + TeamPolicy(const typename traits::execution_space& space_, int league_size, + Kokkos::AUTO_t, Kokkos::AUTO_t) + : internal_policy(space_, league_size, Kokkos::AUTO, Kokkos::AUTO) {} + + TeamPolicy(const typename traits::execution_space& space_, int league_size, + const int team_size, Kokkos::AUTO_t) + : internal_policy(space_, validate_league_size_argument(league_size), + validate_team_size_argument(team_size), Kokkos::AUTO) {} + /** \brief Construct policy with the default instance of the execution space */ - TeamPolicy(int league_size_request, int team_size_request, - int vector_length_request = 1) - : internal_policy(league_size_request, team_size_request, - vector_length_request) {} - - TeamPolicy(int league_size_request, const Kokkos::AUTO_t&, - int 
vector_length_request = 1) - : internal_policy(league_size_request, Kokkos::AUTO(), - vector_length_request) {} - - TeamPolicy(int league_size_request, const Kokkos::AUTO_t&, - const Kokkos::AUTO_t&) - : internal_policy(league_size_request, Kokkos::AUTO(), Kokkos::AUTO()) {} - TeamPolicy(int league_size_request, const int team_size_request, - const Kokkos::AUTO_t&) - : internal_policy(league_size_request, team_size_request, - Kokkos::AUTO()) {} + TeamPolicy(int league_size, int team_size, int vector_length = 1) + : internal_policy(validate_league_size_argument(league_size), + validate_team_size_argument(team_size), + validate_vector_length_argument(vector_length)) {} + + TeamPolicy(int league_size, Kokkos::AUTO_t, int vector_length = 1) + : internal_policy(validate_league_size_argument(league_size), + Kokkos::AUTO, + validate_vector_length_argument(vector_length)) {} + + TeamPolicy(int league_size, Kokkos::AUTO_t, Kokkos::AUTO_t) + : internal_policy(validate_league_size_argument(league_size), + Kokkos::AUTO, Kokkos::AUTO) {} + + TeamPolicy(int league_size, int team_size, Kokkos::AUTO_t) + : internal_policy(validate_league_size_argument(league_size), + validate_team_size_argument(team_size), Kokkos::AUTO) {} template TeamPolicy(const TeamPolicy p) : internal_policy(p) { @@ -715,27 +742,27 @@ class TeamPolicy internal_policy&>, "internal set_chunk_size should return a reference"); - team_policy_check_valid_storage_level_argument(level); + Impl::team_policy_check_valid_storage_level_argument(level); return static_cast( internal_policy::set_scratch_size(level, per_team)); } inline TeamPolicy& set_scratch_size(const int& level, const Impl::PerThreadValue& per_thread) { - team_policy_check_valid_storage_level_argument(level); + Impl::team_policy_check_valid_storage_level_argument(level); return static_cast( internal_policy::set_scratch_size(level, per_thread)); } inline TeamPolicy& set_scratch_size(const int& level, const Impl::PerTeamValue& per_team, const 
Impl::PerThreadValue& per_thread) { - team_policy_check_valid_storage_level_argument(level); + Impl::team_policy_check_valid_storage_level_argument(level); return static_cast( internal_policy::set_scratch_size(level, per_team, per_thread)); } inline TeamPolicy& set_scratch_size(const int& level, const Impl::PerThreadValue& per_thread, const Impl::PerTeamValue& per_team) { - team_policy_check_valid_storage_level_argument(level); + Impl::team_policy_check_valid_storage_level_argument(level); return static_cast( internal_policy::set_scratch_size(level, per_team, per_thread)); } diff --git a/lib/kokkos/core/src/Kokkos_Graph.hpp b/lib/kokkos/core/src/Kokkos_Graph.hpp index ad45c925abe..6c96184cf99 100644 --- a/lib/kokkos/core/src/Kokkos_Graph.hpp +++ b/lib/kokkos/core/src/Kokkos_Graph.hpp @@ -16,6 +16,7 @@ // GraphAccess needs to be defined, not just declared #include +#include #include #include @@ -42,6 +43,8 @@ struct [[nodiscard]] Graph { //---------------------------------------------------------------------------- private: + using device_handle_t = Kokkos::Impl::DeviceHandle; + //---------------------------------------------------------------------------- // {{{2 @@ -64,8 +67,8 @@ struct [[nodiscard]] Graph { public: // Construct an empty graph with a root node. 
- Graph(ExecutionSpace exec = ExecutionSpace{}) - : m_impl_ptr{std::make_shared(std::move(exec))}, + Graph(const device_handle_t& device_handle = device_handle_t{}) + : m_impl_ptr{std::make_shared(device_handle)}, m_root{m_impl_ptr->create_root_node_ptr()} {} #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ @@ -75,15 +78,15 @@ struct [[nodiscard]] Graph { #if defined(KOKKOS_ENABLE_CXX20) requires std::same_as #endif - Graph(ExecutionSpace exec, T&& native_graph) - : m_impl_ptr{std::make_shared(std::move(exec), + Graph(const device_handle_t& device_handle, T&& native_graph) + : m_impl_ptr{std::make_shared(device_handle, std::forward(native_graph))}, m_root{m_impl_ptr->create_root_node_ptr()} { } #endif - ExecutionSpace const& get_execution_space() const { - return m_impl_ptr->get_execution_space(); + const auto& get_device_handle() const { + return m_impl_ptr->get_device_handle(); } // Once the graph is instantiated, it is undefined behavior to add nodes. @@ -96,6 +99,14 @@ struct [[nodiscard]] Graph { auto root_node() const { return root_t{m_impl_ptr, m_root}; } + // The graph is started once previous work on the execution space has + // finished. + // TODO: The graph nodes are created with user-provided device handles. + // However, preliminary work (e.g., copying the driver to the device for + // global launch) is enqueued in the device handle execution space + // instance. Currently, the user is responsible for adding proper + // synchronization for node preliminary work. Ideally, the graph itself + // should handle this synchronization on first submission. void submit(const execution_space& exec = execution_space{}) const { KOKKOS_EXPECTS(bool(m_impl_ptr)) (*m_impl_ptr).submit(exec); @@ -141,13 +152,15 @@ auto when_all(PredecessorRefs&&... 
arg_pred_refs) { // {{{1 template -Graph create_graph(ExecutionSpace ex, Closure&& arg_closure) { +Graph create_graph( + const Kokkos::Impl::DeviceHandle& device_handle, + Closure&& arg_closure) { // Create a shared pointer to the graph: // We need an attorney class here so we have an implementation friend to // create a Graph class without graph having public constructors. We can't // just make `create_graph` itself a friend because of the way that friend // function template injection works. - Graph rv{std::move(ex)}; + Graph rv{device_handle}; // Invoke the user's graph construction closure ((Closure&&)arg_closure)(rv.root_node()); // and given them back the graph @@ -161,7 +174,8 @@ template < std::enable_if_t>, Graph> create_graph(Closure&& arg_closure) { - return create_graph(ExecutionSpace{}, (Closure&&)arg_closure); + return create_graph(Kokkos::Impl::DeviceHandle{}, + (Closure&&)arg_closure); } // end create_graph }}}1 diff --git a/lib/kokkos/core/src/Kokkos_GraphNode.hpp b/lib/kokkos/core/src/Kokkos_GraphNode.hpp index 766f1af3be3..e9336838225 100644 --- a/lib/kokkos/core/src/Kokkos_GraphNode.hpp +++ b/lib/kokkos/core/src/Kokkos_GraphNode.hpp @@ -20,13 +20,25 @@ static_assert(false, #include #include // GraphAccess #include +#include #include // std::shared_ptr namespace Kokkos { namespace Experimental { -template {}) \ + Kokkos::abort( \ + "The execution space instance of the execution policy of a graph " \ + "node must be the default one."); + +template +concept ExecutionPolicyOn = + ExecutionPolicy && std::same_as; + +template class GraphNodeRef { //---------------------------------------------------------------------------- @@ -40,10 +52,6 @@ class GraphNodeRef { Kokkos::Impl::is_specialization_of::value, "Invalid predecessor template parameter given to GraphNodeRef"); - static_assert( - Kokkos::is_execution_space::value, - "Invalid execution space template parameter given to GraphNodeRef"); - static_assert(std::is_same_v || 
Kokkos::Impl::is_graph_kernel::value || Kokkos::Impl::is_graph_capture_v || @@ -69,10 +77,12 @@ class GraphNodeRef { //---------------------------------------------------------------------------- private: + using device_handle_t = Kokkos::Impl::DeviceHandle; + //---------------------------------------------------------------------------- // {{{2 - template + template friend class GraphNodeRef; friend struct Kokkos::Impl::GraphAccess; friend struct Graph; @@ -136,7 +146,7 @@ class GraphNodeRef { m_graph_impl, Kokkos::Impl::GraphAccess::make_node_shared_ptr< typename return_t::node_impl_t>( - m_node_impl->execution_space_instance(), + m_node_impl->get_device_handle(), Kokkos::Impl::_graph_node_kernel_ctor_tag{}, (NextKernelDeduced&&)arg_kernel, // *this is the predecessor @@ -171,12 +181,7 @@ class GraphNodeRef { // {{{3 // Copyable and movable (basically just shared_ptr semantics - GraphNodeRef() noexcept = default; - GraphNodeRef(GraphNodeRef const&) = default; - GraphNodeRef(GraphNodeRef&&) noexcept = default; - GraphNodeRef& operator=(GraphNodeRef const&) = default; - GraphNodeRef& operator=(GraphNodeRef&&) noexcept = default; - ~GraphNodeRef() = default; + GraphNodeRef() noexcept = default; // end rule of 6 ctors }}}3 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -217,32 +222,39 @@ class GraphNodeRef { // TODO We should do better than a p-for (that uses registers, heavier). // This should "just" launch the function on device with our driver. 
- template < - typename Label, typename Policy, typename Functor, - std::enable_if_t< - std::is_invocable_r_v> && - Kokkos::Impl::is_view_label_v> && - Kokkos::Impl::is_specialization_of_v, - int> = 0> - auto then(Label&& label, const ExecutionSpace& exec, Policy&& policy, - Functor&& functor) const { + template + requires(Kokkos::Impl::NodeProperties> && + std::is_invocable_r_v> && + Kokkos::Impl::is_specialization_of_v) + auto then(Props&& props, Policy&& policy, Functor&& functor) const { using next_kernel_t = Kokkos::Impl::GraphNodeThenImpl, std::remove_cvref_t>; - return this->_then_kernel(next_kernel_t(std::forward end then_parallel_for }}}2 @@ -434,33 +447,24 @@ class GraphNodeRef { return static_cast(v2); } - template > - is_execution_policy>::value, - // -------------------- - int> = 0> - auto then_parallel_reduce(std::string arg_name, Policy&& arg_policy, + template + requires(Kokkos::Impl::NodeProperties> && + ExecutionPolicyOn, ExecutionSpace>) + auto then_parallel_reduce(Props&& props, Policy&& arg_policy, Functor&& functor, ReturnType&& return_value) const { auto graph_impl_ptr = m_graph_impl.lock(); KOKKOS_EXPECTS(bool(graph_impl_ptr)) KOKKOS_EXPECTS(bool(m_node_impl)) - // TODO @graph restore this expectation once we add comparability to space - // instances - // KOKKOS_EXPECTS( - // arg_policy.space() == m_graph_impl->get_execution_space()); + + KOKKOS_IMPL_POLICY_ON_DEFAULT_EXEC(arg_policy) // needs static assertion of constraint: // DataParallelReductionFunctor - using policy_t = std::remove_cv_t>; - static_assert( - std::is_same_v, - // TODO @graph make defaulted execution space work - // || policy_t::execution_space_is_defaulted, - "Execution Space mismatch between execution policy and graph"); + auto full_props = with_properties_if_unset( + std::forward(props), graph_impl_ptr->get_device_handle(), + "[unlabeled]"); // This is also just an expectation, but it's one that we expect the user // to interact with (even in release mode), so we should 
throw an exception @@ -469,7 +473,8 @@ class GraphNodeRef { // whether or not they point to a View as a runtime boolean rather than part // of the type. if (Kokkos::Impl::parallel_reduce_needs_fence( - graph_impl_ptr->get_execution_space(), return_value)) { + Kokkos::Impl::get_property(full_props).m_exec, + return_value)) { Kokkos::Impl::throw_runtime_exception( "Parallel reductions in graphs can't operate on Reducers that " "reference a scalar because they can't complete synchronously. Use a " @@ -517,8 +522,10 @@ class GraphNodeRef { // End of Kokkos reducer disaster //---------------------------------------- - auto policy = Experimental::require((Policy&&)arg_policy, - Kokkos::Impl::KernelInGraphProperty{}); + auto policy = Experimental::require( + Policy(Kokkos::Impl::PolicyUpdate{}, (Policy&&)arg_policy, + Kokkos::Impl::get_property(full_props).m_exec), + Kokkos::Impl::KernelInGraphProperty{}); using passed_reducer_type = typename return_value_adapter::reducer_type; @@ -545,21 +552,18 @@ class GraphNodeRef { Kokkos::ParallelReduceTag>; return this->_then_kernel(next_kernel_t{ - std::move(arg_name), graph_impl_ptr->get_execution_space(), - functor_reducer, (Policy&&)policy, - return_value_adapter::return_value(return_value, functor)}); + Kokkos::Impl::extract_property(full_props), + Kokkos::Impl::extract_property(full_props).m_exec, + std::move(functor_reducer), std::move(policy), + return_value_adapter::return_value(return_value, + std::forward(functor))}); } - template > - is_execution_policy>::value, - // -------------------- - int> = 0> + template + requires ExecutionPolicyOn, ExecutionSpace> auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", (Policy&&)arg_policy, + return this->then_parallel_reduce(node_props(), (Policy&&)arg_policy, (Functor&&)functor, (ReturnType&&)return_value); } @@ -570,16 +574,18 @@ class GraphNodeRef { Functor&& functor, ReturnType&& 
return_value) const { return this->then_parallel_reduce( - std::move(label), Kokkos::RangePolicy{0, idx_end}, - (Functor&&)functor, (ReturnType&&)return_value); + node_props(std::move(label)), + Kokkos::RangePolicy{0, idx_end}, (Functor&&)functor, + (ReturnType&&)return_value); } template auto then_parallel_reduce(typename execution_space::size_type idx_end, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", idx_end, (Functor&&)functor, - (ReturnType&&)return_value); + return this->then_parallel_reduce( + node_props(), Kokkos::RangePolicy{0, idx_end}, + (Functor&&)functor, (ReturnType&&)return_value); } // end then_parallel_reduce }}}2 diff --git a/lib/kokkos/core/src/Kokkos_Graph_fwd.hpp b/lib/kokkos/core/src/Kokkos_Graph_fwd.hpp index 2a55d6cca35..9fc6d62ca7d 100644 --- a/lib/kokkos/core/src/Kokkos_Graph_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Graph_fwd.hpp @@ -8,7 +8,7 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_GRAPH_FWD #endif -#include +#include namespace Kokkos { namespace Experimental { @@ -18,7 +18,7 @@ struct TypeErasedTag {}; template struct Graph; -template class GraphNodeRef; diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index 372ecb56929..3979efa5821 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -50,34 +50,7 @@ class HostSpace { //! 
This memory space preferred device_type using device_type = Kokkos::Device; - HostSpace() = default; - HostSpace(HostSpace&& rhs) = default; - HostSpace(const HostSpace& rhs) = default; - HostSpace& operator=(HostSpace&&) = default; - HostSpace& operator=(const HostSpace&) = default; - ~HostSpace() = default; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - /**\brief Non-default memory space instance to choose allocation mechansim, - * if available */ - -#if defined(KOKKOS_COMPILER_GNU) && KOKKOS_COMPILER_GNU < 1100 - // We see deprecation warnings even when not using the deprecated - // HostSpace constructor below when using gcc before release 11. - enum -#else - enum KOKKOS_DEPRECATED -#endif - AllocationMechanism { - STD_MALLOC, - POSIX_MEMALIGN, - POSIX_MMAP, - INTEL_MM_ALLOC - }; - - KOKKOS_DEPRECATED - explicit HostSpace(const AllocationMechanism&); -#endif + HostSpace() = default; /**\brief Allocate untracked memory in the space */ template diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp index 965fb5339d4..c6717cd4ff0 100644 --- a/lib/kokkos/core/src/Kokkos_Layout.hpp +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -46,11 +46,6 @@ struct LayoutLeft { enum : bool { is_extent_constructible = true }; - LayoutLeft(LayoutLeft const&) = default; - LayoutLeft(LayoutLeft&&) = default; - LayoutLeft& operator=(LayoutLeft const&) = default; - LayoutLeft& operator=(LayoutLeft&&) = default; - KOKKOS_INLINE_FUNCTION explicit constexpr LayoutLeft(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -100,11 +95,6 @@ struct LayoutRight { enum : bool { is_extent_constructible = true }; - LayoutRight(LayoutRight const&) = default; - LayoutRight(LayoutRight&&) = default; - LayoutRight& operator=(LayoutRight const&) = default; - LayoutRight& operator=(LayoutRight&&) = default; - KOKKOS_INLINE_FUNCTION explicit constexpr LayoutRight(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N1 = 
KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -141,11 +131,6 @@ struct LayoutStride { enum : bool { is_extent_constructible = false }; - LayoutStride(LayoutStride const&) = default; - LayoutStride(LayoutStride&&) = default; - LayoutStride& operator=(LayoutStride const&) = default; - LayoutStride& operator=(LayoutStride&&) = default; - /** \brief Compute strides from ordered dimensions. * * Values of order uniquely form the set [0..rank) @@ -214,11 +199,6 @@ enum class Iterate { Right // Right indices stride fastest }; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -struct KOKKOS_DEPRECATED is_layouttiled : std::false_type {}; -#endif - namespace Impl { // For use with view_copy template @@ -250,12 +230,6 @@ struct layout_iterate_type_selector { }; } // namespace Impl -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -using layout_iterate_type_selector KOKKOS_DEPRECATED = - Impl::layout_iterate_type_selector; -#endif - } // namespace Kokkos #endif // #ifndef KOKKOS_LAYOUT_HPP diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 842b259abbd..3e476d22bd8 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -11,15 +11,18 @@ * KOKKOS_ENABLE_THREADS Kokkos::Threads execution space * KOKKOS_ENABLE_HPX Kokkos::Experimental::HPX execution space * KOKKOS_ENABLE_OPENMP Kokkos::OpenMP execution space - * KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget - * execution space * KOKKOS_ENABLE_HIP Kokkos::HIP execution space * KOKKOS_ENABLE_SYCL Kokkos::SYCL execution space * KOKKOS_ENABLE_HWLOC HWLOC library is available. * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive! - * KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space. 
*/ +#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H +#include +#include +#include +#endif + #define KOKKOS_VERSION_LESS(MAJOR, MINOR, PATCH) \ (KOKKOS_VERSION < ((MAJOR)*10000 + (MINOR)*100 + (PATCH))) @@ -40,12 +43,6 @@ #error implementation bug #endif -#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H -#include -#include -#include -#endif - #if __has_include() #include #else @@ -77,11 +74,11 @@ //---------------------------------------------------------------------------- -#if defined(KOKKOS_ENABLE_ATOMICS_BYPASS) && \ - (defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_CUDA) || \ - defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_HPX) || \ - defined(KOKKOS_ENABLE_OPENMPTARGET) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENACC)) +#if defined(KOKKOS_ENABLE_ATOMICS_BYPASS) && \ + (defined(KOKKOS_ENABLE_THREADS) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_HPX) || \ + defined(KOKKOS_ENABLE_HIP) || defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_ENABLE_OPENACC)) #error Atomics may only be disabled if neither a host parallel nor a device backend is enabled #endif @@ -128,7 +125,8 @@ // CRAY compiler for host code #define KOKKOS_COMPILER_CRAYC _CRAYC -#elif defined(__APPLE_CC__) +#elif defined(__APPLE_CC__) && defined(__clang__) && \ + defined(__apple_build_version__) #define KOKKOS_COMPILER_APPLECC __APPLE_CC__ #elif defined(__NVCOMPILER) @@ -327,15 +325,12 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif -// FIXME_OPENACC FIXME_OPENMPTARGET +// FIXME_OPENACC // Move to setup files once there is more content // clang-format off #if defined(KOKKOS_ENABLE_OPENACC) #define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenACC backend" #endif -#if defined(KOKKOS_ENABLE_OPENMPTARGET) -#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenMPTarget backend" -#endif // clang-format on #if 
!defined(KOKKOS_IMPL_RELOCATABLE_FUNCTION) @@ -431,26 +426,24 @@ static_assert( // Determine the default execution space for parallel dispatch. // There is zero or one default execution space specified. -#if 1 < ((defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA) ? 1 : 0) + \ - (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) ? 1 : 0) + \ - (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) ? 1 : 0) + \ - (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) ? 1 : 0) + \ - (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET) ? 1 : 0) + \ - (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) ? 1 : 0) + \ - (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS) ? 1 : 0) + \ - (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX) ? 1 : 0) + \ +#if 1 < ((defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS) ? 1 : 0) + \ + (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX) ? 1 : 0) + \ (defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL) ? 1 : 0)) #error "More than one KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_* specified." #endif // If default is not specified then chose from enabled execution spaces. 
-// Priority: CUDA, HIP, SYCL, OPENACC, OPENMPTARGET, OPENMP, THREADS, HPX, +// Priority: CUDA, HIP, SYCL, OPENACC, OPENMP, THREADS, HPX, // SERIAL #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA) #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) -#elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET) #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS) #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HPX) @@ -463,8 +456,6 @@ static_assert( #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL #elif defined(KOKKOS_ENABLE_OPENACC) #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) -#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET #elif defined(KOKKOS_ENABLE_OPENMP) #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP #elif defined(KOKKOS_ENABLE_THREADS) @@ -490,39 +481,6 @@ static_assert( #define KOKKOS_IF_ON_HOST(CODE) NV_IF_TARGET(NV_IS_HOST, CODE) #endif -#ifdef KOKKOS_ENABLE_OPENMPTARGET -#ifdef KOKKOS_COMPILER_NVHPC -#define KOKKOS_IF_ON_DEVICE(CODE) \ - if (__builtin_is_device_code()) { \ - KOKKOS_IMPL_STRIP_PARENS(CODE) \ - } -#define KOKKOS_IF_ON_HOST(CODE) \ - if (!__builtin_is_device_code()) { \ - KOKKOS_IMPL_STRIP_PARENS(CODE) \ - } -#else -// Base function. 
-static constexpr bool kokkos_omp_on_host() { return true; } - -#pragma omp begin declare variant match(device = {kind(host)}) -static constexpr bool kokkos_omp_on_host() { return true; } -#pragma omp end declare variant - -#pragma omp begin declare variant match(device = {kind(nohost)}) -static constexpr bool kokkos_omp_on_host() { return false; } -#pragma omp end declare variant - -#define KOKKOS_IF_ON_DEVICE(CODE) \ - if constexpr (!kokkos_omp_on_host()) { \ - KOKKOS_IMPL_STRIP_PARENS(CODE) \ - } -#define KOKKOS_IF_ON_HOST(CODE) \ - if constexpr (kokkos_omp_on_host()) { \ - KOKKOS_IMPL_STRIP_PARENS(CODE) \ - } -#endif -#endif - #ifdef KOKKOS_ENABLE_OPENACC #ifdef KOKKOS_COMPILER_NVHPC #define KOKKOS_IF_ON_DEVICE(CODE) \ @@ -565,18 +523,11 @@ static constexpr bool kokkos_omp_on_host() { return false; } //---------------------------------------------------------------------------- -#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) -#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC -#endif - #define KOKKOS_INVALID_INDEX (~std::size_t(0)) #define KOKKOS_IMPL_CTOR_DEFAULT_ARG KOKKOS_INVALID_INDEX -// Guard intel compiler version 19 and older -// intel error #2651: attribute does not apply to any entity -// using KOKKOS_DEPRECATED = ... 
-#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && !defined(__NVCC__) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) #define KOKKOS_DEPRECATED [[deprecated]] #define KOKKOS_DEPRECATED_WITH_COMMENT(comment) [[deprecated(comment)]] #else @@ -599,33 +550,52 @@ static constexpr bool kokkos_omp_on_host() { return false; } // clang-format off #if defined(__NVCOMPILER) - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH_() \ _Pragma("diag_suppress 1216") \ _Pragma("diag_suppress deprecated_entity_with_custom_message") - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP_() \ _Pragma("diag_default 1216") \ - _Pragma("diag_suppress deprecated_entity_with_custom_message") + _Pragma("diag_default deprecated_entity_with_custom_message") #elif defined(__EDG__) - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH_() \ _Pragma("warning push") \ _Pragma("warning disable 1478") - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP_() \ _Pragma("warning pop") #elif defined(__GNUC__) || defined(__clang__) - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH_() \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP_() \ _Pragma("GCC diagnostic pop") #elif defined(_MSC_VER) - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH_() \ _Pragma("warning(push)") \ _Pragma("warning(disable: 4996)") - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP_() \ _Pragma("warning(pop)") #else - #define 
KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() - #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH_() + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP_() +#endif + +// FIXME NVCC <13: using the deprecation warnings push/pop mechanism with nvcc +// and nvc++ as host compiler leads to bugs where some of the _Pragma are not +// taken into account. +#if defined(__NVCC__) && defined(__NVCC_DIAG_PRAGMA_SUPPORT__) && \ + (!defined(__NVCOMPILER) || (KOKKOS_COMPILER_NVCC >= 1300)) + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH_() \ + _Pragma("nv_diagnostic push") \ + _Pragma("nv_diag_suppress 1215,1444") + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + _Pragma("nv_diagnostic pop") \ + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP_() +#else + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH_() + #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP_() #endif #if defined(__NVCOMPILER) diff --git a/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp b/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp index 215c77aaad2..952e51b9b9f 100644 --- a/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp +++ b/lib/kokkos/core/src/Kokkos_MathematicalFunctions.hpp @@ -8,17 +8,19 @@ #define KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHFUNCTIONS #endif +#include #include #include #include #include #ifdef KOKKOS_ENABLE_SYCL -// FIXME_SYCL -#if __has_include() #include -#else -#include +#endif + +#if defined(KOKKOS_ENABLE_CUDA) +#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090 +#include #endif #endif @@ -71,13 +73,8 @@ using promote_3_t = typename promote_3::type; #if defined(KOKKOS_ENABLE_SYCL) #define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE sycl #else -#if (defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_NVHPC)) && \ - defined(__GNUC__) && (__GNUC__ 
< 6) && !defined(__clang__) -#define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE -#else #define KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE std #endif -#endif #define KOKKOS_IMPL_MATH_UNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x) { \ @@ -107,6 +104,34 @@ using promote_3_t = typename promote_3::type; return FUNC(static_cast(x)); \ } +#define KOKKOS_IMPL_MATH_UNARY_INT_FUNCTION(FUNC) \ + KOKKOS_INLINE_FUNCTION int FUNC(float x) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x); \ + } \ + KOKKOS_INLINE_FUNCTION int FUNC(double x) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x); \ + } \ + inline int FUNC(long double x) { \ + using std::FUNC; \ + return FUNC(x); \ + } \ + KOKKOS_INLINE_FUNCTION int FUNC##f(float x) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x); \ + } \ + inline int FUNC##l(long double x) { \ + using std::FUNC; \ + return FUNC(x); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION std::enable_if_t, int> FUNC( \ + T x) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(static_cast(x)); \ + } + // isinf, isnan, and isinfinite do not work on Windows with CUDA with std:: // getting warnings about calling host function in device function then // runtime test fails @@ -190,6 +215,211 @@ using promote_3_t = typename promote_3::type; return FUNC(static_cast(x), static_cast(y)); \ } +#define KOKKOS_IMPL_MATH_BINARY_PTR_FUNCTION(FUNC) \ + KOKKOS_INLINE_FUNCTION float FUNC(float x, float* y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y); \ + } \ + KOKKOS_INLINE_FUNCTION double FUNC(double x, double* y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y); \ + } \ + KOKKOS_INLINE_FUNCTION float FUNC##f(float x, float* y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y); \ + } \ + inline long double FUNC(long double x, long double* y) { \ + using std::FUNC; \ + return FUNC(x, y); \ 
+ } \ + inline long double FUNC##l(long double x, long double* y) { \ + using std::FUNC; \ + return FUNC(x, y); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION std::enable_if_t, double> FUNC( \ + T x, double* y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(static_cast(x), y); \ + } + +#define KOKKOS_IMPL_MATH_BINARY_PREDICATE(FUNC, NAMESPACE) \ + KOKKOS_INLINE_FUNCTION bool FUNC(float x, float y) { \ + using NAMESPACE::FUNC; \ + return FUNC(x, y); \ + } \ + KOKKOS_INLINE_FUNCTION bool FUNC(double x, double y) { \ + using NAMESPACE::FUNC; \ + return FUNC(x, y); \ + } \ + inline bool FUNC(long double x, long double y) { \ + using std::FUNC; \ + return FUNC(x, y); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION \ + std::enable_if_t && std::is_arithmetic_v && \ + !std::is_same_v && \ + !std::is_same_v, \ + bool> \ + FUNC(T1 x, T2 y) { \ + using Promoted = Kokkos::Impl::promote_2_t; \ + using NAMESPACE::FUNC; \ + return FUNC(static_cast(x), static_cast(y)); \ + } \ + template \ + inline std::enable_if_t && \ + std::is_arithmetic_v && \ + (std::is_same_v || \ + std::is_same_v), \ + bool> \ + FUNC(T1 x, T2 y) { \ + using Promoted = Kokkos::Impl::promote_2_t; \ + static_assert(std::is_same_v); \ + using std::FUNC; \ + return FUNC(static_cast(x), static_cast(y)); \ + } + +#define KOKKOS_IMPL_MATH_BINARY_PREDICATE_DEVICE_FALLBACK(FUNC, OP) \ + KOKKOS_INLINE_FUNCTION bool FUNC(float x, float y) { \ + KOKKOS_IF_ON_DEVICE(return OP;) \ + KOKKOS_IF_ON_HOST(using std::FUNC; return FUNC(x, y);) \ + } \ + KOKKOS_INLINE_FUNCTION bool FUNC(double x, double y) { \ + KOKKOS_IF_ON_DEVICE(return OP;) \ + KOKKOS_IF_ON_HOST(using std::FUNC; return FUNC(x, y);) \ + } \ + inline bool FUNC(long double x, long double y) { \ + using std::FUNC; \ + return FUNC(x, y); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION \ + std::enable_if_t && std::is_arithmetic_v && \ + !std::is_same_v && \ + !std::is_same_v, \ + bool> \ + FUNC(T1 a, T2 b) { \ + using Promoted = 
Kokkos::Impl::promote_2_t; \ + auto x = static_cast(a); \ + auto y = static_cast(b); \ + KOKKOS_IF_ON_DEVICE(return OP;) \ + KOKKOS_IF_ON_HOST(using std::FUNC; return FUNC(x, y);) \ + } \ + template \ + inline std::enable_if_t && \ + std::is_arithmetic_v && \ + (std::is_same_v || \ + std::is_same_v), \ + bool> \ + FUNC(T1 x, T2 y) { \ + using Promoted = Kokkos::Impl::promote_2_t; \ + static_assert(std::is_same_v); \ + using std::FUNC; \ + return FUNC(static_cast(x), static_cast(y)); \ + } + +#define KOKKOS_IMPL_MATH_BINARY_INT_FUNCTION(FUNC1, FUNC2, intT) \ + KOKKOS_INLINE_FUNCTION float FUNC1(float x, intT y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC2; \ + return FUNC2(x, y); \ + } \ + KOKKOS_INLINE_FUNCTION double FUNC1(double x, intT y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC2; \ + return FUNC2(x, y); \ + } \ + KOKKOS_INLINE_FUNCTION float FUNC1##f(float x, intT y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC2; \ + return FUNC2(x, y); \ + } \ + inline long double FUNC1(long double x, intT y) { \ + using std::FUNC2; \ + return FUNC2(x, y); \ + } \ + inline long double FUNC1##l(long double x, intT y) { \ + using std::FUNC2; \ + return FUNC2(x, y); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION std::enable_if_t, double> \ + FUNC1(T x, intT y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC2; \ + return FUNC2(static_cast(x), y); \ + } + +#define KOKKOS_IMPL_MATH_BINARY_INT_PTR_FUNCTION(FUNC) \ + KOKKOS_INLINE_FUNCTION float FUNC(float x, int* y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y); \ + } \ + KOKKOS_INLINE_FUNCTION double FUNC(double x, int* y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y); \ + } \ + KOKKOS_INLINE_FUNCTION float FUNC##f(float x, int* y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y); \ + } \ + inline long double FUNC(long double x, int* y) { \ + using std::FUNC; \ + return FUNC(x, y); \ + } \ + inline 
long double FUNC##l(long double x, int* y) { \ + using std::FUNC; \ + return FUNC(x, y); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION std::enable_if_t, double> FUNC( \ + T x, int* y) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(static_cast(x), y); \ + } + +#define KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION(FUNC) \ + KOKKOS_INLINE_FUNCTION float FUNC(float x, float y, int* z) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y, z); \ + } \ + KOKKOS_INLINE_FUNCTION double FUNC(double x, double y, int* z) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y, z); \ + } \ + inline long double FUNC(long double x, long double y, int* z) { \ + using std::FUNC; \ + return FUNC(x, y, z); \ + } \ + KOKKOS_INLINE_FUNCTION float FUNC##f(float x, float y, int* z) { \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(x, y, z); \ + } \ + inline long double FUNC##l(long double x, long double y, int* z) { \ + using std::FUNC; \ + return FUNC(x, y, z); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION \ + std::enable_if_t && std::is_arithmetic_v && \ + !std::is_same_v && \ + !std::is_same_v, \ + Kokkos::Impl::promote_2_t> \ + FUNC(T1 x, T2 y, int* z) { \ + using Promoted = Kokkos::Impl::promote_2_t; \ + using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ + return FUNC(static_cast(x), static_cast(y), z); \ + } \ + template \ + inline std::enable_if_t && \ + std::is_arithmetic_v && \ + (std::is_same_v || \ + std::is_same_v), \ + long double> \ + FUNC(T1 x, T2 y, int* z) { \ + using Promoted = Kokkos::Impl::promote_2_t; \ + static_assert(std::is_same_v); \ + using std::FUNC; \ + return FUNC(static_cast(x), static_cast(y), z); \ + } + #define KOKKOS_IMPL_MATH_TERNARY_FUNCTION(FUNC) \ KOKKOS_INLINE_FUNCTION float FUNC(float x, float y, float z) { \ using KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE::FUNC; \ @@ -286,7 +516,7 @@ inline long double abs(long double x) { KOKKOS_IMPL_MATH_UNARY_FUNCTION(fabs) 
KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmod) KOKKOS_IMPL_MATH_BINARY_FUNCTION(remainder) -// remquo +KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION(remquo) KOKKOS_IMPL_MATH_TERNARY_FUNCTION(fma) KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmax) KOKKOS_IMPL_MATH_BINARY_FUNCTION(fmin) @@ -320,7 +550,8 @@ inline long double exp2(long double val) { return exp(ln2 * val); } template -KOKKOS_INLINE_FUNCTION double exp2(T val) { +KOKKOS_INLINE_FUNCTION std::enable_if_t, double> exp2( + T val) { constexpr double ln2 = 0.693147180559945309417232121458176568L; return exp(ln2 * static_cast(val)); } @@ -405,45 +636,167 @@ KOKKOS_IMPL_MATH_UNARY_FUNCTION(ceil) KOKKOS_IMPL_MATH_UNARY_FUNCTION(floor) KOKKOS_IMPL_MATH_UNARY_FUNCTION(trunc) KOKKOS_IMPL_MATH_UNARY_FUNCTION(round) -// lround -// llround -// FIXME_SYCL not available as of current SYCL 2020 specification (revision 4) -#ifndef KOKKOS_ENABLE_SYCL // FIXME_SYCL +// FIXME_SYCL not available as of current SYCL 2020 specification (revision 11) +#ifndef KOKKOS_ENABLE_SYCL +KOKKOS_IMPL_MATH_UNARY_INT_FUNCTION(lround) +KOKKOS_IMPL_MATH_UNARY_INT_FUNCTION(llround) KOKKOS_IMPL_MATH_UNARY_FUNCTION(nearbyint) #endif -// rint -// lrint -// llrint +KOKKOS_IMPL_MATH_UNARY_FUNCTION(rint) +// FIXME_SYCL not available as of current SYCL 2020 specification (revision 11) +#ifndef KOKKOS_ENABLE_SYCL +KOKKOS_IMPL_MATH_UNARY_INT_FUNCTION(lrint) +KOKKOS_IMPL_MATH_UNARY_INT_FUNCTION(llrint) +#endif // Floating point manipulation functions -// frexp -// ldexp -// modf -// scalbn -// scalbln -// ilog +KOKKOS_IMPL_MATH_BINARY_INT_PTR_FUNCTION(frexp) +KOKKOS_IMPL_MATH_BINARY_INT_FUNCTION(ldexp, ldexp, int) +KOKKOS_IMPL_MATH_BINARY_PTR_FUNCTION(modf) +// FIXME_SYCL not available as of current SYCL 2020 specification (revision 11) +#ifndef KOKKOS_ENABLE_SYCL +KOKKOS_IMPL_MATH_BINARY_INT_FUNCTION(scalbn, scalbn, int) +KOKKOS_IMPL_MATH_BINARY_INT_FUNCTION(scalbln, scalbln, long) +#elif defined(FLT_RADIX) && (FLT_RADIX == 2) +// If FLT_RADIX==2, we can implement scalbn via 
ldexp. +KOKKOS_IMPL_MATH_BINARY_INT_FUNCTION(scalbn, ldexp, int) +#endif +KOKKOS_IMPL_MATH_UNARY_INT_FUNCTION(ilogb) KOKKOS_IMPL_MATH_UNARY_FUNCTION(logb) KOKKOS_IMPL_MATH_BINARY_FUNCTION(nextafter) -// nexttoward +inline float nexttoward(float from, long double to) { + using std::nexttoward; + return nexttoward(from, to); +} +inline float nexttowardf(float from, long double to) { + using std::nexttoward; + return nexttoward(from, to); +} +inline double nexttoward(double from, long double to) { + using std::nexttoward; + return nexttoward(from, to); +} +inline long double nexttoward(long double from, long double to) { + using std::nexttoward; + return nexttoward(from, to); +} +inline long double nexttowardl(long double from, long double to) { + using std::nexttoward; + return nexttoward(from, to); +} +template +inline std::enable_if_t, double> nexttoward( + Integer from, long double to) { + using std::nexttoward; + return nexttoward(from, to); +} KOKKOS_IMPL_MATH_BINARY_FUNCTION(copysign) // Classification and comparison -// fpclassify +// fpclassify not available on Cuda and SYCL +// FIXME_NVHPC nvhpc's fpclassify return FP_ZERO for subnormal values. 
+#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ + defined(KOKKOS_COMPILER_NVHPC) +#define KOKKOS_IMPL_MATH_FPCLASSIFY(SPECIFIER, TYPE) \ + SPECIFIER int fpclassify(TYPE x) { \ + if (x != x) { \ + return FP_NAN; \ + } else if (x == 0) { \ + return FP_ZERO; \ + } else if (Kokkos::abs(x) < Kokkos::Experimental::norm_min_v) { \ + return FP_SUBNORMAL; \ + } else if (Kokkos::abs(x) == Kokkos::Experimental::infinity_v) { \ + return FP_INFINITE; \ + } else { \ + return FP_NORMAL; \ + } \ + } + +KOKKOS_IMPL_MATH_FPCLASSIFY(KOKKOS_INLINE_FUNCTION, float) +KOKKOS_IMPL_MATH_FPCLASSIFY(KOKKOS_INLINE_FUNCTION, double) +KOKKOS_IMPL_MATH_FPCLASSIFY(inline, long double) + +#undef KOKKOS_IMPL_MATH_FPCLASSIFY + +template +KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, int> +fpclassify(T x) { + if (x == 0) { + return FP_ZERO; + } else { + return FP_NORMAL; + } +} +#else +KOKKOS_IMPL_MATH_UNARY_FUNCTION(fpclassify) +#endif KOKKOS_IMPL_MATH_UNARY_PREDICATE(isfinite) KOKKOS_IMPL_MATH_UNARY_PREDICATE(isinf) KOKKOS_IMPL_MATH_UNARY_PREDICATE(isnan) -// isnormal +#if defined(KOKKOS_ENABLE_CUDA) +#define KOKKOS_IMPL_MATH_ISNORMAL(SPECIFIER, TYPE) \ + SPECIFIER bool isnormal(TYPE x) { \ + auto const abs = Kokkos::abs(x); \ + return (abs >= Kokkos::Experimental::norm_min_v)&&( \ + abs <= Kokkos::Experimental::finite_max_v); \ + } + +KOKKOS_IMPL_MATH_ISNORMAL(KOKKOS_INLINE_FUNCTION, float) +KOKKOS_IMPL_MATH_ISNORMAL(KOKKOS_INLINE_FUNCTION, double) +KOKKOS_IMPL_MATH_ISNORMAL(inline, long double) + +#undef KOKKOS_IMPL_MATH_ISNORMAL + +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, bool> isnormal( + T x) { + return x != T(0); +} +#else +KOKKOS_IMPL_MATH_UNARY_PREDICATE(isnormal) +#endif KOKKOS_IMPL_MATH_UNARY_PREDICATE(signbit) -// isgreater -// isgreaterequal -// isless -// islessequal -// islessgreater -// isunordered +#if defined(KOKKOS_ENABLE_CUDA) +#if defined(CUDA_VERSION) && CUDA_VERSION >= 12090 +KOKKOS_IMPL_MATH_BINARY_PREDICATE(isgreater, cuda::std) 
+KOKKOS_IMPL_MATH_BINARY_PREDICATE(isgreaterequal, cuda::std) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(isless, cuda::std) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(islessequal, cuda::std) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(islessgreater, cuda::std) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(isunordered, cuda::std) +#else +KOKKOS_IMPL_MATH_BINARY_PREDICATE_DEVICE_FALLBACK(isgreater, x > y) +KOKKOS_IMPL_MATH_BINARY_PREDICATE_DEVICE_FALLBACK(isgreaterequal, x >= y) +KOKKOS_IMPL_MATH_BINARY_PREDICATE_DEVICE_FALLBACK(isless, x < y) +KOKKOS_IMPL_MATH_BINARY_PREDICATE_DEVICE_FALLBACK(islessequal, x <= y) +KOKKOS_IMPL_MATH_BINARY_PREDICATE_DEVICE_FALLBACK(islessgreater, x y) +KOKKOS_IMPL_MATH_BINARY_PREDICATE_DEVICE_FALLBACK(isunordered, + isnan(x) || isnan(y)) +#endif +#else +KOKKOS_IMPL_MATH_BINARY_PREDICATE(isgreater, + KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(isgreaterequal, + KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(isless, KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(islessequal, + KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(islessgreater, + KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE) +KOKKOS_IMPL_MATH_BINARY_PREDICATE(isunordered, + KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE) +#endif #undef KOKKOS_IMPL_MATH_FUNCTIONS_NAMESPACE #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION +#undef KOKKOS_IMPL_MATH_UNARY_INT_FUNCTION #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE #undef KOKKOS_IMPL_MATH_BINARY_FUNCTION +#undef KOKKOS_IMPL_MATH_BINARY_PTR_FUNCTION +#undef KOKKOS_IMPL_MATH_BINARY_PREDICATE +#undef KOKKOS_IMPL_MATH_BINARY_PREDICATE_DEVICE_FALLBACK +#undef KOKKOS_IMPL_MATH_BINARY_INT_FUNCTION +#undef KOKKOS_IMPL_MATH_BINARY_INT_PTR_FUNCTION #undef KOKKOS_IMPL_MATH_TERNARY_FUNCTION +#undef KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION // non-standard math functions provided by CUDA/HIP/SYCL KOKKOS_INLINE_FUNCTION float rsqrt(float val) { @@ -477,6 +830,34 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t, 
double> rsqrt( return Kokkos::rsqrt(static_cast(x)); } +// reciprocal functions 1/x +KOKKOS_INLINE_FUNCTION float rcp(float val) { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + KOKKOS_IF_ON_DEVICE(return __frcp_rn(val);) + KOKKOS_IF_ON_HOST(return 1.0f / val;) +#elif defined(KOKKOS_ENABLE_SYCL) + return sycl::native::recip(val); +#else + return 1.0f / val; +#endif +} +KOKKOS_INLINE_FUNCTION double rcp(double val) { +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + KOKKOS_IF_ON_DEVICE(return __drcp_rn(val);) + KOKKOS_IF_ON_HOST(return 1.0 / val;) +#else + return 1.0 / val; +#endif +} +inline long double rcp(long double val) { return 1.0l / val; } +KOKKOS_INLINE_FUNCTION float rcpf(float val) { return Kokkos::rcp(val); } +inline long double rcpl(long double val) { return Kokkos::rcp(val); } +template +KOKKOS_INLINE_FUNCTION std::enable_if_t, double> rcp( + T x) { + return Kokkos::rcp(static_cast(x)); +} + } // namespace Kokkos #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_MATHFUNCTIONS diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp index 95bce22d1f1..3da64207425 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -238,11 +238,6 @@ class MemoryPool { //-------------------------------------------------------------------------- - KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default; - KOKKOS_INLINE_FUNCTION MemoryPool() : m_tracker(), m_sb_state_array(nullptr), diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp index 485b56859b5..c609504401e 100644 --- a/lib/kokkos/core/src/Kokkos_Pair.hpp +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -71,17 +71,6 @@ struct pair { : 
first(p.first), second(p.second) { } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - /// \brief Copy constructor. - /// - /// This calls the copy constructors of T1 and T2. It won't compile - /// if those copy constructors are not defined and public. - template - KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr pair( - const volatile pair& p) - : first(p.first), second(p.second) {} -#endif - /// \brief Assignment operator. /// /// This calls the assignment operators of T1 and T2. It won't @@ -93,28 +82,6 @@ struct pair { return *this; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - /// \brief Assignment operator, for volatile *this. - /// - /// \param p [in] Input; right-hand side of the assignment. - /// - /// This calls the assignment operators of T1 and T2. It will not - /// compile if the assignment operators are not defined and public. - /// - /// This operator returns \c void instead of volatile pair& . See Kokkos Issue #177 for the explanation. In - /// practice, this means that you should not chain assignments with - /// volatile lvalues. - template - KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION void operator=( - const volatile pair& p) volatile { - first = p.first; - second = p.second; - // We deliberately do not return anything here. See explanation - // in public documentation above. - } -#endif - // from std::pair template pair(const std::pair& p) : first(p.first), second(p.second) {} @@ -400,87 +367,6 @@ KOKKOS_FORCEINLINE_FUNCTION pair tie(T1& x, T2& y) { return (pair(x, y)); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -// -// Specialization of Kokkos::pair for a \c void second argument. This -// is not actually a "pair"; it only contains one element, the first. 
-// -template -struct KOKKOS_DEPRECATED pair { - using first_type = T1; - using second_type = void; - - first_type first; - enum { second = 0 }; - - KOKKOS_DEFAULTED_FUNCTION constexpr pair() = default; - - KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const first_type& f) : first(f) {} - - KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const first_type& f, int) - : first(f) {} - - template - KOKKOS_FORCEINLINE_FUNCTION constexpr pair(const pair& p) - : first(p.first) {} - - template - KOKKOS_FORCEINLINE_FUNCTION pair& operator=( - const pair& p) { - first = p.first; - return *this; - } -}; - -// -// Specialization of relational operators for Kokkos::pair. -// - -#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ - defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) -KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() -#endif -template -KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator==( - const pair& lhs, const pair& rhs) { - return lhs.first == rhs.first; -} - -template -KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator!=( - const pair& lhs, const pair& rhs) { - return !(lhs == rhs); -} - -template -KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<( - const pair& lhs, const pair& rhs) { - return lhs.first < rhs.first; -} - -template -KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator<=( - const pair& lhs, const pair& rhs) { - return !(rhs < lhs); -} - -template -KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>( - const pair& lhs, const pair& rhs) { - return rhs < lhs; -} - -template -KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( - const pair& lhs, const pair& rhs) { - return !(lhs < rhs); -} -#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ - defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) -KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() -#endif -#endif - namespace Impl { template struct is_pair_like : std::false_type {}; 
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp index d191723c6a4..651d4286993 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -13,7 +13,6 @@ static_assert(false, #define KOKKOS_PARALLEL_HPP #include -#include #include #include #include @@ -21,8 +20,9 @@ static_assert(false, #include #include -#include +#include #include +#include #include #include @@ -115,9 +115,7 @@ namespace Kokkos { * This compares to a single iteration \c iwork of a \c for loop. * If \c execution_space is not defined DefaultExecutionSpace will be used. */ -template < - class ExecPolicy, class FunctorType, - class Enable = std::enable_if_t::value>> +template inline void parallel_for(const std::string& str, const ExecPolicy& policy, const FunctorType& functor) { /** Enforce correct use **/ @@ -139,10 +137,8 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy, Kokkos::Tools::Impl::end_parallel_for(inner_policy, functor, str, kpID); } -template -inline void parallel_for( - const ExecPolicy& policy, const FunctorType& functor, - std::enable_if_t::value>* = nullptr) { +template +inline void parallel_for(const ExecPolicy& policy, const FunctorType& functor) { /** Enforce correct use **/ Impl::CheckUsage::check("parallel_for", policy); @@ -347,9 +343,7 @@ namespace Kokkos { /// }; /// \endcode /// -template ::value>> +template inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, const FunctorType& functor) { /** Enforce correct use **/ @@ -372,10 +366,9 @@ inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, Kokkos::Tools::Impl::end_parallel_scan(inner_policy, functor, str, kpID); } -template -inline void parallel_scan( - const ExecutionPolicy& policy, const FunctorType& functor, - std::enable_if_t::value>* = nullptr) { +template +inline void parallel_scan(const ExecutionPolicy& policy, + const FunctorType& functor) { /** 
Enforce correct use **/ Impl::CheckUsage::check("parallel_scan", policy); @@ -409,9 +402,8 @@ inline void parallel_scan(const size_t work_count, const FunctorType& functor) { ::Kokkos::parallel_scan("", work_count, functor); } -template ::value>> +template inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, const FunctorType& functor, ReturnType& return_value) { @@ -447,11 +439,11 @@ inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, "Kokkos::parallel_scan: fence due to result being a value, not a view"); } -template -inline void parallel_scan( - const ExecutionPolicy& policy, const FunctorType& functor, - ReturnType& return_value, - std::enable_if_t::value>* = nullptr) { +template +inline void parallel_scan(const ExecutionPolicy& policy, + const FunctorType& functor, + ReturnType& return_value) { /** Enforce correct use **/ Impl::CheckUsage::check("parallel_scan", policy); diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index e867cb51384..eca3994ac4e 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -10,7 +10,7 @@ static_assert(false, #define KOKKOS_PARALLEL_REDUCE_HPP #include -#include +#include #include #include #include @@ -71,11 +71,7 @@ struct ParallelReduceReturnValue< using return_type = ReturnType; using reducer_type = InvalidType; - using value_type_scalar = typename return_type::value_type; - using value_type_array = typename return_type::value_type* const; - - using value_type = std::conditional_t; + using value_type = typename return_type::value_type; static return_type& return_value(ReturnType& return_val, const FunctorType&) { return return_val; // NOLINT(bugprone-return-const-ref-from-parameter) @@ -135,36 +131,39 @@ struct ParallelReduceReturnValue< } }; -template +template struct ParallelReducePolicyType; -template -struct ParallelReducePolicyType< - 
std::enable_if_t::value>, - PolicyType, FunctorType> { +template +struct ParallelReducePolicyType { using policy_type = PolicyType; static PolicyType policy(const PolicyType& policy_) { return policy_; } }; -template -struct ParallelReducePolicyType< - std::enable_if_t>, PolicyType, FunctorType> { +template +struct ParallelReducePolicyType { using execution_space = typename Impl::FunctorPolicyExecutionSpace::execution_space; using policy_type = Kokkos::RangePolicy; - static policy_type policy(const PolicyType& policy_) { + static policy_type policy(const IntegralType& policy_) { return policy_type(0, policy_); } }; -template +template struct ParallelReduceAdaptor { using return_value_adapter = Impl::ParallelReduceReturnValue; + static constexpr bool is_array_reduction = + Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, PolicyType, FunctorType, + typename return_value_adapter::value_type>::StaticValueSize == 0; + // Equivalent to std::get(std::tuple) but callable on the device. 
template static KOKKOS_FUNCTION std::conditional_t forwarding_switch( @@ -175,10 +174,11 @@ struct ParallelReduceAdaptor { return static_cast(v2); } - static inline void execute_impl(const std::string& label, - const PolicyType& policy, - const FunctorType& functor, - ReturnType& return_value) { + static inline void execute(const std::string& label, const PolicyType& policy, + const FunctorType& functor, + ReturnType& return_value) + requires(!(is_array_reduction && std::is_pointer_v)) + { using PassedReducerType = typename return_value_adapter::reducer_type; uint64_t kpID = 0; @@ -202,6 +202,21 @@ struct ParallelReduceAdaptor { label, kpID); const auto& inner_policy = response.policy; + if constexpr (Kokkos::is_view_v) { + if constexpr (is_array_reduction) + static_assert( + ReturnType::rank == 1, + "Array reductions with a View result type require a rank-1 View!"); + else + static_assert( + ReturnType::rank == 0, + "Scalar reductions with a View result type require a rank-0 View!"); + if (!return_value.span_is_contiguous()) + Kokkos::abort( + "Reductions with a View result type must use a View with " + "contiguous memory!"); + } + auto closure = construct_with_shared_allocation_tracking_disabled< Impl::ParallelReduce( inner_policy, functor, label, kpID); } - - static constexpr bool is_array_reduction = - Impl::FunctorAnalysis< - Impl::FunctorPatternInterface::REDUCE, PolicyType, FunctorType, - typename return_value_adapter::value_type>::StaticValueSize == 0; - - template - static inline std::enable_if_t)> - execute(const std::string& label, const PolicyType& policy, - const FunctorType& functor, ReturnType& return_value) { - execute_impl(label, policy, functor, return_value); - } }; } // namespace Impl @@ -257,7 +259,7 @@ struct ReducerHasTestReferenceFunction { }; }; -template +template constexpr std::enable_if_t< // constraints only necessary because SFINAE lacks subsumption !ReducerHasTestReferenceFunction::value && @@ -268,7 +270,7 @@ 
parallel_reduce_needs_fence(ExecutionSpace const&, Arg const&) { return true; } -template +template constexpr std::enable_if_t< // equivalent to: // (requires (Reducer const& r) { @@ -281,7 +283,7 @@ parallel_reduce_needs_fence(ExecutionSpace const&, Reducer const& reducer) { return reducer.references_scalar(); } -template +template constexpr std::enable_if_t< // requires Kokkos::ViewLike Kokkos::is_view::value, @@ -291,7 +293,7 @@ parallel_reduce_needs_fence(ExecutionSpace const&, ViewLike const&) { return false; } -template +template struct ParallelReduceFence { template static void fence(const ExecutionSpace& ex, const std::string& name, @@ -343,9 +345,9 @@ struct ParallelReduceFence { */ // ReturnValue is scalar or array: take by reference -template -inline std::enable_if_t::value && - !(Kokkos::is_view::value || +template +inline std::enable_if_t::value || Kokkos::is_reducer::value || std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, @@ -368,9 +370,9 @@ parallel_reduce(const std::string& label, const PolicyType& policy, return_value); } -template -inline std::enable_if_t::value && - !(Kokkos::is_view::value || +template +inline std::enable_if_t::value || Kokkos::is_reducer::value || std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, @@ -393,8 +395,7 @@ parallel_reduce(const std::string& label, const size_t& work_count, "parallel_reduce", work_count, label.c_str()); using policy_type = - typename Impl::ParallelReducePolicyType::policy_type; + typename Impl::ParallelReducePolicyType::policy_type; parallel_reduce(label, policy_type(0, work_count), functor, return_value); } @@ -412,11 +413,11 @@ parallel_reduce(const size_t& work_count, const FunctorType& functor, } // ReturnValue as View or Reducer: take by copy to allow for inline construction -template -inline std::enable_if_t::value && - (Kokkos::is_view::value || - Kokkos::is_reducer::value || - std::is_pointer_v)> +template +inline 
std::enable_if_t::value || + Kokkos::is_reducer::value || + std::is_pointer_v> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { /** Enforce correct use **/ @@ -432,11 +433,11 @@ parallel_reduce(const std::string& label, const PolicyType& policy, return_value); } -template -inline std::enable_if_t::value && - (Kokkos::is_view::value || - Kokkos::is_reducer::value || - std::is_pointer_v)> +template +inline std::enable_if_t::value || + Kokkos::is_reducer::value || + std::is_pointer_v> parallel_reduce(const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { /** Enforce correct use **/ @@ -457,8 +458,7 @@ parallel_reduce(const std::string& label, const size_t& work_count, "parallel_reduce", work_count, label.c_str()); using policy_type = - typename Impl::ParallelReducePolicyType::policy_type; + typename Impl::ParallelReducePolicyType::policy_type; parallel_reduce(label, policy_type(0, work_count), functor, return_value); } @@ -476,12 +476,9 @@ parallel_reduce(const size_t& work_count, const FunctorType& functor, } // No Return Argument -template -inline void parallel_reduce( - const std::string& label, const PolicyType& policy, - const FunctorType& functor, - std::enable_if_t::value>* = - nullptr) { +template +inline void parallel_reduce(const std::string& label, const PolicyType& policy, + const FunctorType& functor) { /** Enforce correct use **/ Impl::CheckUsage::check( "parallel_reduce", policy, label.c_str()); @@ -506,11 +503,9 @@ inline void parallel_reduce( result_view); } -template -inline void parallel_reduce( - const PolicyType& policy, const FunctorType& functor, - std::enable_if_t::value>* = - nullptr) { +template +inline void parallel_reduce(const PolicyType& policy, + const FunctorType& functor) { /** Enforce correct use **/ Impl::CheckUsage::check("parallel_reduce", policy); @@ -526,8 +521,7 @@ inline void parallel_reduce(const std::string& 
label, const size_t& work_count, "parallel_reduce", work_count, label.c_str()); using policy_type = - typename Impl::ParallelReducePolicyType::policy_type; + typename Impl::ParallelReducePolicyType::policy_type; parallel_reduce(label, policy_type(0, work_count), functor); } diff --git a/lib/kokkos/core/src/Kokkos_ReductionIdentity.hpp b/lib/kokkos/core/src/Kokkos_ReductionIdentity.hpp index a4712d9f8cb..7f9baeb6194 100644 --- a/lib/kokkos/core/src/Kokkos_ReductionIdentity.hpp +++ b/lib/kokkos/core/src/Kokkos_ReductionIdentity.hpp @@ -26,7 +26,7 @@ struct reduction_identity { KOKKOS_FUNCTION constexpr static Integral max() noexcept { return min_; } KOKKOS_FUNCTION constexpr static Integral min() noexcept { return max_; } KOKKOS_FUNCTION constexpr static Integral bor() noexcept { return 0x0; } - KOKKOS_FUNCTION constexpr static Integral band() noexcept { return 0x0; } + KOKKOS_FUNCTION constexpr static Integral band() noexcept { return ~(0x0); } KOKKOS_FUNCTION constexpr static Integral lor() noexcept { return 0; } KOKKOS_FUNCTION constexpr static Integral land() noexcept { return 1; } }; diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp index b867433cc92..4e5160576a4 100644 --- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -57,15 +57,6 @@ class ScratchMemorySpace { static constexpr const char* name() { return "ScratchMemorySpace"; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - // This function is unused - template - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static constexpr IntType align( - const IntType& size) { - return (size + DEFAULT_ALIGNMENT_MASK) & ~DEFAULT_ALIGNMENT_MASK; - } -#endif - template KOKKOS_INLINE_FUNCTION void* get_shmem(const IntType& size, int level = -1) const { diff --git a/lib/kokkos/core/src/Kokkos_Timer.hpp b/lib/kokkos/core/src/Kokkos_Timer.hpp index 93aa5c4f594..98cb047968f 100644 --- a/lib/kokkos/core/src/Kokkos_Timer.hpp +++ 
b/lib/kokkos/core/src/Kokkos_Timer.hpp @@ -17,6 +17,7 @@ class Timer { public: Timer(const Timer&) = delete; Timer& operator=(const Timer&) = delete; + ~Timer() = default; Timer() { reset(); } diff --git a/lib/kokkos/core/src/Kokkos_Tuners.hpp b/lib/kokkos/core/src/Kokkos_Tuners.hpp index 31156b3037d..d8a2d232f70 100644 --- a/lib/kokkos/core/src/Kokkos_Tuners.hpp +++ b/lib/kokkos/core/src/Kokkos_Tuners.hpp @@ -401,11 +401,8 @@ class TeamSizeTuner : public ExtendableTunerMixin { TunerType tuner; public: - TeamSizeTuner() = default; - TeamSizeTuner& operator=(const TeamSizeTuner& other) = default; - TeamSizeTuner(const TeamSizeTuner& other) = default; - TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; - TeamSizeTuner(TeamSizeTuner&& other) = default; + TeamSizeTuner() = default; + template TeamSizeTuner(const std::string& name, @@ -728,12 +725,9 @@ struct MDRangeTuner : public ExtendableTunerMixin> { SpaceDescription desc; int max_tile_size = calc.get_mdrange_max_tile_size_product(policy, functor, tag); - Kokkos::Impl::TileSizeProperties tile_properties = - Kokkos::Impl::get_tile_size_properties(policy.space()); Impl::fill_tile(desc, max_tile_size); - Impl::apply_tiles_constraints(desc, tile_properties.max_threads_dimensions, - rank); + Impl::apply_tiles_constraints(desc, policy.m_max_threads_dimensions, rank); std::vector feature_names; for (int x = 0; x < rank; ++x) { feature_names.push_back(name + "_tile_size_" + std::to_string(x)); diff --git a/lib/kokkos/core/src/Kokkos_UniqueToken.hpp b/lib/kokkos/core/src/Kokkos_UniqueToken.hpp index c5f1e65d106..9030b4bc6fc 100644 --- a/lib/kokkos/core/src/Kokkos_UniqueToken.hpp +++ b/lib/kokkos/core/src/Kokkos_UniqueToken.hpp @@ -95,6 +95,9 @@ class AcquireUniqueToken { KOKKOS_FUNCTION AcquireUniqueToken(token_type t) : my_token(t), my_acquired_val(my_token.acquire()) {} + AcquireUniqueToken(const AcquireUniqueToken&) = delete; + AcquireUniqueToken& operator=(const AcquireUniqueToken&) = delete; + KOKKOS_FUNCTION 
~AcquireUniqueToken() { my_token.release(my_acquired_val); } KOKKOS_FUNCTION size_type value() const { return my_acquired_val; } @@ -132,6 +135,8 @@ class AcquireTeamUniqueToken { // `UniqueTokenScope` enumeration type and its enumerators away which would // hurt readability. KOKKOS_FUNCTION AcquireTeamUniqueToken(token_type t, team_member_type team); + AcquireTeamUniqueToken(const AcquireTeamUniqueToken&) = delete; + AcquireTeamUniqueToken& operator=(const AcquireTeamUniqueToken&) = delete; KOKKOS_FUNCTION ~AcquireTeamUniqueToken(); KOKKOS_FUNCTION size_type value() const { return my_acquired_val; } static std::size_t shmem_size() { return scratch_view::shmem_size(); } diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 5d41294966b..62245ff15b7 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -286,9 +286,6 @@ class View : public Impl::BasicViewFromTraits::type { static constexpr Impl::integral_constant rank = {}; static constexpr Impl::integral_constant rank_dynamic = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = rank()}; -#endif KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { return Impl::array_layout_from_mapping( @@ -693,8 +690,11 @@ class View : public Impl::BasicViewFromTraits::type { // FIXME_NVCC: nvcc 12.2 and 12.3 view these as ambiguous even though they have // exclusive requirements clauses. 12.6 Also has some issues though it manifests -// differently -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_COMPILER_NVHPC) +// differently. Clang with CUDA also had segfaults in CI +// Define the workaround here since this condition will be re-used. +// We undef KOKKOS_IMPL_VIEW_HOOKS_NVCC_WORKAROUND later. 
+#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_NVHPC) || \ + (defined(KOKKOS_COMPILER_CLANG) && defined(KOKKOS_ENABLE_CUDA)) #define KOKKOS_IMPL_VIEW_HOOKS_NVCC_WORKAROUND 1 #endif #ifdef KOKKOS_IMPL_VIEW_HOOKS_NVCC_WORKAROUND @@ -922,6 +922,7 @@ class View : public Impl::BasicViewFromTraits::type { template requires(!std::is_null_pointer_v

&& + std::is_convertible_v && std::is_constructible_v && sizeof...(Args) != rank() + 1) KOKKOS_FUNCTION explicit View(P ptr_, Args... args) @@ -1503,22 +1504,6 @@ KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { Args...>::type(src, args...); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, - Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - static_assert(Kokkos::is_memory_traits::value); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} -#endif - template using Subview = decltype(subview(std::declval(), std::declval()...)); diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp index 21a22f654f5..864d30f6545 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -6,9 +6,10 @@ #include #include #include -#include -#include +#include #include +#include +#include #if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) #include @@ -24,24 +25,19 @@ #include #include +Kokkos::Experimental::OpenACC::~OpenACC() { + Kokkos::Impl::check_execution_space_destructor_precondition(name()); +} + Kokkos::Experimental::OpenACC::OpenACC() : m_space_instance( - &Kokkos::Experimental::Impl::OpenACCInternal::singleton(), - [](Impl::OpenACCInternal*) {}) { - Impl::OpenACCInternal::singleton().verify_is_initialized( - "OpenACC instance constructor"); -} + (Kokkos::Impl::check_execution_space_constructor_precondition(name()), + Impl::OpenACCInternal::default_instance)) {} Kokkos::Experimental::OpenACC::OpenACC(int async_arg) - : m_space_instance(new Kokkos::Experimental::Impl::OpenACCInternal, - [](Impl::OpenACCInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { - 
Impl::OpenACCInternal::singleton().verify_is_initialized( - "OpenACC instance constructor"); - m_space_instance->initialize(async_arg); -} + : m_space_instance( + (Kokkos::Impl::check_execution_space_constructor_precondition(name()), + new Kokkos::Experimental::Impl::OpenACCInternal(async_arg))) {} void Kokkos::Experimental::OpenACC::impl_initialize( InitializationSettings const& settings) { @@ -101,11 +97,12 @@ void Kokkos::Experimental::OpenACC::impl_initialize( // FIXME_OPENACC: Compute Impl::OpenACCInternal::m_concurrency correctly. #endif } - Impl::OpenACCInternal::singleton().initialize(); + Impl::OpenACCInternal::default_instance = + Kokkos::Impl::HostSharedPtr(new Impl::OpenACCInternal); } void Kokkos::Experimental::OpenACC::impl_finalize() { - Impl::OpenACCInternal::singleton().finalize(); + Impl::OpenACCInternal::default_instance = nullptr; } void Kokkos::Experimental::OpenACC::print_configuration(std::ostream& os, diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index c8080b9d3ab..172038c54fd 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -62,6 +62,14 @@ class OpenACC { using scratch_memory_space = ScratchMemorySpace; + OpenACC(const OpenACC&) = default; + OpenACC(OpenACC&& other) noexcept + : OpenACC(static_cast(other)) {} + OpenACC& operator=(const OpenACC&) = default; + OpenACC& operator=(OpenACC&& other) noexcept { + return *this = static_cast(other); + } + ~OpenACC(); OpenACC(); explicit OpenACC(int async_arg); @@ -76,16 +84,7 @@ class OpenACC { static void impl_static_fence(std::string const& name); static char const* name() { return "OpenACC"; } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency(); -#else int concurrency() const; -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static bool in_parallel() { - return acc_on_device(acc_device_not_host); - } -#endif uint32_t impl_instance_id() 
const noexcept; Impl::OpenACCInternal* impl_internal_space_instance() const { return m_space_instance.get(); diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp index 4f1cc55b09c..3970f43de96 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp @@ -17,23 +17,11 @@ int Kokkos::Experimental::Impl::OpenACCInternal::m_acc_device_num = -1; int Kokkos::Experimental::Impl::OpenACCInternal::m_concurrency = -1; int Kokkos::Experimental::Impl::OpenACCInternal::m_next_async = -1; -Kokkos::Experimental::Impl::OpenACCInternal& -Kokkos::Experimental::Impl::OpenACCInternal::singleton() { - static OpenACCInternal self; - return self; -} - -bool Kokkos::Experimental::Impl::OpenACCInternal::verify_is_initialized( - const char* const label) const { - if (!m_is_initialized) { - Kokkos::abort((std::string("Kokkos::Experimental::OpenACC::") + label + - " : ERROR device not initialized\n") - .c_str()); - } - return m_is_initialized; -} +Kokkos::Impl::HostSharedPtr + Kokkos::Experimental::Impl::OpenACCInternal::default_instance; -void Kokkos::Experimental::Impl::OpenACCInternal::initialize(int async_arg) { +Kokkos::Experimental::Impl::OpenACCInternal::OpenACCInternal(int async_arg) + : m_async_arg(async_arg) { if ((async_arg < 0) && (async_arg != acc_async_sync) && (async_arg != acc_async_noval)) { Kokkos::abort((std::string("Kokkos::Experimental::OpenACC::initialize()") + @@ -41,16 +29,6 @@ void Kokkos::Experimental::Impl::OpenACCInternal::initialize(int async_arg) { " unless being a special value defined in OpenACC\n") .c_str()); } - m_async_arg = async_arg; - m_is_initialized = true; -} - -void Kokkos::Experimental::Impl::OpenACCInternal::finalize() { - m_is_initialized = false; -} - -bool Kokkos::Experimental::Impl::OpenACCInternal::is_initialized() const { - return m_is_initialized; } void 
Kokkos::Experimental::Impl::OpenACCInternal::print_configuration( @@ -73,12 +51,6 @@ uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() reinterpret_cast(this)); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int Kokkos::Experimental::OpenACC::concurrency() { - return Impl::OpenACCInternal::m_concurrency; -} -#else int Kokkos::Experimental::OpenACC::concurrency() const { return Impl::OpenACCInternal::m_concurrency; } -#endif diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index f92a71913d6..dcadc1acee5 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -5,6 +5,7 @@ #define KOKKOS_OPENACC_INSTANCE_HPP #include +#include #include @@ -15,26 +16,23 @@ namespace Kokkos::Experimental::Impl { class OpenACCInternal { - bool m_is_initialized = false; - - OpenACCInternal(const OpenACCInternal&) = default; - OpenACCInternal& operator=(const OpenACCInternal&) = default; + OpenACCInternal(const OpenACCInternal&) = delete; + OpenACCInternal& operator=(const OpenACCInternal&) = delete; public: + static Kokkos::Impl::HostSharedPtr default_instance; static int m_acc_device_num; static int m_concurrency; static int m_next_async; int m_async_arg = acc_async_noval; - OpenACCInternal() = default; - - static OpenACCInternal& singleton(); - - bool verify_is_initialized(const char* const label) const; + OpenACCInternal(int async_arg = acc_async_noval); - void initialize(int async_arg = acc_async_noval); - void finalize(); - bool is_initialized() const; + ~OpenACCInternal() { + fence( + "Kokkos::Experimental::Impl::OpenACCInternal::finalize: fence on " + "destruction"); + } void print_configuration(std::ostream& os, bool verbose = false) const; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index 
af951c2b49b..21e34454e70 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -856,6 +856,10 @@ class Kokkos::Impl::ParallelFor, ParallelFor(Functor const& functor, Policy const& policy) : m_functor(functor), m_policy(policy) {} + static int max_tile_size_product(const Policy&, const Functor&) { + return 512; + } + void execute() const { static_assert(1 < Policy::rank && Policy::rank < 7); static_assert(Policy::inner_direction == Iterate::Left || diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp index 11b211b9875..bf8f448fcf0 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp @@ -10,37 +10,127 @@ #include #include +#include #include +#include +#include +#include +#include namespace Kokkos { -OpenMP::OpenMP() - : m_space_instance(&Impl::OpenMPInternal::singleton(), - [](Impl::OpenMPInternal *) {}) { - Impl::OpenMPInternal::singleton().verify_is_initialized( - "OpenMP instance constructor"); +OpenMP::~OpenMP() { + Impl::check_execution_space_destructor_precondition(name()); } +OpenMP::OpenMP() + : m_space_instance( + (Impl::check_execution_space_constructor_precondition(name()), + Impl::OpenMPInternal::default_instance)) {} + OpenMP::OpenMP(int pool_size) - : m_space_instance(new Impl::OpenMPInternal(pool_size), - [](Impl::OpenMPInternal *ptr) { - ptr->finalize(); - delete ptr; - }) { - Impl::OpenMPInternal::singleton().verify_is_initialized( - "OpenMP instance constructor"); -} + : m_space_instance( + (Impl::check_execution_space_constructor_precondition(name()), + Impl::HostSharedPtr(new Impl::OpenMPInternal(pool_size)))) {} int OpenMP::impl_get_current_max_threads() noexcept { return Impl::OpenMPInternal::get_current_max_threads(); } void OpenMP::impl_initialize(InitializationSettings const &settings) { - Impl::OpenMPInternal::singleton().initialize( - 
settings.has_num_threads() ? settings.get_num_threads() : -1); + int thread_count = + settings.has_num_threads() ? settings.get_num_threads() : -1; + if (omp_in_parallel()) { + std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel"); + Kokkos::Impl::throw_runtime_exception(msg); + } + + { + if (Kokkos::show_warnings() && !std::getenv("OMP_PROC_BIND")) { + std::cerr + << R"WARNING(Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set + In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads + For best performance with OpenMP 3.1 set OMP_PROC_BIND=true + For unit testing set OMP_PROC_BIND=false +)WARNING" << std::endl; + + if (Impl::mpi_detected()) { + std::cerr + << R"WARNING(MPI detected: For OpenMP binding to work as intended, MPI ranks must be bound to exclusive CPU sets. +)WARNING" << std::endl; + } + } + + // Before any other call to OMP query the maximum number of threads + // and save the value for re-initialization unit testing. 
+ + Impl::OpenMPInternal::hardware_max_threads = + Impl::OpenMPInternal::get_current_max_threads(); + + int process_num_threads = Impl::OpenMPInternal::hardware_max_threads; + + if (Kokkos::hwloc::available()) { + process_num_threads = Kokkos::hwloc::get_available_numa_count() * + Kokkos::hwloc::get_available_cores_per_numa() * + Kokkos::hwloc::get_available_threads_per_core(); + } + + // if thread_count < 0, use hardware_max_threads; + // if thread_count == 0, set hardware_max_threads to + // process_num_threads if thread_count > 0, set + // hardware_max_threads to thread_count + if (thread_count < 0) { + thread_count = Impl::OpenMPInternal::hardware_max_threads; + } else if (thread_count == 0) { + if (Impl::OpenMPInternal::hardware_max_threads != process_num_threads) { + Impl::OpenMPInternal::hardware_max_threads = process_num_threads; + omp_set_num_threads(Impl::OpenMPInternal::hardware_max_threads); + } + } else { + if (Kokkos::show_warnings() && thread_count > process_num_threads) { + std::cerr << "Kokkos::OpenMP::initialize WARNING: You are likely " + "oversubscribing your CPU cores.\n" + << " process threads available : " << std::setw(3) + << process_num_threads + << ", requested thread : " << std::setw(3) << thread_count + << std::endl; + } + Impl::OpenMPInternal::hardware_max_threads = thread_count; + omp_set_num_threads(Impl::OpenMPInternal::hardware_max_threads); + } + +// setup thread local +#pragma omp parallel num_threads(Impl::OpenMPInternal::hardware_max_threads) + { Impl::SharedAllocationRecord::tracking_enable(); } + } + + // Create the default instance. + Impl::OpenMPInternal::default_instance = + Impl::HostSharedPtr( + new Impl::OpenMPInternal(Impl::OpenMPInternal::hardware_max_threads)); + + // Check for over-subscription + auto const reported_ranks = Impl::mpi_ranks_per_node(); + auto const mpi_local_size = reported_ranks < 0 ? 
1 : reported_ranks; + int const procs_per_node = std::thread::hardware_concurrency(); + if (Kokkos::show_warnings() && + (mpi_local_size * long(thread_count) > procs_per_node)) { + std::cerr << "Kokkos::OpenMP::initialize WARNING: You are likely " + "oversubscribing your CPU cores." + << std::endl; + std::cerr << " Detected: " + << procs_per_node << " cores per node." << std::endl; + std::cerr << " Detected: " + << mpi_local_size << " MPI_ranks per node." << std::endl; + std::cerr << " Requested: " + << thread_count << " threads per process." << std::endl; + } } -void OpenMP::impl_finalize() { Impl::OpenMPInternal::singleton().finalize(); } +void OpenMP::impl_finalize() { + // Destroy the default instance. + Impl::OpenMPInternal::default_instance = nullptr; +} void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "Host Parallel Execution Space:\n"; @@ -51,13 +141,7 @@ void OpenMP::print_configuration(std::ostream &os, bool /*verbose*/) const { m_space_instance->print_configuration(os); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int OpenMP::concurrency(OpenMP const &instance) { - return instance.impl_thread_pool_size(); -} -#else int OpenMP::concurrency() const { return impl_thread_pool_size(); } -#endif void OpenMP::impl_static_fence(std::string const &name) { Kokkos::Tools::Experimental::Impl::profile_fence_event( @@ -75,20 +159,9 @@ void OpenMP::impl_static_fence(std::string const &name) { } void OpenMP::fence(const std::string &name) const { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - [this]() { - auto *internal_instance = this->impl_internal_space_instance(); - std::lock_guard lock(internal_instance->m_instance_mutex); - }); + impl_internal_space_instance()->fence(name); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -KOKKOS_DEPRECATED bool OpenMP::in_parallel(OpenMP const &exec_space) noexcept { - return exec_space.impl_internal_space_instance()->m_level 
< omp_get_level(); -} -#endif - int OpenMP::impl_thread_pool_size() const noexcept { return (impl_internal_space_instance()->get_level() < omp_get_level()) ? omp_get_num_threads() diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index edf46c3727c..edb519129b5 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -51,26 +51,21 @@ class OpenMP { using size_type = memory_space::size_type; using scratch_memory_space = ScratchMemorySpace; + KOKKOS_DEFAULTED_FUNCTION OpenMP(const OpenMP&) = default; + KOKKOS_FUNCTION OpenMP(OpenMP&& other) noexcept + : OpenMP(static_cast(other)) {} + KOKKOS_DEFAULTED_FUNCTION OpenMP& operator=(const OpenMP&) = default; + KOKKOS_FUNCTION OpenMP& operator=(OpenMP&& other) noexcept { + return *this = static_cast(other); + } + ~OpenMP(); OpenMP(); explicit OpenMP(int pool_size); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - template - KOKKOS_DEPRECATED_WITH_COMMENT( - "OpenMP execution space should be constructed explicitly.") - OpenMP(int pool_size) - : OpenMP(pool_size) {} -#endif - /// \brief Print configuration information to the given output stream. 
void print_configuration(std::ostream& os, bool verbose = false) const; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - /// \brief is the instance running a parallel algorithm - KOKKOS_DEPRECATED static bool in_parallel(OpenMP const& = OpenMP()) noexcept; -#endif - /// \brief Wait until all dispatched functors complete on the given instance /// /// This is a no-op on OpenMP @@ -79,22 +74,7 @@ class OpenMP { void fence(std::string const& name = "Kokkos::OpenMP::fence: Unnamed Instance Fence") const; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - /// \brief Does the given instance return immediately after launching - /// a parallel algorithm - /// - /// This always returns false on OpenMP - KOKKOS_DEPRECATED inline static bool is_asynchronous( - OpenMP const& = OpenMP()) noexcept { - return false; - } -#endif - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency(OpenMP const& = OpenMP()); -#else int concurrency() const; -#endif static void impl_initialize(InitializationSettings const&); diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp index 0f881e33d8d..c5ba54a0b12 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp @@ -19,23 +19,21 @@ import kokkos.core; #include #include -#include #include +#include #include #include -namespace { -int g_openmp_hardware_max_threads = 1; -} - namespace Kokkos { namespace Impl { std::vector OpenMPInternal::all_instances; std::mutex OpenMPInternal::all_instances_mutex; +HostSharedPtr OpenMPInternal::default_instance; +int OpenMPInternal::hardware_max_threads; int OpenMPInternal::max_hardware_threads() noexcept { - return g_openmp_hardware_max_threads; + return hardware_max_threads; } void OpenMPInternal::clear_thread_data() { @@ -130,11 +128,6 @@ void OpenMPInternal::resize_thread_data(size_t pool_reduce_bytes, } } -OpenMPInternal &OpenMPInternal::singleton() { - static OpenMPInternal 
self(get_current_max_threads()); - return self; -} - int OpenMPInternal::get_current_max_threads() noexcept { // Using omp_get_max_threads(); is problematic in conjunction with // Hwloc on Intel (essentially an initial call to the OpenMP runtime @@ -154,135 +147,51 @@ int OpenMPInternal::get_current_max_threads() noexcept { return count; } -void OpenMPInternal::initialize(int thread_count) { - if (m_initialized) { - Kokkos::abort( - "Calling OpenMP::initialize after OpenMP::finalize is illegal\n"); - } - - if (omp_in_parallel()) { - std::string msg("Kokkos::OpenMP::initialize ERROR : in parallel"); - Kokkos::Impl::throw_runtime_exception(msg); - } - +OpenMPInternal::OpenMPInternal(int arg_pool_size) + : m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() { + // guard pushing to all_instances { - if (Kokkos::show_warnings() && !std::getenv("OMP_PROC_BIND")) { - std::cerr - << R"WARNING(Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set - In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads - For best performance with OpenMP 3.1 set OMP_PROC_BIND=true - For unit testing set OMP_PROC_BIND=false -)WARNING" << std::endl; - - if (mpi_detected()) { - std::cerr - << R"WARNING(MPI detected: For OpenMP binding to work as intended, MPI ranks must be bound to exclusive CPU sets. -)WARNING" << std::endl; - } - } - - // Before any other call to OMP query the maximum number of threads - // and save the value for re-initialization unit testing. 
- - g_openmp_hardware_max_threads = get_current_max_threads(); - - int process_num_threads = g_openmp_hardware_max_threads; - - if (Kokkos::hwloc::available()) { - process_num_threads = Kokkos::hwloc::get_available_numa_count() * - Kokkos::hwloc::get_available_cores_per_numa() * - Kokkos::hwloc::get_available_threads_per_core(); - } - - // if thread_count < 0, use g_openmp_hardware_max_threads; - // if thread_count == 0, set g_openmp_hardware_max_threads to - // process_num_threads if thread_count > 0, set - // g_openmp_hardware_max_threads to thread_count - if (thread_count < 0) { - thread_count = g_openmp_hardware_max_threads; - } else if (thread_count == 0) { - if (g_openmp_hardware_max_threads != process_num_threads) { - g_openmp_hardware_max_threads = process_num_threads; - omp_set_num_threads(g_openmp_hardware_max_threads); - } - } else { - if (Kokkos::show_warnings() && thread_count > process_num_threads) { - std::cerr << "Kokkos::OpenMP::initialize WARNING: You are likely " - "oversubscribing your CPU cores.\n" - << " process threads available : " << std::setw(3) - << process_num_threads - << ", requested thread : " << std::setw(3) << thread_count - << std::endl; + if (omp_get_level() != 0) { + constexpr char msg[] = + "Kokkos::OpenMP instances can only be created outside OpenMP " + "regions!"; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + if (Kokkos::show_warnings()) { + std::cerr << msg << '\n'; } - g_openmp_hardware_max_threads = thread_count; - omp_set_num_threads(g_openmp_hardware_max_threads); - } - -// setup thread local -#pragma omp parallel num_threads(g_openmp_hardware_max_threads) - { Impl::SharedAllocationRecord::tracking_enable(); } - - auto &instance = OpenMPInternal::singleton(); - instance.m_pool_size = g_openmp_hardware_max_threads; - - // New, unified host thread team data: - { - size_t pool_reduce_bytes = static_cast(32) * thread_count; - size_t team_reduce_bytes = static_cast(32) * thread_count; - size_t team_shared_bytes = static_cast(1024) 
* thread_count; - size_t thread_local_bytes = 1024; - - instance.resize_thread_data(pool_reduce_bytes, team_reduce_bytes, - team_shared_bytes, thread_local_bytes); +#else + Kokkos::abort(msg); +#endif } + std::scoped_lock lock(all_instances_mutex); + all_instances.push_back(this); } - // Check for over-subscription - auto const reported_ranks = mpi_ranks_per_node(); - auto const mpi_local_size = reported_ranks < 0 ? 1 : reported_ranks; - int const procs_per_node = std::thread::hardware_concurrency(); - if (Kokkos::show_warnings() && - (mpi_local_size * long(thread_count) > procs_per_node)) { - std::cerr << "Kokkos::OpenMP::initialize WARNING: You are likely " - "oversubscribing your CPU cores." - << std::endl; - std::cerr << " Detected: " - << procs_per_node << " cores per node." << std::endl; - std::cerr << " Detected: " - << mpi_local_size << " MPI_ranks per node." << std::endl; - std::cerr << " Requested: " - << thread_count << " threads per process." << std::endl; + // New, unified host thread team data: + { + size_t pool_reduce_bytes = static_cast(32) * arg_pool_size; + size_t team_reduce_bytes = static_cast(32) * arg_pool_size; + size_t team_shared_bytes = static_cast(1024) * arg_pool_size; + size_t thread_local_bytes = 1024; + + resize_thread_data(pool_reduce_bytes, team_reduce_bytes, team_shared_bytes, + thread_local_bytes); } +} - m_initialized = true; +void OpenMPInternal::fence(const std::string &name) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + [this]() { std::lock_guard lock(m_instance_mutex); }); } -void OpenMPInternal::finalize() { +OpenMPInternal::~OpenMPInternal() { if (omp_in_parallel()) { - std::string msg("Kokkos::OpenMP::finalize ERROR "); - if (this != &singleton()) msg.append(": not initialized"); - if (omp_in_parallel()) msg.append(": in parallel"); + std::string msg("Kokkos::OpenMP::finalize ERROR : in parallel"); Kokkos::Impl::throw_runtime_exception(msg); 
} - if (this == &singleton()) { - auto const &instance = singleton(); - // Silence Cuda Warning - const int nthreads = instance.m_pool_size <= g_openmp_hardware_max_threads - ? g_openmp_hardware_max_threads - : instance.m_pool_size; - (void)nthreads; - -#pragma omp parallel num_threads(nthreads) - { Impl::SharedAllocationRecord::tracking_disable(); } - - // allow main thread to track - Impl::SharedAllocationRecord::tracking_enable(); - - g_openmp_hardware_max_threads = 1; - } - - m_initialized = false; + fence("Kokkos::OpenMPInternal: fence on destruction"); // guard erasing from all_instances { @@ -302,24 +211,13 @@ void OpenMPInternal::finalize() { void OpenMPInternal::print_configuration(std::ostream &s) const { s << "Kokkos::OpenMP"; - if (m_initialized) { - const int numa_count = 1; - const int core_per_numa = g_openmp_hardware_max_threads; - const int thread_per_core = 1; + const int numa_count = 1; + const int core_per_numa = hardware_max_threads; + const int thread_per_core = 1; - s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa - << " x " << thread_per_core << " ]" << std::endl; - } else { - s << " not initialized" << std::endl; - } + s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa + << " x " << thread_per_core << " ]" << std::endl; } -bool OpenMPInternal::verify_is_initialized(const char *const label) const { - if (!m_initialized) { - std::cerr << "Kokkos::OpenMP " << label - << " : ERROR OpenMP is not initialized" << std::endl; - } - return m_initialized; -} } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index 0a925ddc2bb..e913ca623d4 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -40,29 +40,11 @@ struct OpenMPTraits { class OpenMPInternal { private: - OpenMPInternal(int arg_pool_size) - : 
m_pool_size{arg_pool_size}, m_level{omp_get_level()}, m_pool() { - // guard pushing to all_instances - { -#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 - if (omp_get_level() != 0) - Kokkos::abort( - "Kokkos::OpenMP instances can only be created outside OpenMP " - "regions!"); -#endif - std::scoped_lock lock(all_instances_mutex); - all_instances.push_back(this); - } - } - - OpenMPInternal() = delete; OpenMPInternal(const OpenMPInternal&) = delete; OpenMPInternal& operator=(const OpenMPInternal&) = delete; static int get_current_max_threads() noexcept; - bool m_initialized = false; - int m_pool_size; int m_level; @@ -71,11 +53,10 @@ class OpenMPInternal { public: friend class Kokkos::OpenMP; - static OpenMPInternal& singleton(); - - void initialize(int thread_cound); + OpenMPInternal(int arg_pool_size); + ~OpenMPInternal(); - void finalize(); + void fence(const std::string&); void clear_thread_data(); @@ -96,16 +77,15 @@ class OpenMPInternal { int get_level() const { return m_level; } - bool is_initialized() const { return m_initialized; } - - bool verify_is_initialized(const char* const label) const; - void print_configuration(std::ostream& s) const; std::mutex m_instance_mutex; + static HostSharedPtr default_instance; + static std::vector all_instances; static std::mutex all_instances_mutex; + static int hardware_max_threads; }; inline bool execute_in_serial(OpenMP const& space = OpenMP()) { diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp index a732b7b2f5e..cb0c7292f75 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_For.hpp @@ -7,6 +7,7 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -378,6 +379,28 @@ class ParallelFor, FunctorTeamShmemSize::value( m_functor, 
m_policy.team_size())) { m_instance = m_policy.space().impl_internal_space_instance(); + + if ((m_policy.scratch_size(0) + FunctorTeamShmemSize::value( + m_functor, m_policy.team_size())) > + static_cast(TeamPolicy::scratch_size_max(0))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch " + "memory on level 0. Requested: " + << m_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + m_functor, m_policy.team_size()) + << ", Maximum: " << TeamPolicy::scratch_size_max(0); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + if (m_policy.scratch_size(1) > + static_cast(TeamPolicy::scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch " + "memory on level 1. Requested: " + << m_policy.scratch_size(1) + << ", Maximum: " << TeamPolicy::scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } } }; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp index 23afd1d54bb..1c191858fb9 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel_Reduce.hpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -513,6 +514,29 @@ class ParallelReduce::accessible, "Kokkos::OpenMP reduce result must be a View accessible from " "HostSpace"); + + if ((arg_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + m_functor_reducer.get_functor(), arg_policy.team_size())) > + static_cast(TeamPolicy::scratch_size_max(0))) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too much scratch " + "memory on level 0. 
Requested: " + << arg_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + m_functor_reducer.get_functor(), arg_policy.team_size()) + << ", Maximum: " << TeamPolicy::scratch_size_max(0); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + if (arg_policy.scratch_size(1) > + static_cast(TeamPolicy::scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too much scratch " + "memory on level 1. Requested: " + << arg_policy.scratch_size(1) + << ", Maximum: " << TeamPolicy::scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } } }; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp index ccff5df31fe..0efd572061b 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Team.hpp @@ -116,8 +116,13 @@ class TeamPolicyInternal m_league_size = league_size_request; - if (team_size_request > team_max) - Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); + if (team_size_request > team_max) { + std::stringstream error; + error << "Kokkos::TeamPolicy: Requested too large team size. " + "Requested: " + << team_size_request << ", Maximum: " << team_max; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } m_team_size = team_size_request < team_max ? 
team_size_request : team_max; // Round team size up to a multiple of 'team_gain' diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp deleted file mode 100644 index fdb9186d384..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget.hpp +++ /dev/null @@ -1,138 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_OPENMPTARGET_HPP -#define KOKKOS_OPENMPTARGET_HPP - -#include - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Experimental { -namespace Impl { -class OpenMPTargetInternal; -} - -/// \class OpenMPTarget -/// \brief Kokkos device for multicore processors in the host memory space. -class OpenMPTarget { - public: - //------------------------------------ - //! \name Type declarations that all Kokkos devices must provide. - //@{ - - //! Tag this class as a kokkos execution space - using execution_space = OpenMPTarget; - using memory_space = OpenMPTargetSpace; - //! 
This execution space preferred device_type - using device_type = Kokkos::Device; - - using array_layout = LayoutLeft; - using size_type = memory_space::size_type; - - using scratch_memory_space = ScratchMemorySpace; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED inline static bool in_parallel() { - return omp_in_parallel(); - } -#endif - - static void fence(const std::string& name = - "Kokkos::OpenMPTarget::fence: Unnamed Instance Fence"); - - static void impl_static_fence(const std::string& name); - - /** \brief Return the maximum amount of concurrency. */ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency(); -#else - int concurrency() const; -#endif - - //! Print configuration information to the given output stream. - void print_configuration(std::ostream& os, bool verbose = false) const; - - static const char* name(); - - //! Free any resources being consumed by the device. - static void impl_finalize(); - - //! Initialize, telling the CUDA run-time library which device to use. 
- static void impl_initialize(InitializationSettings const&); - - inline Impl::OpenMPTargetInternal* impl_internal_space_instance() const { - return m_space_instance; - } - - OpenMPTarget(); - uint32_t impl_instance_id() const noexcept; - - private: - friend bool operator==(OpenMPTarget const& lhs, OpenMPTarget const& rhs) { - return lhs.impl_internal_space_instance() == - rhs.impl_internal_space_instance(); - } - friend bool operator!=(OpenMPTarget const& lhs, OpenMPTarget const& rhs) { - return !(lhs == rhs); - } - Impl::OpenMPTargetInternal* m_space_instance; -}; -} // namespace Experimental - -namespace Impl { -template <> -struct MemorySpaceAccess< - Kokkos::Experimental::OpenMPTargetSpace, - Kokkos::Experimental::OpenMPTarget::scratch_memory_space> { - enum : bool { assignable = false }; - enum : bool { accessible = true }; - enum : bool { deepcopy = false }; -}; -} // namespace Impl - -namespace Tools { -namespace Experimental { -template <> -struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { - static constexpr DeviceType id = - ::Kokkos::Profiling::Experimental::DeviceType::OpenMPTarget; - static int device_id(const Kokkos::Experimental::OpenMPTarget&) { - return omp_get_default_device(); - } -}; -} // namespace Experimental -} // namespace Tools - -} // namespace Kokkos - -/*--------------------------------------------------------------------------*/ -/*--------------------------------------------------------------------------*/ - -#include -#include -#include - -/*--------------------------------------------------------------------------*/ - -#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( _OPENMP ) */ -#endif /* #ifndef KOKKOS_OPENMPTARGET_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp deleted file mode 100644 index 838af4d0549..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp +++ /dev/null @@ 
-1,110 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#include -#include - -/*--------------------------------------------------------------------------*/ - -#include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Experimental { -/* Default allocation mechanism */ -OpenMPTargetSpace::OpenMPTargetSpace() {} - -void* OpenMPTargetSpace::impl_allocate( - - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - static_assert(sizeof(void*) == sizeof(uintptr_t), - "Error sizeof(void*) != sizeof(uintptr_t)"); - - void* ptr = omp_target_alloc(arg_alloc_size, omp_get_default_device()); - - if (!ptr) { - Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); - } - - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? 
arg_logical_size : arg_alloc_size; - Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); - } - - return ptr; -} - -void* OpenMPTargetSpace::allocate(const size_t arg_alloc_size) const { - return allocate("[unlabeled]", arg_alloc_size); -} - -void* OpenMPTargetSpace::allocate(const char* arg_label, - const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); -} - -void OpenMPTargetSpace::impl_deallocate( - const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { - if (Kokkos::Profiling::profileLibraryLoaded()) { - const size_t reported_size = - (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; - Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, - reported_size); - } - if (arg_alloc_ptr) { - omp_target_free(arg_alloc_ptr, omp_get_default_device()); - } -} - -void OpenMPTargetSpace::deallocate(void* const arg_alloc_ptr, - const size_t arg_alloc_size) const { - deallocate("[unlabeled]", arg_alloc_ptr, arg_alloc_size); -} - -void OpenMPTargetSpace::deallocate(const char* arg_label, - void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size) const - -{ - impl_deallocate(arg_label, arg_alloc_ptr, arg_alloc_size, arg_logical_size); -} - -} // namespace Experimental -} // namespace Kokkos - -//============================================================================== -// {{{1 - -#include - -KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::OpenMPTargetSpace); - -// end Explicit instantiations of CRTP Base classes }}}1 -//============================================================================== diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp deleted 
file mode 100644 index 82568ef31f9..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ /dev/null @@ -1,133 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_OPENMPTARGETSPACE_HPP -#define KOKKOS_OPENMPTARGETSPACE_HPP - -#include -#include -#include -#include - -#include -#include - -#ifdef KOKKOS_ENABLE_OPENMPTARGET - -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -//---------------------------------------- - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = false }; - enum : bool { accessible = false }; - enum : bool { deepcopy = true }; -}; - -//---------------------------------------- - -template <> -struct MemorySpaceAccess { - enum : bool { assignable = false }; - enum : bool { accessible = false }; - enum : bool { deepcopy = true }; -}; - -//---------------------------------------- -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { -namespace Experimental { - -/// \class OpenMPTargetSpace -/// \brief Memory management for host memory. -/// -/// OpenMPTargetSpace is a memory space that governs host memory. "Host" -/// memory means the usual CPU-accessible memory. -class OpenMPTargetSpace { - public: - //! Tag this class as a kokkos memory space - using memory_space = OpenMPTargetSpace; - using size_type = unsigned; - - /// \typedef execution_space - /// \brief Default execution space for this memory space. - /// - /// Every memory space has a default execution space. This is - /// useful for things like initializing a View (which happens in - /// parallel using the View's default execution space). - using execution_space = Kokkos::Experimental::OpenMPTarget; - - //! 
This memory space preferred device_type - using device_type = Kokkos::Device; - - /*--------------------------------*/ - - /**\brief Default memory space instance */ - OpenMPTargetSpace(); - OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; - OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; - OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; - OpenMPTargetSpace& operator=(const OpenMPTargetSpace&) = default; - ~OpenMPTargetSpace() = default; - - /**\brief Allocate untracked memory in the space */ - // FIXME_OPENMPTARGET Use execution space instance - void* allocate(const OpenMPTarget&, const size_t arg_alloc_size) const { - return allocate(arg_alloc_size); - } - // FIXME_OPENMPTARGET Use execution space instance - void* allocate(const OpenMPTarget&, const char* arg_label, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return allocate(arg_label, arg_alloc_size, arg_logical_size); - } - void* allocate(const size_t arg_alloc_size) const; - void* allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - /**\brief Deallocate untracked memory in the space */ - void deallocate(void* const arg_alloc_ptr, - const std::size_t arg_alloc_size) const; - void deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const; - - static constexpr const char* name() { return "OpenMPTargetSpace"; } - - private: - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; - void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, - const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; -}; -} // namespace Experimental -} // namespace Kokkos - 
-//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::OpenMPTargetSpace); - -#endif -#endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Abort.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Abort.hpp deleted file mode 100644 index 04bd716b4a4..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Abort.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_ABORT_HPP -#define KOKKOS_OPENMPTARGET_ABORT_HPP - -#include -#ifdef KOKKOS_ENABLE_OPENMPTARGET - -namespace Kokkos { -namespace Impl { - -KOKKOS_INLINE_FUNCTION void OpenMPTarget_abort(char const *msg) { - fprintf(stderr, "%s.\n", msg); - std::abort(); -} - -} // namespace Impl -} // namespace Kokkos - -#endif -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp deleted file mode 100644 index faaa218798c..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp +++ /dev/null @@ -1,88 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_OPENMPTARGET_DEEP_COPY_HPP -#define KOKKOS_OPENMPTARGET_DEEP_COPY_HPP - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// 
TODO: implement all possible deep_copies -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - // In the Release and RelWithDebInfo builds, the size of the memcpy should - // be greater than zero to avoid error. omp_target_memcpy returns zero on - // success. - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence " - "before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } -}; - -} // namespace Impl -} // namespace Kokkos - -#endif // KOKKOS_OPENMPTARGET_DEEP_COPY_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp 
b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp deleted file mode 100644 index fc8f9dfa320..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Error.hpp +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_ERROR_HPP -#define KOKKOS_OPENMPTARGET_ERROR_HPP - -#include -#include - -namespace Kokkos { -namespace Impl { - -inline void ompt_internal_safe_call(int e, const char* name, - const char* file = nullptr, - const int line = 0) { - if (e != 0) { - std::ostringstream out; - out << name << " return value of " << e << " indicates failure"; - if (file) { - out << " " << file << ":" << line; - } - throw_runtime_exception(out.str()); - } -} - -#define KOKKOS_IMPL_OMPT_SAFE_CALL(call) \ - Kokkos::Impl::ompt_internal_safe_call(call, #call, __FILE__, __LINE__) - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp deleted file mode 100644 index 74e126f0144..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP -#define KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP - -#include -#include - -namespace Kokkos::Experimental::Impl { - -template -class FunctorAdapter { - Functor m_functor; - using WorkTag = typename Policy::work_tag; - - public: - FunctorAdapter() = default; - FunctorAdapter(Functor const &functor) : m_functor(functor) {} - - Functor get_functor() const { return m_functor; } - - template - KOKKOS_FUNCTION void operator()(Args &&...args) const { - if constexpr (std::is_void_v) { - 
m_functor(static_cast(args)...); - } else { - m_functor(WorkTag(), static_cast(args)...); - } - } -}; - -} // namespace Kokkos::Experimental::Impl - -#endif // KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp deleted file mode 100644 index 926b7d26ac6..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ /dev/null @@ -1,269 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include -#include - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) - -// FIXME_OPENMPTARGET - macro for workaround implementation in UniqueToken -// constructor. undef'ed at the end -#define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND - -#include -#ifdef KOKKOS_ENABLE_EXPERIMENTAL_CXX20_MODULES -import kokkos.core; -#else -#include -#endif -#include -#include -#include -#include - -#include - -namespace Kokkos { -namespace Experimental { -namespace Impl { -uint32_t OpenMPTargetInternal::impl_get_instance_id() const noexcept { - return m_instance_id; -} - -void OpenMPTargetInternal::fence(openmp_fence_is_static is_static) { - fence( - "Kokkos::Experimental::Impl::OpenMPTargetInternal::fence: Unnamed " - "Internal Fence", - is_static); -} -void OpenMPTargetInternal::fence(const std::string& name, - openmp_fence_is_static is_static) { - if (is_static == openmp_fence_is_static::no) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::OpenMPTarget>( - name, - Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{ - impl_get_instance_id()}, - [&]() {}); - } else { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::OpenMPTarget>( - name, - 
Kokkos::Tools::Experimental::SpecialSynchronizationCases:: - GlobalDeviceSynchronization, - [&]() {}); - } -} -int OpenMPTargetInternal::concurrency() const { - int max_threads_sm = 2048; - int max_threads = max_threads_sm * 80; -#if defined(KOKKOS_ARCH_AMPERE86) - max_threads = max_threads_sm * 84; -#elif defined(KOKKOS_ARCH_AMPERE87) - max_threads_sm = 1024; - max_threads = max_threads_sm * 32; // Orin Nano cores -#elif defined(KOKKOS_ARCH_AMPERE80) - return max_threads_sm * 108; -#elif defined(KOKKOS_ARCH_VOLTA72) - return max_threads_sm * 84; -#elif defined(KOKKOS_ARCH_VOLTA70) - return max_threads_sm * 80; -#elif defined(KOKKOS_ARCH_PASCAL60) || defined(KOKKOS_ARCH_PASCAL61) - return max_threads_sm * 60; -#endif - - return max_threads; -} -const char* OpenMPTargetInternal::name() { return "OpenMPTarget"; } -void OpenMPTargetInternal::print_configuration(std::ostream& os, - bool /*verbose*/) const { - // FIXME_OPENMPTARGET - os << "Using OpenMPTarget\n"; -} - -void OpenMPTargetInternal::impl_finalize() { - if (m_uniquetoken_ptr != nullptr) - Kokkos::kokkos_free( - m_uniquetoken_ptr); -} - -void OpenMPTargetInternal::impl_initialize() { - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures -#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - omp_set_num_teams(512); -#endif -} - -OpenMPTargetInternal* OpenMPTargetInternal::impl_singleton() { - static OpenMPTargetInternal self; - return &self; -} - -void OpenMPTargetInternal::verify_is_process(const char* const label) { - // Fails if the current task is in a parallel region or is not on the host. 
- if (omp_in_parallel() && (!omp_is_initial_device())) { - std::string msg(label); - msg.append(" ERROR: in parallel or on device"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void OpenMPTargetInternal::clear_scratch() { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_ptr = nullptr; - m_scratch_size = 0; -} - -void* OpenMPTargetInternal::get_scratch_ptr() { return m_scratch_ptr; } - -void OpenMPTargetInternal::resize_scratch(int64_t team_size, - int64_t shmem_size_L0, - int64_t shmem_size_L1, - int64_t league_size) { - Kokkos::Experimental::OpenMPTargetSpace space; - // Level-0 scratch when using clang/17 and higher comes from their OpenMP - // extension, `ompx_dyn_cgroup_mem`. -#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) - shmem_size_L0 = 0; -#endif - const int64_t shmem_size = - shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. - const int64_t padding = shmem_size * 10 / 100; // Padding per team. - - // Maximum active teams possible. - // The number should not exceed the maximum in-flight teams possible or the - // league_size. - int max_active_teams = - std::min(OpenMPTargetInternal::concurrency() / team_size, league_size); - - // max_active_teams is the number of active teams on the given hardware. - // We set the number of teams to be twice the number of max_active_teams for - // the compiler to pick the right number in its case. - omp_set_num_teams(max_active_teams * 2); - - // Total amount of scratch memory allocated is depenedent - // on the maximum number of in-flight teams possible. 
- int64_t total_size = - (shmem_size + - ::Kokkos::Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - max_active_teams * 2; - - if (total_size > m_scratch_size) { - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_size = total_size; - m_scratch_ptr = space.allocate(total_size); - } -} - -} // namespace Impl - -OpenMPTarget::OpenMPTarget() - : m_space_instance(Impl::OpenMPTargetInternal::impl_singleton()) {} - -const char* OpenMPTarget::name() { - return Impl::OpenMPTargetInternal::impl_singleton()->name(); -} -void OpenMPTarget::print_configuration(std::ostream& os, bool verbose) const { - os << "OpenMPTarget Execution Space:\n"; - os << " KOKKOS_ENABLE_OPENMPTARGET: yes\n"; - - os << "\nOpenMPTarget Runtime Configuration:\n"; - - m_space_instance->print_configuration(os, verbose); -} - -uint32_t OpenMPTarget::impl_instance_id() const noexcept { - return m_space_instance->impl_get_instance_id(); -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int OpenMPTarget::concurrency() { - return Impl::OpenMPTargetInternal::impl_singleton()->concurrency(); -} -#else -int OpenMPTarget::concurrency() const { - return m_space_instance->concurrency(); -} -#endif - -void OpenMPTarget::fence(const std::string& name) { - Impl::OpenMPTargetInternal::impl_singleton()->fence(name); -} - -void OpenMPTarget::impl_static_fence(const std::string& name) { - Impl::OpenMPTargetInternal::impl_singleton()->fence( - name, Kokkos::Experimental::Impl::openmp_fence_is_static::yes); -} - -void OpenMPTarget::impl_initialize(InitializationSettings const& settings) { - using Kokkos::Impl::get_visible_devices; - std::vector const& visible_devices = get_visible_devices(); - using Kokkos::Impl::get_gpu; - const int device_num = get_gpu(settings).value_or(visible_devices[0]); - omp_set_default_device(device_num); - - Impl::OpenMPTargetInternal::impl_singleton()->impl_initialize(); -} -void OpenMPTarget::impl_finalize() { - 
Impl::OpenMPTargetInternal::impl_singleton()->impl_finalize(); -} -} // Namespace Experimental - -namespace Impl { -int g_openmptarget_space_factory_initialized = - Kokkos::Impl::initialize_space_factory( - "160_OpenMPTarget"); - -} // namespace Impl -} // Namespace Kokkos - -namespace Kokkos { -namespace Experimental { - -UniqueToken:: - UniqueToken(Kokkos::Experimental::OpenMPTarget const& space) { -#ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND - uint32_t* ptr = space.impl_internal_space_instance()->m_uniquetoken_ptr; - int count = Kokkos::Experimental::OpenMPTarget().concurrency(); - if (ptr == nullptr) { - int size = count * sizeof(uint32_t); - ptr = static_cast( - Kokkos::kokkos_malloc( - "Kokkos::OpenMPTarget::m_uniquetoken_ptr", size)); - std::vector h_buf(count, 0); - if (0 < size) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(ptr, h_buf.data(), size, 0, - 0, omp_get_default_device(), - omp_get_initial_device())); - - space.impl_internal_space_instance()->m_uniquetoken_ptr = ptr; - } -#else -// FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr` -// with 0's -// Version 1 - Creating a target region and filling the -// pointer Error - CUDA error: named symbol not found -#pragma omp target teams distribute parallel for is_device_ptr(ptr) \ - map(to : size) - for (int i = 0; i < count; ++i) ptr[i] = 0; - - // Version 2 : Allocating a view on the device and filling it with a scalar - // value of 0. 
- Kokkos::View ptr_view( - ptr, count); - Kokkos::deep_copy(ptr_view, 0); -#endif - m_buffer = ptr; - m_count = count; -} -} // namespace Experimental -} // namespace Kokkos - -#undef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND -#endif // defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(_OPENMP) diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp deleted file mode 100644 index a71838fa6e7..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.hpp +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_INSTANCE_HPP -#define KOKKOS_OPENMPTARGET_INSTANCE_HPP - -namespace Kokkos { -namespace Experimental { -namespace Impl { - -enum class openmp_fence_is_static { yes, no }; - -class OpenMPTargetInternal { - private: - OpenMPTargetInternal() = default; - OpenMPTargetInternal(const OpenMPTargetInternal&) = delete; - OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = delete; - - public: - void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no); - void fence(const std::string& name, - openmp_fence_is_static is_static = openmp_fence_is_static::no); - - /** \brief Return the maximum amount of concurrency. */ - int concurrency() const; - - //! Print configuration information to the given output stream. - void print_configuration(std::ostream& os, bool verbose) const; - - static const char* name(); - - //! Free any resources being consumed by the device. - void impl_finalize(); - - uint32_t impl_get_instance_id() const noexcept; - //! Initialize, telling the CUDA run-time library which device to use. 
- void impl_initialize(); - - static OpenMPTargetInternal* impl_singleton(); - - static void verify_is_process(const char* const); - - void* get_scratch_ptr(); - void clear_scratch(); - void resize_scratch(int64_t team_reduce_bytes, int64_t team_shared_bytes, - int64_t thread_local_bytes, int64_t league_size); - - void* m_scratch_ptr = nullptr; - std::mutex m_mutex_scratch_ptr; - int64_t m_scratch_size = 0; - uint32_t* m_uniquetoken_ptr = nullptr; - - private: - uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< - Kokkos::Experimental::OpenMPTarget>(reinterpret_cast(this)); -}; -} // Namespace Impl -} // Namespace Experimental -} // Namespace Kokkos - -#endif // KOKKOS_OPENMPTARGET_INSTANCE_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp deleted file mode 100644 index 53344105225..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_MDRangePolicy.hpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_MDRANGEPOLICY_HPP_ -#define KOKKOS_OPENMPTARGET_MDRANGEPOLICY_HPP_ - -#include - -namespace Kokkos { -namespace Impl { - -using OpenMPTargetIterateLeft = std::integral_constant; -using OpenMPTargetIterateRight = - std::integral_constant; - -template -struct ThreadAndVectorNestLevel - : AcceleratorBasedNestLevel {}; - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp deleted file mode 100644 index e86b995483e..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Macros.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to 
the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_MACROS_HPP -#define KOKKOS_OPENMPTARGET_MACROS_HPP - -// Define a macro for llvm compiler greater than version 17 and on NVIDIA and -// AMD GPUs. This would be useful in cases where non-OpenMP standard llvm -// extensions can be used. -#if defined(KOKKOS_COMPILER_CLANG) && (KOKKOS_COMPILER_CLANG >= 1700) && \ - (defined(KOKKOS_ARCH_AMD_GPU) || defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)) -#define KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS -#endif - -#define KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(x) _Pragma(#x) -#define KOKKOS_IMPL_OMPTARGET_PRAGMA(x) \ - KOKKOS_IMPL_OPENMPTARGET_PRAGMA_HELPER(omp target x) - -// Use scratch memory extensions to request dynamic shared memory for the -// right compiler/architecture combination. -#ifdef KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS -#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) ompx_dyn_cgroup_mem(N) -#else -#define KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(N) -#endif - -#endif // KOKKOS_OPENMPTARGET_MACROS_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp deleted file mode 100644 index 67da6868dd7..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ /dev/null @@ -1,741 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_HPP - -#include -#include -#include -#include - -#include -#include -#include "Kokkos_OpenMPTarget_Abort.hpp" -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -class OpenMPTargetExecTeamMember { - public: - static constexpr int TEAM_REDUCE_SIZE = 512; - - using execution_space = Kokkos::Experimental::OpenMPTarget; 
- using scratch_memory_space = execution_space::scratch_memory_space; - using team_handle = OpenMPTargetExecTeamMember; - - scratch_memory_space m_team_shared; - size_t m_team_scratch_size[2]; - int m_team_rank; - int m_team_size; - int m_league_rank; - int m_league_size; - int m_vector_length; - int m_vector_lane; - int m_shmem_block_index; - void* m_glb_scratch; - void* m_reduce_scratch; - - public: - KOKKOS_INLINE_FUNCTION - const execution_space::scratch_memory_space& team_shmem() const { - return m_team_shared.set_team_thread_mode(0, 1, 0); - } - - // set_team_thread_mode routine parameters for future understanding: - // first parameter - scratch level. - // second parameter - size multiplier for advancing scratch ptr after a - // request was serviced. third parameter - offset size multiplier from current - // scratch ptr when returning a ptr for a request. - KOKKOS_INLINE_FUNCTION - const execution_space::scratch_memory_space& team_scratch(int level) const { - return m_team_shared.set_team_thread_mode(level, 1, 0); - } - - KOKKOS_INLINE_FUNCTION - const execution_space::scratch_memory_space& thread_scratch(int level) const { - return m_team_shared.set_team_thread_mode(level, team_size(), team_rank()); - } - - KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; } - KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; } - KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; } - KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; } - KOKKOS_INLINE_FUNCTION void* impl_reduce_scratch() const { - return m_reduce_scratch; - } - - KOKKOS_INLINE_FUNCTION void team_barrier() const { -#pragma omp barrier - } - - template - KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType& value, - int thread_id) const { - // Make sure there is enough scratch space: - using type = std::conditional_t<(sizeof(ValueType) < TEAM_REDUCE_SIZE), - ValueType, void>; - type* team_scratch = - 
reinterpret_cast(static_cast(m_glb_scratch) + - TEAM_REDUCE_SIZE * omp_get_team_num()); -#pragma omp barrier - if (team_rank() == thread_id) *team_scratch = value; -#pragma omp barrier - value = *team_scratch; - } - - template - KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure& f, ValueType& value, - const int& thread_id) const { - f(value); - team_broadcast(value, thread_id); - } - - template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> - team_reduce(ReducerType const& reducer) const noexcept { - team_reduce(reducer, reducer.reference()); - } - - // FIXME_OPENMPTARGET this function currently ignores the reducer passed. - template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> - team_reduce(ReducerType const&, - typename ReducerType::value_type& value) const noexcept { -#pragma omp barrier - - using value_type = typename ReducerType::value_type; - // const JoinLambdaAdapter op(op_in); - - // Make sure there is enough scratch space: - using type = std::conditional_t<(sizeof(value_type) < TEAM_REDUCE_SIZE), - value_type, void>; - - const int n_values = TEAM_REDUCE_SIZE / sizeof(value_type); - type* team_scratch = - reinterpret_cast(static_cast(m_glb_scratch) + - TEAM_REDUCE_SIZE * omp_get_team_num()); - for (int i = m_team_rank; i < n_values; i += m_team_size) { - team_scratch[i] = value_type(); - } - -#pragma omp barrier - - for (int k = 0; k < m_team_size; k += n_values) { - if ((k <= m_team_rank) && (k + n_values > m_team_rank)) - team_scratch[m_team_rank % n_values] += value; -#pragma omp barrier - } - - for (int d = 1; d < n_values; d *= 2) { - if ((m_team_rank + d < n_values) && (m_team_rank % (2 * d) == 0)) { - team_scratch[m_team_rank] += team_scratch[m_team_rank + d]; - } -#pragma omp barrier - } - value = team_scratch[0]; - } - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering - * with intra-team non-deterministic ordering accumulation. 
- * - * The global inter-team accumulation value will, at the end of the - * league's parallel execution, be the scan's total. - * Parallel execution ordering of the league's teams is non-deterministic. - * As such the base value for each team's scan operation is similarly - * non-deterministic. - */ - template - KOKKOS_INLINE_FUNCTION ArgType - team_scan(const ArgType& /*value*/, ArgType* const /*global_accum*/) const { - // FIXME_OPENMPTARGET - /* // Make sure there is enough scratch space: - using type = - std::conditional_t<(sizeof(ArgType) < TEAM_REDUCE_SIZE), ArgType, void>; - - volatile type * const work_value = ((type*) m_exec.scratch_thread()); - - *work_value = value ; - - memory_fence(); - - if ( team_fan_in() ) { - // The last thread to synchronize returns true, all other threads wait - for team_fan_out() - // m_team_base[0] == highest ranking team member - // m_team_base[ m_team_size - 1 ] == lowest ranking team member - // - // 1) copy from lower to higher rank, initialize lowest rank to zero - // 2) prefix sum from lowest to highest rank, skipping lowest rank - - type accum = 0 ; - - if ( global_accum ) { - for ( int i = m_team_size ; i-- ; ) { - type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i - )->scratch_thread()); accum += val ; - } - accum = atomic_fetch_add( global_accum , accum ); - } - - for ( int i = m_team_size ; i-- ; ) { - type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i - )->scratch_thread()); const type offset = accum ; accum += val ; val = - offset ; - } - - memory_fence(); - } - - team_fan_out(); - - return *work_value ;*/ - return ArgType(); - } - - /** \brief Intra-team exclusive prefix sum with team_rank() ordering. 
- * - * The highest rank thread can compute the reduction total as - * reduction_total = dev.team_scan( value ) + value ; - */ - template - KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const { - return this->template team_scan(value, 0); - } - - //---------------------------------------- - // Private for the driver - - private: - using space = execution_space::scratch_memory_space; - - public: - // FIXME_OPENMPTARGET - 512(16*32) bytes at the begining of the scratch space - // for each league is saved for reduction. It should actually be based on the - // ValueType of the reduction variable. - inline OpenMPTargetExecTeamMember( - const int league_rank, const int league_size, const int team_size, - const int vector_length // const TeamPolicyInternal< OpenMPTarget, - // Properties ...> & team - , - void* const glb_scratch, const int shmem_block_index, - const size_t shmem_size_L0, const size_t shmem_size_L1) - : m_team_scratch_size{shmem_size_L0, shmem_size_L1}, - m_team_rank(0), - m_team_size(team_size), - m_league_rank(league_rank), - m_league_size(league_size), - m_vector_length(vector_length), - m_shmem_block_index(shmem_block_index), - m_glb_scratch(glb_scratch) { - const int omp_tid = omp_get_thread_num(); - - // The scratch memory allocated is a sum of TEAM_REDUCE_SIZE, L0 shmem size - // and L1 shmem size. TEAM_REDUCE_SIZE = 512 bytes saved per team for - // hierarchical reduction. There is an additional 10% of the requested - // scratch memory allocated per team as padding. Hence the product with 0.1. - // - // Use llvm extensions for dynamic shared memory with compilers/architecture - // combinations where it is supported. - // - // Size allocated in HBM will now change based on whether we use llvm - // extensions. 
-#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) - const int total_shmem = shmem_size_L1 + shmem_size_L1 * 0.1; -#else - const int total_shmem = - shmem_size_L0 + shmem_size_L1 + (shmem_size_L0 + shmem_size_L1) * 0.1; -#endif - - // Per team offset for buffer in HBM. - const int reduce_offset = - m_shmem_block_index * (total_shmem + TEAM_REDUCE_SIZE); - -#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) - const int l1_offset = reduce_offset + TEAM_REDUCE_SIZE; - char* l0_scratch = - static_cast(llvm_omp_target_dynamic_shared_alloc()); - m_team_shared = scratch_memory_space( - l0_scratch, shmem_size_L0, static_cast(glb_scratch) + l1_offset, - shmem_size_L1); -#else - const int l0_offset = reduce_offset + TEAM_REDUCE_SIZE; - const int l1_offset = l0_offset + shmem_size_L0; - m_team_shared = scratch_memory_space( - (static_cast(glb_scratch) + l0_offset), shmem_size_L0, - static_cast(glb_scratch) + l1_offset, shmem_size_L1); -#endif - m_reduce_scratch = static_cast(glb_scratch) + reduce_offset; - m_league_rank = league_rank; - m_team_rank = omp_tid; - m_vector_lane = 0; - } - - static inline int team_reduce_size() { return TEAM_REDUCE_SIZE; } -}; - -template -class TeamPolicyInternal - : public PolicyTraits { - public: - //! 
Tag this class as a kokkos execution policy - using execution_policy = TeamPolicyInternal; - - using traits = PolicyTraits; - - //---------------------------------------- - - template - inline static int team_size_max(const FunctorType&, const ParallelForTag&) { - return 256; - } - - template - inline static int team_size_max(const FunctorType&, - const ParallelReduceTag&) { - return 256; - } - - template - inline static int team_size_max(const FunctorType&, const ReducerType&, - const ParallelReduceTag&) { - return 256; - } - - template - inline static int team_size_recommended(const FunctorType&, - const ParallelForTag&) { - return 128; - } - - template - inline static int team_size_recommended(const FunctorType&, - const ParallelReduceTag&) { - return 128; - } - - template - inline static int team_size_recommended(const FunctorType&, - const ReducerType&, - const ParallelReduceTag&) { - return 128; - } - - //---------------------------------------- - - private: - int m_league_size; - int m_team_size; - int m_vector_length; - int m_team_alloc; - int m_team_iter; - std::array m_team_scratch_size; - std::array m_thread_scratch_size; - bool m_tune_team_size; - bool m_tune_vector_length; - constexpr const static size_t default_team_size = 256; - int m_chunk_size; - - inline void init(const int league_size_request, const int team_size_request, - const int vector_length_request) { - m_league_size = league_size_request; - - // Minimum team size should be 32 for OpenMPTarget backend. - if (team_size_request < 32) { - Kokkos::Impl::OpenMPTarget_abort( - "OpenMPTarget backend requires a minimum of 32 threads per team.\n"); - } else - m_team_size = team_size_request; - - m_vector_length = vector_length_request; - set_auto_chunk_size(); - } - - template - friend class TeamPolicyInternal; - - public: - // FIXME_OPENMPTARGET : Currently this routine is a copy of the Cuda - // implementation, but this has to be tailored to be architecture specific. 
- inline static int scratch_size_max(int level) { - return ( - level == 0 ? 1024 * 40 : // 48kB is the max for CUDA, but we need some - // for team_member.reduce etc. - 20 * 1024 * - 1024); // arbitrarily setting this to 20MB, for a Volta V100 - // that would give us about 3.2GB for 2 teams per SM - } - inline bool impl_auto_team_size() const { return m_tune_team_size; } - inline bool impl_auto_vector_length() const { return m_tune_vector_length; } - inline void impl_set_team_size(const size_t size) { m_team_size = size; } - inline void impl_set_vector_length(const size_t length) { - m_tune_vector_length = length; - } - inline int impl_vector_length() const { return m_vector_length; } - inline int team_size() const { return m_team_size; } - inline int league_size() const { return m_league_size; } - inline size_t scratch_size(const int& level, int team_size_ = -1) const { - if (team_size_ < 0) team_size_ = m_team_size; - return m_team_scratch_size[level] + - team_size_ * m_thread_scratch_size[level]; - } - - inline Kokkos::Experimental::OpenMPTarget space() const { - return Kokkos::Experimental::OpenMPTarget(); - } - - template - TeamPolicyInternal(const TeamPolicyInternal& p) - : m_league_size(p.m_league_size), - m_team_size(p.m_team_size), - m_vector_length(p.m_vector_length), - m_team_alloc(p.m_team_alloc), - m_team_iter(p.m_team_iter), - m_team_scratch_size(p.m_team_scratch_size), - m_thread_scratch_size(p.m_thread_scratch_size), - m_tune_team_size(p.m_tune_team_size), - m_tune_vector_length(p.m_tune_vector_length), - m_chunk_size(p.m_chunk_size) {} - - /** \brief Specify league size, request team size */ - TeamPolicyInternal(const typename traits::execution_space&, - int league_size_request, int team_size_request, - int vector_length_request = 1) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(false), - m_tune_vector_length(false), - m_chunk_size(0) { - init(league_size_request, team_size_request, vector_length_request); - } - - 
TeamPolicyInternal(const typename traits::execution_space&, - int league_size_request, - const Kokkos::AUTO_t& /* team_size_request */ - , - int vector_length_request = 1) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(true), - m_tune_vector_length(false), - m_chunk_size(0) { - init(league_size_request, default_team_size / vector_length_request, - vector_length_request); - } - - TeamPolicyInternal(const typename traits::execution_space&, - int league_size_request, - const Kokkos::AUTO_t& /* team_size_request */ - , - const Kokkos::AUTO_t& /* vector_length_request */) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(true), - m_tune_vector_length(true), - m_chunk_size(0) { - init(league_size_request, default_team_size, 1); - } - TeamPolicyInternal(const typename traits::execution_space&, - int league_size_request, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(false), - m_tune_vector_length(true), - m_chunk_size(0) { - init(league_size_request, team_size_request, 1); - } - - TeamPolicyInternal(int league_size_request, int team_size_request, - int vector_length_request = 1) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(false), - m_tune_vector_length(false), - m_chunk_size(0) { - init(league_size_request, team_size_request, vector_length_request); - } - - TeamPolicyInternal(int league_size_request, - const Kokkos::AUTO_t& /* team_size_request */ - , - int vector_length_request = 1) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(true), - m_tune_vector_length(false), - m_chunk_size(0) { - init(league_size_request, default_team_size / vector_length_request, - vector_length_request); - } - - TeamPolicyInternal(int league_size_request, - const Kokkos::AUTO_t& /* team_size_request */ - , - const Kokkos::AUTO_t& /* 
vector_length_request */) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(true), - m_tune_vector_length(true), - m_chunk_size(0) { - init(league_size_request, default_team_size, 1); - } - TeamPolicyInternal(int league_size_request, int team_size_request, - const Kokkos::AUTO_t& /* vector_length_request */) - : m_team_scratch_size{0, 0}, - m_thread_scratch_size{0, 0}, - m_tune_team_size(false), - m_tune_vector_length(true), - m_chunk_size(0) { - init(league_size_request, team_size_request, 1); - } - - // FIXME_OPENMPTARGET https://github.com/kokkos/kokkos/issues/8510 - TeamPolicyInternal(const PolicyUpdate, const TeamPolicyInternal& other, - typename traits::execution_space) - : TeamPolicyInternal(other) {} - - inline static size_t vector_length_max() { - return 32; /* TODO: this is bad. Need logic that is compiler and backend - aware */ - } - inline int team_alloc() const { return m_team_alloc; } - inline int team_iter() const { return m_team_iter; } - - inline int chunk_size() const { return m_chunk_size; } - - /** \brief set chunk_size to a discrete value*/ - inline TeamPolicyInternal& set_chunk_size( - typename traits::index_type chunk_size_) { - m_chunk_size = chunk_size_; - return *this; - } - - /** \brief set per team scratch size for a specific level of the scratch - * hierarchy */ - inline TeamPolicyInternal& set_scratch_size(const int& level, - const PerTeamValue& per_team) { - m_team_scratch_size[level] = per_team.value; - return *this; - } - - /** \brief set per thread scratch size for a specific level of the scratch - * hierarchy */ - inline TeamPolicyInternal& set_scratch_size( - const int& level, const PerThreadValue& per_thread) { - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - /** \brief set per thread and per team scratch size for a specific level of - * the scratch hierarchy */ - inline TeamPolicyInternal& set_scratch_size( - const int& level, const PerTeamValue& per_team, - const 
PerThreadValue& per_thread) { - m_team_scratch_size[level] = per_team.value; - m_thread_scratch_size[level] = per_thread.value; - return *this; - } - - private: - /** \brief finalize chunk_size if it was set to AUTO*/ - inline void set_auto_chunk_size() { - int concurrency = 2048 * 128; - - if (concurrency == 0) concurrency = 1; - - if (m_chunk_size > 0 && - !Kokkos::has_single_bit(static_cast(m_chunk_size))) { - Kokkos::abort("TeamPolicy blocking granularity must be power of two"); - } - - int new_chunk_size = 1; - while (new_chunk_size * 100 * concurrency < m_league_size) - new_chunk_size *= 2; - if (new_chunk_size < 128) { - new_chunk_size = 1; - while ((new_chunk_size * 40 * concurrency < m_league_size) && - (new_chunk_size < 128)) - new_chunk_size *= 2; - } - m_chunk_size = new_chunk_size; - } - - public: - using member_type = Impl::OpenMPTargetExecTeamMember; -}; - -} // namespace Impl -} // namespace Kokkos - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember> -TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - std::common_type_t, Impl::OpenMPTargetExecTeamMember> -TeamThreadRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType1& begin, const iType2& end) { - using iType = std::common_type_t; - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(begin), - iType(end)); -} - -template -KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember> -ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType& count) { - return Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, count); -} - 
-template -KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< - std::common_type_t, Impl::OpenMPTargetExecTeamMember> -ThreadVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType1& arg_begin, const iType2& arg_end) { - using iType = std::common_type_t; - return Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), - iType(arg_end)); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember> -TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType& count) { - return Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamVectorRangeBoundariesStruct< - std::common_type_t, Impl::OpenMPTargetExecTeamMember> -TeamVectorRange(const Impl::OpenMPTargetExecTeamMember& thread, - const iType1& arg_begin, const iType2& arg_end) { - using iType = std::common_type_t; - return Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>(thread, iType(arg_begin), - iType(arg_end)); -} - -KOKKOS_INLINE_FUNCTION -Impl::ThreadSingleStruct PerTeam( - const Impl::OpenMPTargetExecTeamMember& thread) { - return Impl::ThreadSingleStruct(thread); -} - -KOKKOS_INLINE_FUNCTION -Impl::VectorSingleStruct PerThread( - const Impl::OpenMPTargetExecTeamMember& thread) { - return Impl::VectorSingleStruct(thread); -} -} // namespace Kokkos - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION void single( - const Impl::VectorSingleStruct& - /*single_struct*/, - const FunctorType& lambda) { - lambda(); -} - -template -KOKKOS_INLINE_FUNCTION void single( - const Impl::ThreadSingleStruct& - single_struct, - const FunctorType& lambda) { - if (single_struct.team_member.team_rank() == 0) lambda(); -} - -template -KOKKOS_INLINE_FUNCTION void single( - const Impl::VectorSingleStruct& - /*single_struct*/, - const 
FunctorType& lambda, ValueType& val) { - lambda(val); -} - -template -KOKKOS_INLINE_FUNCTION void single( - const Impl::ThreadSingleStruct& - single_struct, - const FunctorType& lambda, ValueType& val) { - if (single_struct.team_member.team_rank() == 0) { - lambda(val); - } - single_struct.team_member.team_broadcast(val, 0); -} -} // namespace Kokkos - -namespace Kokkos { -namespace Impl { - -template -struct TeamThreadRangeBoundariesStruct { - using index_type = iType; - const iType start; - const iType end; - const OpenMPTargetExecTeamMember& member; - - TeamThreadRangeBoundariesStruct(const OpenMPTargetExecTeamMember& arg_thread, - iType arg_count) - : start(0), end(arg_count), member(arg_thread) {} - TeamThreadRangeBoundariesStruct(const OpenMPTargetExecTeamMember& arg_thread, - iType arg_begin, iType arg_end) - : start(arg_begin), end(arg_end), member(arg_thread) {} -}; - -template -struct ThreadVectorRangeBoundariesStruct { - using index_type = iType; - const index_type start; - const index_type end; - const OpenMPTargetExecTeamMember& member; - - ThreadVectorRangeBoundariesStruct( - const OpenMPTargetExecTeamMember& arg_thread, index_type arg_count) - : start(0), end(arg_count), member(arg_thread) {} - ThreadVectorRangeBoundariesStruct( - const OpenMPTargetExecTeamMember& arg_thread, index_type arg_begin, - index_type arg_end) - : start(arg_begin), end(arg_end), member(arg_thread) {} -}; - -template -struct TeamVectorRangeBoundariesStruct { - using index_type = iType; - const index_type start; - const index_type end; - const OpenMPTargetExecTeamMember& member; - - TeamVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& arg_thread, - index_type arg_count) - : start(0), end(arg_count), member(arg_thread) {} - TeamVectorRangeBoundariesStruct(const OpenMPTargetExecTeamMember& arg_thread, - index_type arg_begin, index_type arg_end) - : start(arg_begin), end(arg_end), member(arg_thread) {} -}; - -} // namespace Impl - -} // namespace Kokkos - -#endif /* 
KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp deleted file mode 100644 index 29dc70b06ff..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp +++ /dev/null @@ -1,325 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP - -#include -#include -#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" -#include "Kokkos_OpenMPTarget_Instance.hpp" -#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - using FunctorAdapter = - Kokkos::Experimental::Impl::FunctorAdapter; - const FunctorAdapter m_functor; - - const Policy m_policy; - - public: - inline void execute() const { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - - Policy policy = m_policy; - - static_assert(1 < Policy::rank && Policy::rank < 7); - static_assert(Policy::inner_direction == Iterate::Left || - Policy::inner_direction == Iterate::Right); - - execute_tile( - m_functor, policy, - std::integral_constant()); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - - 
const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) - for (auto i1 = begin_1; i1 < end_1; ++i1) { - functor(i0, i1); - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - functor(i0, i1, i2); - } - } - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - functor(i0, i1, i2, i3); - } - } - } - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = 
policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - functor(i0, i1, i2, i3, i4); - } - } - } - } - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - const Index begin_5 = policy.m_lower[5]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - const Index end_5 = policy.m_upper[5]; - -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - { - functor(i0, i1, i2, i3, i4, i5); - } - } - } - } - } - } - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - - const 
Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) - for (auto i1 = begin_1; i1 < end_1; ++i1) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1); - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2); - } - } - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, i3); - } - } - } - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - 
const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, i3, i4); - } - } - } - } - } - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - const Index begin_5 = policy.m_lower[5]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - const Index end_5 = policy.m_upper[5]; - -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) - for (auto i5 = begin_5; i5 < end_5; ++i5) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - { - functor(i0, i1, i2, i3, i4, i5); - } - } - } - } - } - } - } - } - - inline ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} - // TODO DZP: based on a conversation with Christian, we're using 256 as a - // heuristic here. 
We need something better once we can query these kinds of - // properties - template - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -#endif /* KOKKOS_OPENMPTARGET_PARALLELFOR_MDRANGE_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp deleted file mode 100644 index a3ac828992f..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_FOR_RANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_FOR_RANGE_HPP - -#include -#include -#include -#include "Kokkos_OpenMPTarget_Instance.hpp" -#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" - -namespace Kokkos { -namespace Impl { - -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::RangePolicy; - using Member = typename Policy::member_type; - - Kokkos::Experimental::Impl::FunctorAdapter m_functor; - const Policy m_policy; - - public: - void execute() const { execute_impl(); } - - void execute_impl() const { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto begin = m_policy.begin(); - const auto end = m_policy.end(); - - if (end <= begin) return; - - auto const a_functor(m_functor); - -#pragma omp target teams distribute parallel for map(to : a_functor) - for (auto i = begin; i < end; ++i) { - a_functor(i); - } - } - - ParallelFor(const FunctorType& arg_functor, Policy arg_policy) - : m_functor(arg_functor), m_policy(arg_policy) {} -}; - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git 
a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp deleted file mode 100644 index e68064a29c0..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ /dev/null @@ -1,158 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_FOR_TEAM_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_FOR_TEAM_HPP - -#include -#include -#include -#include -#include -#include - -namespace Kokkos { - -/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda) { -#pragma omp for nowait schedule(static, 1) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); -} - -/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling thread. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda) { -#pragma omp simd - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); -} - -/** \brief Intra-team vector parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling team. 
- */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda) { -#pragma omp for simd nowait schedule(static, 1) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i); -} - -namespace Impl { - -template -class ParallelFor, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = - Kokkos::Impl::TeamPolicyInternal; - using Member = typename Policy::member_type; - - Kokkos::Experimental::Impl::FunctorAdapter m_functor; - - const Policy m_policy; - const size_t m_shmem_size; - - public: - void execute() const { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl(); - } - - private: - void execute_impl() const { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const auto league_size = m_policy.league_size(); - const auto team_size = m_policy.team_size(); - const auto vector_length = m_policy.impl_vector_length(); - - const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); - const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - m_policy.space().impl_internal_space_instance()->resize_scratch( - team_size, shmem_size_L0, shmem_size_L1, league_size); - - void* scratch_ptr = - m_policy.space().impl_internal_space_instance()->get_scratch_ptr(); - auto const a_functor(m_functor); - - // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the - // scratch implementation does not work in the Release or RelWithDebugInfo - // mode but works in the Debug mode. - - // Maximum active teams possible. - int max_active_teams = omp_get_max_teams(); - - // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the - // omp_set_num_teams in the resize_scratch routine, the call is not - // respected. 
Hence we need to use `num_teams` routine to restrict the - // number of teams generated to max_active_teams. Hopefully we can avoid the - // num_teams clause in the future and let compiler pick the right number of - // teams. This is not true for Intel architectures. - - // If the league size is <=0, do not launch the kernel. - if (max_active_teams <= 0) return; - - // Performing our own scheduling of teams to avoid separation of code - // between teams-distribute and parallel. Gave a 2x performance boost in - // test cases with the clang compiler. atomic_compare_exchange can be - // avoided since the standard guarantees that the number of teams specified - // in the `num_teams` clause is always less than or equal to the maximum - // concurrently running teams. - KOKKOS_IMPL_OMPTARGET_PRAGMA( - teams thread_limit(team_size) firstprivate(a_functor) - num_teams(max_active_teams) is_device_ptr(scratch_ptr) - KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) -#pragma omp parallel - { - if (omp_get_num_teams() > max_active_teams) - Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); - - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Iterate through the number of teams until league_size and assign the - // league_id accordingly - // Guarantee that the compilers respect the `num_teams` clause - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename Policy::member_type team(league_id, league_size, team_size, - vector_length, scratch_ptr, blockIdx, - shmem_size_L0, shmem_size_L1); - a_functor(team); - } - } - } - - public: - ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) - : m_functor(arg_functor), - m_policy(arg_policy), - m_shmem_size(m_policy.scratch_size(0) + m_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor, m_policy.team_size())) {} -}; - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git 
a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp deleted file mode 100644 index ceb9f74d1c3..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp +++ /dev/null @@ -1,606 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP - -#include -#include -#include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -class ParallelReduce, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::MDRangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - static constexpr bool UseReducer = - !std::is_same_v; - - const pointer_type m_result_ptr; - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - - using ParReduceCopy = ParallelReduceCopy; - - bool m_result_ptr_on_device; - - using FunctorAdapter = - Kokkos::Experimental::Impl::FunctorAdapter; - - public: - inline void execute() const { - // Only let one ParallelReduce instance at a time use the scratch memory. 
- std::scoped_lock scratch_memory_lock( - m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); - - auto const functor = FunctorAdapter(m_functor_reducer.get_functor()); - execute_tile( - functor, m_policy, m_result_ptr, - std::integral_constant()); - } - - template - inline ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - Policy arg_policy, const ViewType& arg_result_view) - : m_result_ptr(arg_result_view.data()), - m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr_on_device( - MemorySpaceAccess::accessible) {} - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ - reduction(custom : result) - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, result); - } - } - } else { -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ - reduction(+ : result) - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, result); - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom \ -:ValueType : OpenMPTargetReducerWrapper< \ - typename ReducerType::functor_type>::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper< \ - typename ReducerType::functor_type>::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ - reduction(custom : result) - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, result); - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ - reduction(+ : result) - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, result); - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[3]; - const Index begin_3 = policy.m_lower[2]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ - reduction(custom : result) - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, i3, result); - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ - reduction(+ : result) - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, i3, result); - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ - reduction(custom : result) - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, i3, i4, result); - } - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ - reduction(+ : result) - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, i3, i4, result); - } - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateLeft) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - const Index begin_5 = policy.m_lower[5]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - const Index end_5 = policy.m_upper[5]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ - reduction(custom : result) - for (auto i5 = begin_5; i5 < end_5; ++i5) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, i3, i4, i5, result); - } - } - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ - reduction(+ : result) - for (auto i5 = begin_5; i5 < end_5; ++i5) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i0 = begin_0; i0 < end_0; ++i0) { - functor(i0, i1, i2, i3, i4, i5, result); - } - } - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ - reduction(custom : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - functor(i0, i1, result); - } - } - } else { -#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ - reduction(+ : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - functor(i0, i1, result); - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom \ -:ValueType : OpenMPTargetReducerWrapper< \ - typename ReducerType::functor_type>::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper< \ - typename ReducerType::functor_type>::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ - reduction(custom : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - functor(i0, i1, i2, result); - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ - reduction(+ : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - functor(i0, i1, i2, result); - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[3]; - const Index begin_3 = policy.m_lower[2]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ - reduction(custom : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - functor(i0, i1, i2, i3, result); - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ - reduction(+ : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - functor(i0, i1, i2, i3, result); - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ - reduction(custom : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - functor(i0, i1, i2, i3, i4, result); - } - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ - reduction(+ : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - functor(i0, i1, i2, i3, i4, result); - } - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - inline std::enable_if_t execute_tile( - const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, - OpenMPTargetIterateRight) const { - const Index begin_0 = policy.m_lower[0]; - const Index begin_1 = policy.m_lower[1]; - const Index begin_2 = policy.m_lower[2]; - const Index begin_3 = policy.m_lower[3]; - const Index begin_4 = policy.m_lower[4]; - const Index begin_5 = policy.m_lower[5]; - - const Index end_0 = policy.m_upper[0]; - const Index end_1 = policy.m_upper[1]; - const Index end_2 = policy.m_upper[2]; - const Index end_3 = policy.m_upper[3]; - const Index end_4 = policy.m_upper[4]; - const Index end_5 = policy.m_upper[5]; - - ValueType result = ValueType(); - - // FIXME_OPENMPTARGET: Unable to separate directives and their companion - // loops which leads to code duplication for different reduction types. 
- if constexpr (UseReducer) { -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ - reduction(custom : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - functor(i0, i1, i2, i3, i4, i5, result); - } - } - } - } - } - } - } else { -#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ - reduction(+ : result) - for (auto i0 = begin_0; i0 < end_0; ++i0) { - for (auto i1 = begin_1; i1 < end_1; ++i1) { - for (auto i2 = begin_2; i2 < end_2; ++i2) { - for (auto i3 = begin_3; i3 < end_3; ++i3) { - for (auto i4 = begin_4; i4 < end_4; ++i4) { - for (auto i5 = begin_5; i5 < end_5; ++i5) { - functor(i0, i1, i2, i3, i4, i5, result); - } - } - } - } - } - } - } - - ParReduceCopy::memcpy_result(ptr, &result, sizeof(ValueType), - m_result_ptr_on_device); - } - - template - static int max_tile_size_product(const Policy&, const Functor&) { - return 256; - } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -#endif /* KOKKOS_OPENMPTARGET_PARALLELREDUCE_MDRANGE_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp deleted file mode 100644 index eef20a71fbb..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ /dev/null @@ -1,106 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// 
SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_RANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_RANGE_HPP - -#include -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class ParallelReduce, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = Kokkos::RangePolicy; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - - static constexpr bool FunctorHasJoin = Impl::FunctorAnalysis< - Impl::FunctorPatternInterface::REDUCE, Policy, FunctorType, - typename ReducerType::value_type>::Reducer::has_join_member_function(); - static constexpr bool UseReducer = - !std::is_same_v; - static constexpr bool IsArray = std::is_pointer_v; - - using ParReduceSpecialize = - ParallelReduceSpecialize; - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - bool m_result_ptr_on_device; - const int m_result_ptr_num_elems; - - public: - void execute() const { - // Only let one ParallelReduce instance at a time use the scratch memory. - std::scoped_lock scratch_memory_lock( - m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); - - auto const functor = - Kokkos::Experimental::Impl::FunctorAdapter( - m_functor_reducer.get_functor()); - - if constexpr (FunctorHasJoin) { - // Enter this loop if the Functor has a init-join. - ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (UseReducer) { - // Enter this loop if the Functor is a reducer type. 
- ParReduceSpecialize::execute_reducer(functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (IsArray) { - // Enter this loop if the reduction is on an array and the routine is - // templated over the size of the array. - if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array<2>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array<4>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array<8>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array<16>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array<32>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else { - Kokkos::abort("array reduction length must be <= 32"); - } - } else { - // This loop handles the basic scalar reduction. 
- ParReduceSpecialize::template execute_array<1>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } - } - - template - ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - const Policy& arg_policy, const ViewType& arg_result_view) - : m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result_view.data()), - m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result_view.size()) {} -}; - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp deleted file mode 100644 index 517b4f13e78..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ /dev/null @@ -1,407 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLELREDUCE_TEAM_HPP -#define KOKKOS_OPENMPTARGET_PARALLELREDUCE_TEAM_HPP - -#include -#include -#include -#include -#include - -namespace Kokkos { - -/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team - * and a summation of val is performed and put into result. - */ - -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ValueType& result) { - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. 
- static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - ValueType* TeamThread_scratch = - static_cast(loop_boundaries.member.impl_reduce_scratch()); - -#pragma omp barrier - TeamThread_scratch[0] = ValueType(); -#pragma omp barrier - - if constexpr (std::is_arithmetic::value) { -#pragma omp for reduction(+ : TeamThread_scratch[ : 1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - TeamThread_scratch[0] += tmp; - } - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) - -#pragma omp for reduction(custom : TeamThread_scratch[ : 1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - TeamThread_scratch[0] += tmp; - } - } - - result = TeamThread_scratch[0]; -} - -// For some reason the actual version we wanted to write doesn't work -// and crashes. We should try this with every new compiler -// This is the variant we actually wanted to write -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ReducerType result) { - using ValueType = typename ReducerType::value_type; - -#pragma omp declare reduction(custominner \ -:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ - omp_in)) \ - initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) - - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. 
- static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - ValueType* TeamThread_scratch = - static_cast(loop_boundaries.member.impl_reduce_scratch()); - -#pragma omp barrier - Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]); -#pragma omp barrier - -#pragma omp for reduction(custominner : TeamThread_scratch[ : 1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, TeamThread_scratch[0]); - } - result.reference() = TeamThread_scratch[0]; -} - -/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling thread - * and a reduction of val is performed using JoinType(ValueType& val, const - * ValueType& update) and put into init_result. The input value of init_result - * is used as initializer for temporary variables of ValueType. Therefore the - * input value should be the neutral element with respect to the join operation - * (e.g. '0 for +-' or '1 for *'). - */ -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& init_result) { - ValueType* TeamThread_scratch = - static_cast(loop_boundaries.member.impl_reduce_scratch()); - - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. - static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - // FIXME_OPENMPTARGET: Still need to figure out how to get value_count here. 
- const int value_count = 1; - -#pragma omp barrier - TeamThread_scratch[0] = init_result; -#pragma omp barrier - -#pragma omp for - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, TeamThread_scratch[omp_get_num_threads() * value_count]); - } - - // Reduce all partial results within a team. - const int team_size = omp_get_num_threads(); - int tree_neighbor_offset = 1; - do { -#pragma omp for - for (int i = 0; i < team_size - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - const int neighbor = i + tree_neighbor_offset; - join(lambda, &TeamThread_scratch[i * value_count], - &TeamThread_scratch[neighbor * value_count]); - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < team_size); - init_result = TeamThread_scratch[0]; -} - -/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling thread - * and a summation of val is performed and put into result. 
- */ -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ValueType& result) { - ValueType vector_reduce = ValueType(); - - if constexpr (std::is_arithmetic::value) { -#pragma omp simd reduction(+ : vector_reduce) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - vector_reduce += tmp; - } - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) - -#pragma omp simd reduction(custom : vector_reduce) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, vector_reduce); - } - } - - result = vector_reduce; -} - -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ReducerType const& result) { - using ValueType = typename ReducerType::value_type; - -#pragma omp declare reduction(custom \ -:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ - omp_in)) \ - initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) - - ValueType vector_reduce; - Impl::OpenMPTargetReducerWrapper::init(vector_reduce); - -#pragma omp simd reduction(custom : vector_reduce) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, vector_reduce); - } - - result.reference() = vector_reduce; -} - -/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling thread - * and a reduction of val is performed using JoinType(ValueType& val, const - * ValueType& update) and put into init_result. The input value of init_result - * is used as initializer for temporary variables of ValueType. 
Therefore the - * input value should be the neutral element with respect to the join operation - * (e.g. '0 for +-' or '1 for *'). - */ -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& init_result) { - ValueType result = init_result; - - // FIXME_OPENMPTARGET think about omp simd - // join does not work with omp reduction clause - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - join(result, tmp); - } - - init_result = result; -} - -/** \brief Intra-team vector parallel_reduce. Executes lambda(iType i, - * ValueType & val) for each i=0..N-1. - * - * The range i=0..N-1 is mapped to all vector lanes of the the calling team - * and a summation of val is performed and put into result. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ValueType& result) { - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. 
- static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - - ValueType* TeamVector_scratch = - static_cast(loop_boundaries.member.impl_reduce_scratch()); - -#pragma omp barrier - TeamVector_scratch[0] = ValueType(); -#pragma omp barrier - - if constexpr (std::is_arithmetic::value) { -#pragma omp for simd reduction(+ : TeamVector_scratch[ : 1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - TeamVector_scratch[0] += tmp; - } - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) - -#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - ValueType tmp = ValueType(); - lambda(i, tmp); - TeamVector_scratch[0] += tmp; - } - } - - result = TeamVector_scratch[0]; -} - -template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value> -parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const Lambda& lambda, ReducerType const& result) { - using ValueType = typename ReducerType::value_type; - - // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of - // elements in the array <= 32. For reduction we allocate, 16 bytes per - // element in the scratch space, hence, 16*32 = 512. 
- static_assert(sizeof(ValueType) <= - Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); - -#pragma omp declare reduction(custom \ -:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ - omp_in)) \ - initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) - - ValueType* TeamVector_scratch = - static_cast(loop_boundaries.member.impl_reduce_scratch()); - -#pragma omp barrier - Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]); -#pragma omp barrier - -#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) - for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { - lambda(i, TeamVector_scratch[0]); - } - - result.reference() = TeamVector_scratch[0]; -} - -namespace Impl { - -template -class ParallelReduce, - Kokkos::Experimental::OpenMPTarget> { - private: - using Policy = - Kokkos::Impl::TeamPolicyInternal; - using FunctorType = typename CombinedFunctorReducerType::functor_type; - using ReducerType = typename CombinedFunctorReducerType::reducer_type; - - using Member = typename Policy::member_type; - - using pointer_type = typename ReducerType::pointer_type; - using reference_type = typename ReducerType::reference_type; - using value_type = typename ReducerType::value_type; - - bool m_result_ptr_on_device; - const int m_result_ptr_num_elems; - - static constexpr bool FunctorHasJoin = Impl::FunctorAnalysis< - Impl::FunctorPatternInterface::REDUCE, Policy, FunctorType, - typename ReducerType::value_type>::Reducer::has_join_member_function(); - static constexpr bool UseReducer = - !std::is_same_v; - static constexpr bool IsArray = std::is_pointer_v; - - using ParReduceSpecialize = - ParallelReduceSpecialize; - - const CombinedFunctorReducerType m_functor_reducer; - const Policy m_policy; - const pointer_type m_result_ptr; - const size_t m_shmem_size; - - public: - void execute() const { - // Only let one ParallelReduce instance at a time use the scratch memory. 
- std::scoped_lock scratch_memory_lock( - m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); - auto const functor = - Kokkos::Experimental::Impl::FunctorAdapter( - m_functor_reducer.get_functor()); - if constexpr (FunctorHasJoin) { - ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (UseReducer) { - ParReduceSpecialize::execute_reducer(functor, m_policy, m_result_ptr, - m_result_ptr_on_device); - } else if constexpr (IsArray) { - if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array<2>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array<4>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array<8>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array<16>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array<32>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } else { - Kokkos::abort("array reduction length must be <= 32"); - } - } else { - ParReduceSpecialize::template execute_array<1>( - functor, m_policy, m_result_ptr, m_result_ptr_on_device); - } - } - - template - ParallelReduce(const CombinedFunctorReducerType& arg_functor_reducer, - const Policy& arg_policy, const ViewType& arg_result) - : m_result_ptr_on_device( - MemorySpaceAccess::accessible), - m_result_ptr_num_elems(arg_result.size()), - m_functor_reducer(arg_functor_reducer), - m_policy(arg_policy), - m_result_ptr(arg_result.data()), - m_shmem_size( - arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + - FunctorTeamShmemSize::value( - arg_functor_reducer.get_functor(), 
arg_policy.team_size())) {} -}; - -} // namespace Impl - -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp deleted file mode 100644 index acdac771d1f..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ /dev/null @@ -1,243 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLELSCAN_RANGE_HPP -#define KOKKOS_OPENMPTARGET_PARALLELSCAN_RANGE_HPP - -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class ParallelScan, - Kokkos::Experimental::OpenMPTarget> { - protected: - using Policy = Kokkos::RangePolicy; - - using Member = typename Policy::member_type; - using idx_type = typename Policy::index_type; - - using Analysis = Impl::FunctorAnalysis; - - using value_type = typename Analysis::value_type; - using pointer_type = typename Analysis::pointer_type; - using reference_type = typename Analysis::reference_type; - - const CombinedFunctorReducer - m_functor_reducer; - const Policy m_policy; - - value_type* m_result_ptr; - const bool m_result_ptr_device_accessible; - - using FunctorAdapter = - Kokkos::Experimental::Impl::FunctorAdapter; - - public: - void impl_execute( - Kokkos::View - element_values, - Kokkos::View - chunk_values, - Kokkos::View count) - const { - const idx_type begin = m_policy.begin(); - const idx_type end = m_policy.end(); - const idx_type N = end - begin; - const idx_type chunk_size = 128; - const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; - idx_type nteams = n_chunks > 512 ? 
512 : n_chunks; - idx_type team_size = 128; - - auto a_functor_reducer = m_functor_reducer; - auto a_functor = FunctorAdapter(m_functor_reducer.get_functor()); - -#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ - num_teams(nteams) - for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - const typename Analysis::Reducer& reducer = - a_functor_reducer.get_reducer(); -#pragma omp parallel num_threads(team_size) - { - const idx_type local_offset = team_id * chunk_size + begin; - -#pragma omp for - for (idx_type i = 0; i < chunk_size; ++i) { - const idx_type idx = local_offset + i; - value_type val; - reducer.init(&val); - if ((idx >= begin) && (idx < end)) a_functor(idx, val, false); - - element_values(team_id, i) = val; - } -#pragma omp barrier - if (omp_get_thread_num() == 0) { - value_type sum; - reducer.init(&sum); - for (idx_type i = 0; i < chunk_size; ++i) { - reducer.join(&sum, &element_values(team_id, i)); - element_values(team_id, i) = sum; - } - chunk_values(team_id) = sum; - } -#pragma omp barrier - if (omp_get_thread_num() == 0) { - if (Kokkos::atomic_fetch_add(&count(), 1) == n_chunks - 1) { - value_type sum; - reducer.init(&sum); - for (idx_type i = 0; i < n_chunks; ++i) { - reducer.join(&sum, &chunk_values(i)); - chunk_values(i) = sum; - } - } - } - } - } - -#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ - num_teams(nteams) thread_limit(team_size) - for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { - const typename Analysis::Reducer& reducer = - a_functor_reducer.get_reducer(); -#pragma omp parallel num_threads(team_size) - { - const idx_type local_offset = team_id * chunk_size + begin; - value_type offset_value; - if (team_id > 0) - offset_value = chunk_values(team_id - 1); - else - reducer.init(&offset_value); - -#pragma omp for - for (idx_type i = 0; i < chunk_size; ++i) { - const idx_type idx = local_offset + i; - value_type local_offset_value; - if (i > 0) { - 
local_offset_value = element_values(team_id, i - 1); - // FIXME_OPENMPTARGET We seem to access memory illegaly on AMD GPUs -#if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \ - !defined(KOKKOS_ARCH_AMD_GFX1100) && !defined(KOKKOS_ARCH_AMD_GFX1103) && \ - !defined(KOKKOS_ARCH_AMD_GFX1201) - if constexpr (Analysis::Reducer::has_join_member_function()) { - a_functor.get_functor().join(local_offset_value, offset_value); - } else - local_offset_value += offset_value; -#else - reducer.join(&local_offset_value, &offset_value); -#endif - } else - local_offset_value = offset_value; - if (idx < end) a_functor(idx, local_offset_value, true); - - if (idx == end - 1 && m_result_ptr_device_accessible) - *m_result_ptr = local_offset_value; - } - } - } - } - - void execute() const { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const idx_type N = m_policy.end() - m_policy.begin(); - const idx_type chunk_size = 128; - const idx_type n_chunks = (N + chunk_size - 1) / chunk_size; - - // Only let one ParallelReduce instance at a time use the scratch memory. 
- std::scoped_lock scratch_memory_lock( - m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); - - // This could be scratch memory per team - Kokkos::View - element_values("element_values", n_chunks, chunk_size); - Kokkos::View - chunk_values("chunk_values", n_chunks); - Kokkos::View count( - "Count"); - - impl_execute(element_values, chunk_values, count); - } - - //---------------------------------------- - - ParallelScan(const FunctorType& arg_functor, const Policy& arg_policy, - pointer_type arg_result_ptr = nullptr, - bool arg_result_ptr_device_accessible = false) - : m_functor_reducer(arg_functor, typename Analysis::Reducer{arg_functor}), - m_policy(arg_policy), - m_result_ptr(arg_result_ptr), - m_result_ptr_device_accessible(arg_result_ptr_device_accessible) {} - - //---------------------------------------- -}; - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -template -class ParallelScanWithTotal, - ReturnType, Kokkos::Experimental::OpenMPTarget> - : public ParallelScan, - Kokkos::Experimental::OpenMPTarget> { - using base_t = ParallelScan, - Kokkos::Experimental::OpenMPTarget>; - using value_type = typename base_t::value_type; - - public: - void execute() const { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget parallel_for"); - const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); - const int chunk_size = 128; - const int64_t n_chunks = (N + chunk_size - 1) / chunk_size; - - if (N > 0) { - // Only let one ParallelReduce instance at a time use the scratch memory. 
- std::scoped_lock scratch_memory_lock( - base_t::m_policy.space() - .impl_internal_space_instance() - ->m_mutex_scratch_ptr); - - // This could be scratch memory per team - Kokkos::View - element_values("element_values", n_chunks, chunk_size); - Kokkos::View - chunk_values("chunk_values", n_chunks); - Kokkos::View count( - "Count"); - - base_t::impl_execute(element_values, chunk_values, count); - - if (!base_t::m_result_ptr_device_accessible) { - const int size = base_t::m_functor_reducer.get_reducer().value_size(); - DeepCopy( - base_t::m_policy.space(), base_t::m_result_ptr, - chunk_values.data() + (n_chunks - 1), size); - } - } else if (!base_t::m_result_ptr_device_accessible) { - base_t::m_functor_reducer.get_reducer().init(base_t::m_result_ptr); - } - } - - template - ParallelScanWithTotal(const FunctorType& arg_functor, - const typename base_t::Policy& arg_policy, - const ViewType& arg_result_view) - : base_t(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess::accessible) { - } -}; -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp deleted file mode 100644 index 37dd3130b24..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Team.hpp +++ /dev/null @@ -1,125 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLELSCAN_TEAM_HPP -#define KOKKOS_OPENMPTARGET_PARALLELSCAN_TEAM_HPP - -#include -#include -#include -#include - -namespace Kokkos { - -// This is largely the same code as in HIP and CUDA except for the member name -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds, - const FunctorType& lambda, ValueType& 
return_val) { - using Analysis = Impl::FunctorAnalysis, - FunctorType, void>; - using analysis_value_type = typename Analysis::value_type; - static_assert(std::is_same_v, - "Non-matching value types of functor and return type"); - - const auto start = loop_bounds.start; - const auto end = loop_bounds.end; - auto& member = loop_bounds.member; - const auto team_rank = member.team_rank(); - - const auto team_size = member.team_size(); - const auto nchunk = (end - start + team_size - 1) / team_size; - ValueType accum = {}; - // each team has to process one or - // more chunks of the prefix scan - for (iType i = 0; i < nchunk; ++i) { - auto ii = start + i * team_size + team_rank; - // local accumulation for this chunk - ValueType local_accum = {}; - // user updates value with prefix value - if (ii < loop_bounds.end) lambda(ii, local_accum, false); - // perform team scan - local_accum = member.team_scan(local_accum); - // add this blocks accum to total accumulation - auto val = accum + local_accum; - // user updates their data with total accumulation - if (ii < loop_bounds.end) lambda(ii, val, true); - // the last value needs to be propogated to next chunk - if (team_rank == team_size - 1) accum = val; - // broadcast last value to rest of the team - member.team_broadcast(accum, team_size - 1); - } - return_val = accum; -} - -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_bounds, - const FunctorType& lambda) { - using Analysis = Impl::FunctorAnalysis, - FunctorType, void>; - using value_type = typename Analysis::value_type; - value_type scan_val; - parallel_scan(loop_bounds, lambda, scan_val); -} -} // namespace Kokkos - -namespace Kokkos { - -/** \brief Intra-thread vector parallel exclusive prefix sum. Executes - * lambda(iType i, ValueType & val, bool final) for each i=0..N-1. 
- * - * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan - * operation is performed. Depending on the target execution space the operator - * might be called twice: once with final=false and once with final=true. When - * final==true val contains the prefix sum value. The contribution of this "i" - * needs to be added to val no matter whether final==true or not. In a serial - * execution (i.e. team_size==1) the operator is only called once with - * final==true. Scan_val will be set to the final sum value over all vector - * lanes. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const FunctorType& lambda, ValueType& return_val) { - using Analysis = Impl::FunctorAnalysis, - FunctorType, void>; - using analysis_value_type = typename Analysis::value_type; - static_assert(std::is_same_v, - "Non-matching value types of functor and return type"); - - ValueType scan_val = {}; - -#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP -#pragma ivdep -#endif - for (iType i = loop_boundaries.start; i < loop_boundaries.end; ++i) { - lambda(i, scan_val, true); - } - - return_val = scan_val; -} - -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::OpenMPTargetExecTeamMember>& loop_boundaries, - const FunctorType& lambda) { - using Analysis = Impl::FunctorAnalysis, - FunctorType, void>; - using value_type = typename Analysis::value_type; - - value_type scan_val = value_type(); - parallel_scan(loop_boundaries, lambda, scan_val); -} - -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp deleted file mode 100644 index 443648bad09..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel_Common.hpp +++ /dev/null @@ -1,609 +0,0 @@ -// 
SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_PARALLEL_COMMON_HPP -#define KOKKOS_OPENMPTARGET_PARALLEL_COMMON_HPP - -#include -#include -#include -#include -#include -#include - -namespace Kokkos { -namespace Impl { - -// This class has the memcpy routine that is commonly used by ParallelReduce -// over RangePolicy and TeamPolicy. -template -struct ParallelReduceCopy { - // Copy the result back to device if the view is on the device. - static void memcpy_result(PointerType dest, PointerType src, size_t size, - bool ptr_on_device) { - if (ptr_on_device) { - if (0 < size) { - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy(dest, src, size, 0, 0, - omp_get_default_device(), - omp_get_initial_device())); - } - - } else { - *dest = *src; - } - } -}; - -// template -template -struct ParallelReduceSpecialize { - inline static void execute(const FunctorType& /*f*/, const PolicyType& /*p*/, - PointerType /*result_ptr*/) { - constexpr int FunctorHasJoin = - Impl::FunctorAnalysis::Reducer::has_join_member_function(); - constexpr int UseReducerType = is_reducer_v; - - std::stringstream error_message; - error_message << "Error: Invalid Specialization " << FunctorHasJoin << ' ' - << UseReducerType << '\n'; - // FIXME_OPENMPTARGET - OpenMPTarget_abort(error_message.str().c_str()); - } -}; - -template -struct ParallelReduceSpecialize, - ReducerType, PointerType, ValueType> { - using PolicyType = Kokkos::RangePolicy; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; - using ReferenceType = typename Analysis::reference_type; - - using ParReduceCopy = ParallelReduceCopy; - - using FunctorAdapter = - Kokkos::Experimental::Impl::FunctorAdapter; - - static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - 
Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget RangePolicy " - "parallel_reduce:reducer"); - const auto begin = p.begin(); - const auto end = p.end(); - - ValueType result; - OpenMPTargetReducerWrapper::init(result); - - // Initialize and copy back the result even if it is a zero length - // reduction. - if (end <= begin) { - ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - return; - } - -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - -#pragma omp target teams distribute parallel for map(to : f) \ - reduction(custom : result) - for (auto i = begin; i < end; ++i) { - f(i, result); - } - - ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } - - template - static void execute_array(const FunctorAdapter& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget RangePolicy " - "parallel_reduce:array_reduction"); - const auto begin = p.begin(); - const auto end = p.end(); - - // Enter the loop if the reduction is on a scalar type. - if constexpr (NumReductions == 1) { - ValueType result = ValueType(); - - // Initialize and copy back the result even if it is a zero length - // reduction. - if (end <= begin) { - ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - return; - } - - // Case where reduction is on a native data type. 
- if constexpr (std::is_arithmetic::value) { -#pragma omp target teams distribute parallel for map(to : f) \ - reduction(+ : result) - for (auto i = begin; i < end; ++i) f(i, result); - } else { -#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to : f) \ - reduction(custom : result) - for (auto i = begin; i < end; ++i) f(i, result); - } - - ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } else { - ValueType result[NumReductions] = {}; - - // Initialize and copy back the result even if it is a zero length - // reduction. - if (end <= begin) { - ParReduceCopy::memcpy_result(result_ptr, result, - NumReductions * sizeof(ValueType), - ptr_on_device); - return; - } -#pragma omp target teams distribute parallel for map(to : f) \ - reduction(+ : result[ : NumReductions]) - for (auto i = begin; i < end; ++i) { - f(i, result); - } - - ParReduceCopy::memcpy_result( - result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device); - } - } - - static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, - PointerType ptr, const bool ptr_on_device) { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget RangePolicy " - "parallel_reduce:init_join"); - const auto begin = p.begin(); - const auto end = p.end(); - - using FunctorAnalysis = - Impl::FunctorAnalysis; - - // Initialize the result pointer. - - const auto size = end - begin; - - // FIXME_OPENMPTARGET: The team size and concurrency are currently - // based on NVIDIA-V100 and should be modifid to be based on the - // architecture in the future. - const int max_team_threads = 32; - const int max_teams = p.space().concurrency() / max_team_threads; - // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f.get_functor()); - - // Allocate scratch per active thread. 
Achieved by setting the first - // parameter of `resize_scratch=1`. - p.space().impl_internal_space_instance()->resize_scratch( - 1, 0, value_count * sizeof(ValueType), - std::numeric_limits::max()); - ValueType* scratch_ptr = static_cast( - p.space().impl_internal_space_instance()->get_scratch_ptr()); - - typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); - - if (end <= begin) { -#pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) - { - // If there is no work to be done, copy back the initialized values and - // exit. - final_reducer.init(scratch_ptr); - final_reducer.final(scratch_ptr); - } - if (0 < value_count) { - if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - } - - return; - } - -#pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to : final_reducer) is_device_ptr(scratch_ptr) - { -#pragma omp parallel - { - const int team_num = omp_get_team_num(); - const int num_teams = omp_get_num_teams(); - const auto chunk_size = size / num_teams; - const auto team_begin = begin + team_num * chunk_size; - const auto team_end = - (team_num == num_teams - 1) ? end : (team_begin + chunk_size); - ValueType* team_scratch = - scratch_ptr + team_num * max_team_threads * value_count; - ReferenceType result = final_reducer.init( - &team_scratch[omp_get_thread_num() * value_count]); - - // Accumulate partial results in thread specific storage. -#pragma omp for simd - for (auto i = team_begin; i < team_end; ++i) { - f(i, result); - } - - // Reduce all paritial results within a team. 
- const int team_size = max_team_threads; - int tree_neighbor_offset = 1; - do { -#pragma omp for simd - for (int i = 0; i < team_size - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - const int neighbor = i + tree_neighbor_offset; - final_reducer.join(&team_scratch[i * value_count], - &team_scratch[neighbor * value_count]); - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < team_size); - } // end parallel - } // end target - - int tree_neighbor_offset = 1; - do { -#pragma omp target teams distribute parallel for simd map(to : f) \ - is_device_ptr(scratch_ptr) - for (int i = 0; i < max_teams - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - ValueType* team_scratch = scratch_ptr; - const int team_offset = max_team_threads * value_count; - final_reducer.join( - &team_scratch[i * team_offset], - &team_scratch[(i + tree_neighbor_offset) * team_offset]); - - // If `final` is provided by the functor. - // Do the final only once at the end. - if (tree_neighbor_offset * 2 >= max_teams && omp_get_team_num() == 0 && - omp_get_thread_num() == 0) { - final_reducer.final(scratch_ptr); - } - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < max_teams); - - // If the result view is on the host, copy back the values via memcpy. 
- if (0 < value_count) { - if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - } - } -}; - -template -struct ParallelReduceSpecialize, - ReducerType, PointerType, ValueType> { - using PolicyType = TeamPolicyInternal; - using ReducerTypeFwd = - std::conditional_t::value, - FunctorType, ReducerType>; - using Analysis = Impl::FunctorAnalysis; - - using ReferenceType = typename Analysis::reference_type; - - using ParReduceCopy = ParallelReduceCopy; - - using FunctorAdapter = - Kokkos::Experimental::Impl::FunctorAdapter; - - static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget TeamPolicy " - "parallel_reduce:reducer"); - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - p.space().impl_internal_space_instance()->resize_scratch( - PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, - league_size); - void* scratch_ptr = - p.space().impl_internal_space_instance()->get_scratch_ptr(); - - ValueType result = ValueType(); - - // Maximum active teams possible. - int max_active_teams = omp_get_max_teams(); - - // If the league size is <=0, do not launch the kernel. 
- if (max_active_teams <= 0) return; - -#pragma omp declare reduction(custom \ -:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper::init(omp_priv)) - - KOKKOS_IMPL_OMPTARGET_PRAGMA( - teams num_teams(max_active_teams) thread_limit(team_size) - firstprivate(f) is_device_ptr(scratch_ptr) reduction(custom - : result) - KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) -#pragma omp parallel reduction(custom : result) - { - if (omp_get_num_teams() > max_active_teams) - Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); - - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - f(team, result); - } - } - - // Copy results back to device if `parallel_reduce` is on a device view. - ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } - - template - static void execute_array(const FunctorAdapter& f, const PolicyType& p, - PointerType result_ptr, bool ptr_on_device) { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget TeamPolicy " - "parallel_reduce:array_reduction"); - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - p.space().impl_internal_space_instance()->resize_scratch( - PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, - league_size); - void* scratch_ptr = - p.space().impl_internal_space_instance()->get_scratch_ptr(); - - // Maximum active teams possible. 
- int max_active_teams = omp_get_max_teams(); - - // If the league size is <=0, do not launch the kernel. - if (max_active_teams <= 0) return; - - // Case where the number of reduction items is 1. - if constexpr (NumReductions == 1) { - ValueType result = ValueType(); - - // Case where reduction is on a native data type. - if constexpr (std::is_arithmetic::value) { - // Use scratch memory extensions to request dynamic shared memory for - // the right compiler/architecture combination. - KOKKOS_IMPL_OMPTARGET_PRAGMA(teams num_teams(max_active_teams) thread_limit(team_size) map(to: f) \ - is_device_ptr(scratch_ptr) reduction(+: result) \ - KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) -#pragma omp parallel reduction(+ : result) - { - if (omp_get_num_teams() > max_active_teams) - Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); - - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - f(team, result); - } - } - } else { - // Case where the reduction is on a non-native data type. 
-#pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - map(to : f) is_device_ptr(scratch_ptr) reduction(custom : result) -#pragma omp parallel reduction(custom : result) - { - if (omp_get_num_teams() > max_active_teams) - Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); - - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - f(team, result); - } - } - } - - // Copy results back to device if `parallel_reduce` is on a device view. - ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), - ptr_on_device); - } else { - ValueType result[NumReductions] = {}; - // Case where the reduction is on an array. -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - map(to : f) is_device_ptr(scratch_ptr) \ - reduction(+ : result[ : NumReductions]) -#pragma omp parallel reduction(+ : result[ : NumReductions]) - { - if (omp_get_num_teams() > max_active_teams) - Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); - - const int blockIdx = omp_get_team_num(); - const int gridDim = omp_get_num_teams(); - - // Guarantee that the compilers respect the `num_teams` clause - for (int league_id = blockIdx; league_id < league_size; - league_id += gridDim) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - blockIdx, shmem_size_L0, shmem_size_L1); - f(team, result); - } - } - - // Copy results back to device if `parallel_reduce` is on a device view. 
- ParReduceCopy::memcpy_result( - result_ptr, result, NumReductions * sizeof(ValueType), ptr_on_device); - } - } - - // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over - // RangePolicy. Need a new implementation. - static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, - PointerType ptr, const bool ptr_on_device) { - Experimental::Impl::OpenMPTargetInternal::verify_is_process( - "Kokkos::Experimental::OpenMPTarget TeamPolicy " - "parallel_reduce:init_join "); - using FunctorAnalysis = - Impl::FunctorAnalysis; - - const int league_size = p.league_size(); - const int team_size = p.team_size(); - const int vector_length = p.impl_vector_length(); - - auto begin = 0; - auto end = league_size * team_size + team_size * vector_length; - - const size_t shmem_size_L0 = p.scratch_size(0, team_size); - const size_t shmem_size_L1 = p.scratch_size(1, team_size); - - // FIXME_OPENMPTARGET: This would oversubscribe scratch memory since we are - // already using the available scratch memory to create temporaries for each - // thread. - if ((shmem_size_L0 + shmem_size_L1) > 0) { - Kokkos::abort( - "OpenMPTarget: Scratch memory is not supported in `parallel_reduce` " - "over functors with init/join."); - } - - const auto nteams = league_size; - - // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f.get_functor()); - - // Allocate scratch per active thread. - p.space().impl_internal_space_instance()->resize_scratch( - 1, 0, value_count * sizeof(ValueType), league_size); - void* scratch_ptr = - p.space().impl_internal_space_instance()->get_scratch_ptr(); - typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); - - if (end <= begin) { -// If there is no work to be done, copy back the initialized values and -// exit. 
-#pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) - { - final_reducer.init(scratch_ptr); - final_reducer.final(scratch_ptr); - } - - if (0 < value_count) { - if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - } - - return; - } - // Use scratch memory extensions to request dynamic shared memory for the - // right compiler/architecture combination. - KOKKOS_IMPL_OMPTARGET_PRAGMA( - teams num_teams(nteams) thread_limit(team_size) map(to - : f) - is_device_ptr(scratch_ptr) - KOKKOS_IMPL_OMPX_DYN_CGROUP_MEM(shmem_size_L0)) { -#pragma omp parallel - { - const int team_num = omp_get_team_num(); - const int num_teams = omp_get_num_teams(); - ValueType* team_scratch = static_cast(scratch_ptr) + - team_num * team_size * value_count; - ReferenceType result = final_reducer.init(&team_scratch[0]); - - for (int league_id = team_num; league_id < league_size; - league_id += num_teams) { - typename PolicyType::member_type team( - league_id, league_size, team_size, vector_length, scratch_ptr, - team_num, shmem_size_L0, shmem_size_L1); - f(team, result); - } - } // end parallel - } // end target - - int tree_neighbor_offset = 1; - do { -#pragma omp target teams distribute parallel for simd firstprivate( \ - final_reducer) is_device_ptr(scratch_ptr) - for (int i = 0; i < nteams - tree_neighbor_offset; - i += 2 * tree_neighbor_offset) { - ValueType* team_scratch = static_cast(scratch_ptr); - const int team_offset = team_size * value_count; - final_reducer.join( - &team_scratch[i * team_offset], - &team_scratch[(i + tree_neighbor_offset) * team_offset]); - - // If `final` is provided by the functor. - // Do the final only once at the end. 
- if (tree_neighbor_offset * 2 >= nteams && omp_get_team_num() == 0 && - omp_get_thread_num() == 0) { - final_reducer.final(scratch_ptr); - } - } - tree_neighbor_offset *= 2; - } while (tree_neighbor_offset < nteams); - - // If the result view is on the host, copy back the values via memcpy. - if (0 < value_count) { - if (!ptr_on_device) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_initial_device(), omp_get_default_device())); - else - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - ptr, scratch_ptr, value_count * sizeof(ValueType), 0, 0, - omp_get_default_device(), omp_get_default_device())); - } - } -}; - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp deleted file mode 100644 index 8bd133661ab..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ /dev/null @@ -1,554 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGETREDUCER_HPP -#define KOKKOS_OPENMPTARGETREDUCER_HPP - -#include - -#include -#include "Kokkos_OpenMPTarget_Abort.hpp" - -namespace Kokkos { -namespace Impl { - -template -struct OpenMPTargetReducerWrapper { - using value_type = typename Reducer::value_type; - - // Using a generic unknown Reducer for the OpenMPTarget backend is not - // implemented. 
- KOKKOS_INLINE_FUNCTION - static void join(value_type&, const value_type&) = delete; - - KOKKOS_INLINE_FUNCTION - static void init(value_type&) = delete; -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { dest += src; } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::sum(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { dest *= src; } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::prod(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src < dest) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::min(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src > dest) dest = src; - } - - // Required - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::max(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest = dest && src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::land(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = 
std::remove_cv_t; - - using result_view_type = Kokkos::View; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest = dest || src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::lor(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest = dest & src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::band(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - public: - // Required - using value_type = std::remove_cv_t; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest = dest | src; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val = reduction_identity::bor(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = ValLocScalar; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) - dest = src; - else if (src.val == dest.val && - dest.loc == reduction_identity::min()) { - dest.loc = src.loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.val = reduction_identity::min(); - val.loc = reduction_identity::min(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = ValLocScalar; - - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.val > dest.val) - dest = src; - else if (src.val == dest.val && - dest.loc == 
reduction_identity::min()) { - dest.loc = src.loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.val = reduction_identity::max(); - val.loc = reduction_identity::min(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - - public: - // Required - using value_type = MinMaxScalar; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_val = reduction_identity::max(); - val.min_val = reduction_identity::min(); - } -}; - -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = MinMaxLocScalar; - - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (dest.min_val == src.min_val && - dest.min_loc == reduction_identity::min()) { - dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (dest.max_val == src.max_val && - dest.max_loc == reduction_identity::min()) { - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_val = reduction_identity::max(); - val.min_val = reduction_identity::min(); - val.max_loc = reduction_identity::min(); - val.min_loc = reduction_identity::min(); - } -}; - -// -// specialize for MaxFirstLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type 
= ValLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.val = reduction_identity::max(); - val.loc = reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for MinFirstLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = ValLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? 
src.loc : dest.loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.val = reduction_identity::min(); - val.loc = reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for MinMaxFirstLastLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using scalar_type = std::remove_cv_t; - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = MinMaxLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_val = reduction_identity::max(); - val.min_val = reduction_identity::min(); - val.max_loc = reduction_identity::max(); - val.min_loc = reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for FirstLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = FirstLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? 
src.min_loc_true - : dest.min_loc_true; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.min_loc_true = reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for LastLoc -// -template -struct OpenMPTargetReducerWrapper> { - private: - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = LastLocScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_loc_true = reduction_identity::max(); - } -#pragma omp end declare target -}; - -// -// specialize for StdIsPartitioned -// -template -struct OpenMPTargetReducerWrapper> { - private: - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = StdIsPartScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? 
dest.min_loc_false - : src.min_loc_false; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.max_loc_true = ::Kokkos::reduction_identity::max(); - val.min_loc_false = ::Kokkos::reduction_identity::min(); - } -#pragma omp end declare target -}; - -// -// specialize for StdPartitionPoint -// -template -struct OpenMPTargetReducerWrapper> { - private: - using index_type = std::remove_cv_t; - - public: - // Required - using value_type = StdPartPointScalar; - -// WORKAROUND OPENMPTARGET -// This pragma omp declare target should not be necessary, but Intel compiler -// fails without it -#pragma omp declare target - // Required - KOKKOS_INLINE_FUNCTION - static void join(value_type& dest, const value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - - KOKKOS_INLINE_FUNCTION - static void init(value_type& val) { - val.min_loc_false = ::Kokkos::reduction_identity::min(); - } -#pragma omp end declare target -}; - -/* -template -class OpenMPTargetReducerWrapper { - public: - const ReducerType& reducer; - using value_type = typename ReducerType::value_type; - value_type& value; - - KOKKOS_INLINE_FUNCTION - void join(const value_type& upd) { - reducer.join(value,upd); - } - - KOKKOS_INLINE_FUNCTION - void init(const value_type& upd) { - reducer.init(value,upd); - } -};*/ - -} // namespace Impl -} // namespace Kokkos - -#endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp deleted file mode 100644 index 6df5dda4e68..00000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_UniqueToken.hpp +++ /dev/null @@ -1,94 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP -#define KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP - -#include -#ifdef 
KOKKOS_ENABLE_OPENMPTARGET - -#include -#include -#include -#include - -namespace Kokkos { -namespace Experimental { - -// both global and instance Unique Tokens are implemented in the same way -template <> -class UniqueToken { - protected: - uint32_t volatile* m_buffer; - uint32_t m_count; - - public: - using execution_space = OpenMPTarget; - using size_type = int32_t; - - explicit UniqueToken(execution_space const& = execution_space()); - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken(const UniqueToken&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken(UniqueToken&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken& operator=(const UniqueToken&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken& operator=(UniqueToken&&) = default; - - /// \brief upper bound for acquired values, i.e. 0 <= value < size() - KOKKOS_INLINE_FUNCTION - size_type size() const noexcept { return m_count; } - - /// \brief acquire value such that 0 <= value < size() - KOKKOS_INLINE_FUNCTION - size_type acquire() const { - const Kokkos::pair result = - Kokkos::Impl::concurrent_bitset::acquire_bounded( - m_buffer, m_count, Kokkos::Impl::clock_tic() % m_count); - - if (result.first < 0) { - Kokkos::abort( - "UniqueToken failure to acquire tokens, no tokens " - "available"); - } - - return result.first; - } - - /// \brief release an acquired value - KOKKOS_INLINE_FUNCTION - void release(size_type i) const noexcept { - Kokkos::Impl::concurrent_bitset::release(m_buffer, i); - } -}; - -template <> -class UniqueToken - : public UniqueToken { - private: - Kokkos::View - m_buffer_view; - - public: - explicit UniqueToken(execution_space const& arg = execution_space()) - : UniqueToken(arg) {} - - UniqueToken(size_type max_size, execution_space const& = execution_space()) - : m_buffer_view( - "Kokkos::UniqueToken::m_buffer_view", - ::Kokkos::Impl::concurrent_bitset::buffer_bound(max_size)) { - m_buffer = m_buffer_view.data(); - m_count = max_size; - } -}; - -} // namespace Experimental -} // 
namespace Kokkos - -#endif // KOKKOS_ENABLE_OPENMPTARGET -#endif // KOKKOS_OPENMPTARGET_UNIQUE_TOKEN_HPP diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index b9bd1c49d29..7a83f26584b 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -15,8 +15,10 @@ import kokkos.core; #else #include #endif -#include + +#include #include +#include #include namespace { @@ -38,38 +40,39 @@ struct Container { } // namespace namespace Kokkos { + +SYCL::~SYCL() { Impl::check_execution_space_destructor_precondition(name()); } + SYCL::SYCL() - : m_space_instance(&Impl::SYCLInternal::singleton(), - [](Impl::SYCLInternal*) {}) { - Impl::SYCLInternal::singleton().verify_is_initialized( - "SYCL instance constructor"); -} + : m_space_instance( + (Impl::check_execution_space_constructor_precondition(name()), + Impl::SYCLInternal::default_instance)) {} SYCL::SYCL(const sycl::queue& stream) - : m_space_instance(new Impl::SYCLInternal, [](Impl::SYCLInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { + : m_space_instance( + (Impl::check_execution_space_constructor_precondition(name()), + Impl::HostSharedPtr(new Impl::SYCLInternal(stream)))) { #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES if (!stream.is_in_order()) Kokkos::abort("User provided sycl::queues must be in-order!"); #endif - Impl::SYCLInternal::singleton().verify_is_initialized( - "SYCL instance constructor"); - m_space_instance->initialize(stream); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int SYCL::concurrency() { - return Impl::SYCLInternal::singleton().m_maxConcurrency; -} -#else int SYCL::concurrency() const { return m_space_instance->m_maxConcurrency; } -#endif const char* SYCL::name() { return "SYCL"; } -void SYCL::impl_finalize() { Impl::SYCLInternal::singleton().finalize(); } +void SYCL::impl_finalize() { + // The global_unique_token_locks array is static and should only be + // deallocated once by the default instance + 
Impl::sycl_global_unique_token_locks(true); +#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED + desul::Impl::finalize_lock_arrays(); + desul::Impl::finalize_lock_arrays_sycl( + Impl::SYCLInternal::default_instance->m_queue); +#endif + Impl::SYCLInternal::default_instance = nullptr; +} void SYCL::print_configuration(std::ostream& os, bool verbose) const { os << "\nRuntime Configuration:\n"; @@ -183,7 +186,7 @@ void SYCL::impl_static_fence(const std::string& name) { std::scoped_lock lock(Impl::SYCLInternal::mutex); for (auto& queue : Impl::SYCLInternal::all_queues) { try { - (*queue)->wait_and_throw(); + queue->wait_and_throw(); } catch (sycl::exception const& e) { Kokkos::Impl::throw_runtime_exception( std::string("There was a synchronous SYCL error:\n") += @@ -198,8 +201,15 @@ void SYCL::impl_initialize(InitializationSettings const& settings) { const auto id = ::Kokkos::Impl::get_gpu(settings).value_or(visible_devices[0]); std::vector sycl_devices = Impl::get_sycl_devices(); - Impl::SYCLInternal::singleton().initialize(sycl_devices[id]); + Impl::SYCLInternal::default_instance = + Impl::HostSharedPtr(new Impl::SYCLInternal(sycl_devices[id])); Impl::SYCLInternal::m_syclDev = id; +#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED + // Init the array for used for arbitrarily sized atomics + desul::Impl::init_lock_arrays(); + desul::Impl::init_lock_arrays_sycl( + Impl::SYCLInternal::default_instance->m_queue); +#endif } std::ostream& SYCL::impl_sycl_info(std::ostream& os, diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index f6f5afcf639..518140bb9fd 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -12,12 +12,7 @@ static_assert(false, #include #ifdef KOKKOS_ENABLE_SYCL -// FIXME_SYCL -#if __has_include() #include -#else -#include -#endif #include #include #include @@ -48,6 +43,13 @@ class SYCL { using scratch_memory_space = ScratchMemorySpace; + SYCL(const SYCL&) = 
default; + SYCL(SYCL&& other) noexcept : SYCL(static_cast(other)) {} + SYCL& operator=(const SYCL&) = default; + SYCL& operator=(SYCL&& other) noexcept { + return *this = static_cast(other); + } + ~SYCL(); SYCL(); explicit SYCL(const sycl::queue&); @@ -55,26 +57,13 @@ class SYCL { return m_space_instance->impl_get_instance_id(); } - sycl::queue& sycl_queue() const noexcept { - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - return *m_space_instance->m_queue; - } + sycl::queue& sycl_queue() const noexcept { return m_space_instance->m_queue; } //@} //------------------------------------ //! \name Functions that all Kokkos devices must implement. //@{ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION static int in_parallel() { -#if defined(__SYCL_DEVICE_ONLY__) - return true; -#else - return false; -#endif - } -#endif - /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); @@ -89,11 +78,7 @@ class SYCL { static void impl_initialize(InitializationSettings const&); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency(); -#else int concurrency() const; -#endif static const char* name(); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp index f9fb281a957..24e8fe8edd7 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Abort.hpp @@ -6,12 +6,7 @@ #include #if defined(KOKKOS_ENABLE_SYCL) -// FIXME_SYCL -#if __has_include() #include -#else -#include -#endif namespace Kokkos { namespace Impl { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp index de06e8ccf27..476e6264827 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp @@ -19,6 +19,9 @@ namespace Kokkos { namespace Impl { template <> class 
GraphImpl { + private: + using device_handle_t = Kokkos::Impl::DeviceHandle; + public: using node_details_t = GraphNodeBackendSpecificDetails; using root_node_impl_t = @@ -42,9 +45,9 @@ class GraphImpl { ~GraphImpl(); - explicit GraphImpl(Kokkos::SYCL instance); + explicit GraphImpl(const device_handle_t& device_handle); - GraphImpl(Kokkos::SYCL instance, native_graph_t native_graph); + GraphImpl(const device_handle_t& device_handle, native_graph_t native_graph); void add_node(std::shared_ptr const& arg_node_ptr); @@ -68,7 +71,7 @@ class GraphImpl { void submit(const Kokkos::SYCL& exec); - Kokkos::SYCL const& get_execution_space() const noexcept; + auto get_device_handle() const noexcept -> device_handle_t const&; auto create_root_node_ptr(); @@ -84,7 +87,7 @@ class GraphImpl { auto& sycl_graph_exec() { return m_graph_exec; } private: - Kokkos::SYCL m_execution_space; + device_handle_t m_device_handle; native_graph_t m_graph; std::optional> @@ -94,18 +97,18 @@ class GraphImpl { }; inline GraphImpl::~GraphImpl() { - m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); + m_device_handle.m_exec.fence( + "Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); } -inline GraphImpl::GraphImpl(Kokkos::SYCL instance) - : m_execution_space(std::move(instance)), - m_graph(m_execution_space.sycl_queue().get_context(), - m_execution_space.sycl_queue().get_device()) {} +inline GraphImpl::GraphImpl(const device_handle_t& device_handle) + : m_device_handle(device_handle), + m_graph(m_device_handle.m_exec.sycl_queue().get_context(), + m_device_handle.m_exec.sycl_queue().get_device()) {} -inline GraphImpl::GraphImpl(Kokkos::SYCL instance, +inline GraphImpl::GraphImpl(const device_handle_t& device_handle, native_graph_t native_graph) - : m_execution_space(std::move(instance)), - m_graph(std::move(native_graph)) {} + : m_device_handle(device_handle), m_graph(std::move(native_graph)) {} inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { @@ 
-190,14 +193,14 @@ inline void GraphImpl::submit(const Kokkos::SYCL& exec) { exec.sycl_queue().ext_oneapi_graph(*m_graph_exec); } -inline Kokkos::SYCL const& GraphImpl::get_execution_space() - const noexcept { - return m_execution_space; +inline auto GraphImpl::get_device_handle() const noexcept + -> device_handle_t const& { + return m_device_handle; } inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(!m_graph_exec); - auto rv = std::make_shared(get_execution_space(), + auto rv = std::make_shared(m_device_handle, _graph_node_is_root_ctor_tag{}); rv->node_details_t::node = m_graph.add(); return rv; @@ -211,7 +214,7 @@ inline auto GraphImpl::create_aggregate_ptr( // each predecessor ref, so all we need to do here is create the (trivial) // aggregate node. return std::make_shared( - m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_impl_t{}); + m_device_handle, _graph_node_kernel_ctor_tag{}, aggregate_impl_t{}); } } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp index 5034348e9c6..72fb6adc603 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Half_Impl_Type.hpp @@ -6,12 +6,7 @@ #include -// FIXME_SYCL -#if __has_include() #include -#else -#include -#endif // Make sure no one else tries to define half_t #ifndef KOKKOS_IMPL_HALF_TYPE_DEFINED diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Half_MathematicalFunctions.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Half_MathematicalFunctions.hpp index cd58d9ada41..5055755dcf0 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Half_MathematicalFunctions.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Half_MathematicalFunctions.hpp @@ -24,6 +24,14 @@ namespace Impl { Experimental::half_t::impl_type(y))); \ } +#define KOKKOS_SYCL_HALF_TERNARY_INT_PTR_FUNCTION(OP) \ + KOKKOS_INLINE_FUNCTION Experimental::half_t impl_##OP( \ + 
Experimental::half_t x, Experimental::half_t y, int* z) { \ + return static_cast( \ + sycl::OP(Experimental::half_t::impl_type(x), \ + Experimental::half_t::impl_type(y), z)); \ + } + #define KOKKOS_SYCL_HALF_UNARY_PREDICATE(OP) \ KOKKOS_INLINE_FUNCTION bool impl_##OP(Experimental::half_t x) { \ return sycl::OP(Experimental::half_t::impl_type(x)); \ @@ -42,6 +50,7 @@ KOKKOS_SYCL_HALF_BINARY_FUNCTION(remainder) KOKKOS_SYCL_HALF_BINARY_FUNCTION(fmax) KOKKOS_SYCL_HALF_BINARY_FUNCTION(fmin) KOKKOS_SYCL_HALF_BINARY_FUNCTION(fdim) +KOKKOS_SYCL_HALF_TERNARY_INT_PTR_FUNCTION(remquo) // Exponential functions KOKKOS_SYCL_HALF_UNARY_FUNCTION(exp) KOKKOS_SYCL_HALF_UNARY_FUNCTION(exp2) @@ -81,6 +90,7 @@ KOKKOS_SYCL_HALF_UNARY_FUNCTION(floor) KOKKOS_SYCL_HALF_UNARY_FUNCTION(trunc) KOKKOS_SYCL_HALF_UNARY_FUNCTION(round) // KOKKOS_SYCL_HALF_UNARY_FUNCTION(nearbyint) +KOKKOS_SYCL_HALF_UNARY_FUNCTION(rint) KOKKOS_SYCL_HALF_UNARY_FUNCTION(logb) KOKKOS_SYCL_HALF_BINARY_FUNCTION(nextafter) KOKKOS_SYCL_HALF_BINARY_FUNCTION(copysign) @@ -88,9 +98,12 @@ KOKKOS_SYCL_HALF_UNARY_PREDICATE(isfinite) KOKKOS_SYCL_HALF_UNARY_PREDICATE(isinf) KOKKOS_SYCL_HALF_UNARY_PREDICATE(isnan) KOKKOS_SYCL_HALF_UNARY_PREDICATE(signbit) +// Non-standard functions +KOKKOS_SYCL_HALF_UNARY_FUNCTION(rsqrt) #undef KOKKOS_SYCL_HALF_UNARY_FUNCTION #undef KOKKOS_SYCL_HALF_BINARY_FUNCTION +#undef KOKKOS_SYCL_HALF_TERNARY_INT_PTR_FUNCTION #undef KOKKOS_SYCL_HALF_UNARY_PREDICATE #endif @@ -129,6 +142,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t impl_test_fallback_bhalf( KOKKOS_SYCL_BHALF_UNARY_FUNCTION(fabs) // fmod // remainder +// remquo KOKKOS_SYCL_BHALF_BINARY_FUNCTION(fmax) KOKKOS_SYCL_BHALF_BINARY_FUNCTION(fmin) // fdim @@ -170,6 +184,7 @@ KOKKOS_SYCL_BHALF_UNARY_FUNCTION(ceil) KOKKOS_SYCL_BHALF_UNARY_FUNCTION(floor) KOKKOS_SYCL_BHALF_UNARY_FUNCTION(trunc) // round +// rint // nearbyint // logb // nextafter @@ -178,6 +193,9 @@ KOKKOS_SYCL_BHALF_UNARY_FUNCTION(trunc) // isinf 
KOKKOS_SYCL_BHALF_UNARY_PREDICATE(isnan) // signbit +// Non-standard functions +KOKKOS_SYCL_BHALF_UNARY_FUNCTION(rsqrt) +// rcp #undef KOKKOS_SYCL_BHALF_UNARY_FUNCTION #undef KOKKOS_SYCL_BHALF_BINARY_FUNCTION diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 37040097af4..16eb33b5e8b 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -15,8 +15,6 @@ import kokkos.core; // kokkos_malloc #include #include -// FIXME_SYCL -// NOLINTBEGIN(bugprone-unchecked-optional-access) namespace Kokkos { namespace Impl { @@ -31,7 +29,7 @@ std::size_t scratch_count(const std::size_t size) { } // namespace -std::vector*> SYCLInternal::all_queues; +std::vector SYCLInternal::all_queues; std::mutex SYCLInternal::mutex; Kokkos::View sycl_global_unique_token_locks( @@ -45,55 +43,41 @@ Kokkos::View sycl_global_unique_token_locks( return locks; } -SYCLInternal::~SYCLInternal() { - if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { - std::cerr << "Kokkos::SYCL ERROR: Failed to call " - "Kokkos::SYCL::finalize()" - << std::endl; - std::cerr.flush(); - } -} - int SYCLInternal::verify_is_initialized(const char* const label) const { - if (!is_initialized()) { + if (!default_instance) { Kokkos::abort((std::string("Kokkos::SYCL::") + label + " : ERROR device not initialized\n") .c_str()); } - return is_initialized(); -} -SYCLInternal& SYCLInternal::singleton() { - static SYCLInternal self; - return self; + return static_cast(default_instance); } -void SYCLInternal::initialize(const sycl::device& d) { - auto exception_handler = [](sycl::exception_list exceptions) { - bool asynchronous_error = false; - for (std::exception_ptr const& e : exceptions) { - try { - std::rethrow_exception(e); - } catch (sycl::exception const& e) { - std::cerr << e.what() << '\n'; - asynchronous_error = true; - } - } - if (asynchronous_error) - 
Kokkos::Impl::throw_runtime_exception( - "There was an asynchronous SYCL error!\n"); - }; +SYCLInternal::SYCLInternal(const sycl::device& d) + : SYCLInternal(sycl::queue{ + d, + [](sycl::exception_list exceptions) { + bool asynchronous_error = false; + for (std::exception_ptr const& e : exceptions) { + try { + std::rethrow_exception(e); + } catch (sycl::exception const& e) { + std::cerr << e.what() << '\n'; + asynchronous_error = true; + } + } + if (asynchronous_error) + Kokkos::Impl::throw_runtime_exception( + "There was an asynchronous SYCL error!\n"); + } #ifdef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - initialize( - sycl::queue{d, exception_handler, sycl::property::queue::in_order()}); -#else - initialize(sycl::queue{d, exception_handler}); + + , + sycl::property::queue::in_order() #endif + }) { } -// FIXME_SYCL -void SYCLInternal::initialize(const sycl::queue& q) { - KOKKOS_EXPECTS(!is_initialized()); - +SYCLInternal::SYCLInternal(const sycl::queue& q) : m_queue(q) { #define KOKKOS_IMPL_CHECK_SYCL_BACKEND_SUPPORT(BACKEND, REQUIRED) \ if (BACKEND != REQUIRED) \ Kokkos::abort( \ @@ -111,16 +95,12 @@ void SYCLInternal::initialize(const sycl::queue& q) { sycl::backend::ext_oneapi_hip); #endif - if (was_finalized) - Kokkos::abort("Calling SYCL::initialize after SYCL::finalize is illegal\n"); - - m_queue = q; // guard pushing to all_queues { std::scoped_lock lock(mutex); all_queues.push_back(&m_queue); } - const sycl::device& d = m_queue->get_device(); + const sycl::device& d = m_queue.get_device(); m_maxWorkgroupSize = d.template get_info(); @@ -133,16 +113,8 @@ void SYCLInternal::initialize(const sycl::queue& q) { d.template get_info(); for (auto& usm_mem : m_indirectKernelMem) { - usm_mem.reset(*m_queue, m_instance_id); - } - -#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED - // Init the array for used for arbitrarily sized atomics - if (this == &singleton()) { - desul::Impl::init_lock_arrays(); - desul::Impl::init_lock_arrays_sycl(*m_queue); + 
usm_mem.reset(m_queue, m_instance_id); } -#endif } int SYCLInternal::acquire_team_scratch_space() { @@ -158,12 +130,12 @@ int SYCLInternal::acquire_team_scratch_space() { return current_team_scratch; } -Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( +sycl::global_ptr SYCLInternal::resize_team_scratch_space( int scratch_pool_id, std::int64_t bytes, bool force_shrink) { // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. - auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(m_queue); if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; m_team_scratch_ptr[scratch_pool_id] = @@ -190,24 +162,13 @@ void SYCLInternal::register_team_scratch_event(int scratch_pool_id, uint32_t SYCLInternal::impl_get_instance_id() const { return m_instance_id; } -void SYCLInternal::finalize() { - SYCLInternal::fence(*m_queue, - "Kokkos::SYCLInternal::finalize: fence on finalization", +SYCLInternal::~SYCLInternal() { + SYCLInternal::fence(m_queue, + "Kokkos::SYCLInternal::finalize: fence on destruction", m_instance_id); - was_finalized = true; - - // The global_unique_token_locks array is static and should only be - // deallocated once by the defualt instance - if (this == &singleton()) { - Impl::sycl_global_unique_token_locks(true); -#ifdef KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED - desul::Impl::finalize_lock_arrays(); - desul::Impl::finalize_lock_arrays_sycl(*m_queue); -#endif - } - auto device_mem_space = SYCLDeviceUSMSpace(*m_queue); - auto host_mem_space = SYCLHostUSMSpace(*m_queue); + auto device_mem_space = SYCLDeviceUSMSpace(m_queue); + auto host_mem_space = SYCLHostUSMSpace(m_queue); if (nullptr != m_scratchSpace) device_mem_space.deallocate(m_scratchSpace, m_scratchSpaceCount * sizeScratchGrain); @@ -217,20 +178,10 @@ void 
SYCLInternal::finalize() { if (nullptr != m_scratchFlags) device_mem_space.deallocate(m_scratchFlags, m_scratchFlagsCount * sizeScratchGrain); - m_syclDev = -1; - m_scratchSpaceCount = 0; - m_scratchSpace = nullptr; - m_scratchHostCount = 0; - m_scratchHost = nullptr; - m_scratchFlagsCount = 0; - m_scratchFlags = nullptr; - for (int i = 0; i < m_n_team_scratch; ++i) { if (m_team_scratch_current_size[i] > 0) { device_mem_space.deallocate(m_team_scratch_ptr[i], m_team_scratch_current_size[i]); - m_team_scratch_current_size[i] = 0; - m_team_scratch_ptr[i] = nullptr; } } @@ -240,14 +191,12 @@ void SYCLInternal::finalize() { std::scoped_lock lock(mutex); all_queues.erase(std::find(all_queues.begin(), all_queues.end(), &m_queue)); } - m_queue.reset(); } -Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( - const std::size_t size) { +sycl::global_ptr SYCLInternal::scratch_space(const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(m_queue); if (nullptr != m_scratchSpace) mem_space.deallocate(m_scratchSpace, @@ -264,11 +213,10 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( return m_scratchSpace; } -Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( - const std::size_t size) { +sycl::global_ptr SYCLInternal::scratch_host(const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { - auto mem_space = Kokkos::SYCLHostUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLHostUSMSpace(m_queue); if (nullptr != m_scratchHost) mem_space.deallocate(m_scratchHost, @@ -285,11 +233,10 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( return m_scratchHost; } -Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( - const std::size_t size) { +sycl::global_ptr SYCLInternal::scratch_flags(const std::size_t size) { if 
(verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(m_queue); if (nullptr != m_scratchFlags) mem_space.deallocate(m_scratchFlags, @@ -305,10 +252,10 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( // We only zero-initialize the allocation when we actually allocate. // It's the responsibility of the features using scratch_flags, // namely parallel_reduce and parallel_scan, to reset the used values to 0. - auto memset_event = m_queue->memset(m_scratchFlags, 0, - m_scratchFlagsCount * sizeScratchGrain); + auto memset_event = m_queue.memset(m_scratchFlags, 0, + m_scratchFlagsCount * sizeScratchGrain); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES - m_queue->ext_oneapi_submit_barrier(std::vector{memset_event}); + m_queue.ext_oneapi_submit_barrier(std::vector{memset_event}); #endif } @@ -352,6 +299,7 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { KOKKOS_ASSERT(m_q); if (m_capacity < n) { + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) AllocationSpace alloc_space(*m_q); if (m_data) alloc_space.deallocate(m_data, m_capacity); @@ -368,8 +316,10 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { template void SYCLInternal::USMObjectMem::reset() { if (m_data) { + KOKKOS_ASSERT(m_q); // This implies a fence since this class is not copyable // and deallocating implies a fence across all registered queues. 
+ // NOLINTNEXTLINE(bugprone-unchecked-optional-access) AllocationSpace alloc_space(*m_q); alloc_space.deallocate(m_data, m_capacity); @@ -381,10 +331,11 @@ void SYCLInternal::USMObjectMem::reset() { int SYCLInternal::m_syclDev; +HostSharedPtr SYCLInternal::default_instance; + template class SYCLInternal::USMObjectMem; template class SYCLInternal::USMObjectMem; template class SYCLInternal::USMObjectMem; } // namespace Impl } // namespace Kokkos - // NOLINTEND(bugprone-unchecked-optional-access) diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 23685bcf42d..5546008da9d 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -5,15 +5,12 @@ #define KOKKOS_SYCL_INSTANCE_HPP_ #include -// FIXME_SYCL -#if __has_include() #include -#else -#include -#endif #include #include +#include + namespace Kokkos { namespace Impl { @@ -21,7 +18,10 @@ class SYCLInternal { public: using size_type = unsigned int; - SYCLInternal() = default; + static HostSharedPtr default_instance; + + SYCLInternal(const sycl::device& d); + SYCLInternal(const sycl::queue& q); ~SYCLInternal(); SYCLInternal(const SYCLInternal&) = delete; @@ -29,12 +29,13 @@ class SYCLInternal { SYCLInternal& operator=(SYCLInternal&&) = delete; SYCLInternal(SYCLInternal&&) = delete; - Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); - Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); - Kokkos::Impl::sycl_host_ptr scratch_host(const std::size_t size); + sycl::global_ptr scratch_space(const std::size_t size); + sycl::global_ptr scratch_flags(const std::size_t size); + sycl::global_ptr scratch_host(const std::size_t size); int acquire_team_scratch_space(); - Kokkos::Impl::sycl_device_ptr resize_team_scratch_space( - int scratch_pool_id, std::int64_t bytes, bool force_shrink = false); + sycl::global_ptr resize_team_scratch_space(int scratch_pool_id, + 
std::int64_t bytes, + bool force_shrink = false); void register_team_scratch_event(int scratch_pool_id, sycl::event event); uint32_t impl_get_instance_id() const; @@ -44,32 +45,29 @@ class SYCLInternal { uint32_t m_maxConcurrency = 0; uint64_t m_maxShmemPerBlock = 0; - std::size_t m_scratchSpaceCount = 0; - Kokkos::Impl::sycl_device_ptr m_scratchSpace = nullptr; - std::size_t m_scratchHostCount = 0; - Kokkos::Impl::sycl_host_ptr m_scratchHost = nullptr; - std::size_t m_scratchFlagsCount = 0; - Kokkos::Impl::sycl_device_ptr m_scratchFlags = nullptr; + std::size_t m_scratchSpaceCount = 0; + sycl::global_ptr m_scratchSpace = nullptr; + std::size_t m_scratchHostCount = 0; + sycl::global_ptr m_scratchHost = nullptr; + std::size_t m_scratchFlagsCount = 0; + sycl::global_ptr m_scratchFlags = nullptr; // mutex to access shared memory mutable std::mutex m_mutexScratchSpace; // Team Scratch Level 1 Space - static constexpr int m_n_team_scratch = 10; - mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; - mutable Kokkos::Impl::sycl_device_ptr - m_team_scratch_ptr[m_n_team_scratch] = {}; - mutable int m_current_team_scratch = 0; - mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; + static constexpr int m_n_team_scratch = 10; + mutable int64_t m_team_scratch_current_size[m_n_team_scratch] = {}; + mutable sycl::global_ptr m_team_scratch_ptr[m_n_team_scratch] = {}; + mutable int m_current_team_scratch = 0; + mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); - std::optional m_queue; + sycl::queue m_queue; - // Using std::vector> reveals a compiler bug when - // compiling for the CUDA backend. Storing pointers instead works around this. - static std::vector*> all_queues; + static std::vector all_queues; // We need a mutex for thread safety when modifying all_queues. 
static std::mutex mutex; @@ -181,20 +179,8 @@ class SYCLInternal { using IndirectKernelMem = USMObjectMem; IndirectKernelMem& get_indirect_kernel_mem(); - bool was_finalized = false; - - static SYCLInternal& singleton(); - int verify_is_initialized(const char* const label) const; - void initialize(const sycl::device& d); - - void initialize(const sycl::queue& q); - - int is_initialized() const { return m_queue.has_value(); } - - void finalize(); - private: // fence(...) takes any type with a .wait_and_throw() method // (sycl::event and sycl::queue) @@ -248,11 +234,20 @@ class SYCLFunctionWrapper { std::memcpy(static_cast(&m_f), static_cast(&other.m_f), sizeof(m_f)); } + TrivialWrapper(TrivialWrapper&& other) { + std::memcpy(static_cast(&m_f), + static_cast(&other.m_f), sizeof(m_f)); + } TrivialWrapper& operator=(const TrivialWrapper& other) { std::memcpy(static_cast(&m_f), static_cast(&other.m_f), sizeof(m_f)); return *this; } + TrivialWrapper& operator=(TrivialWrapper&& other) { + std::memcpy(static_cast(&m_f), + static_cast(&other.m_f), sizeof(m_f)); + return *this; + } ~TrivialWrapper() {} Functor m_f; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp index 507b4672d5b..7c3cf260c84 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp @@ -29,11 +29,8 @@ inline TileSizeProperties get_tile_size_properties( TileSizeProperties properties; properties.max_threads = space.impl_internal_space_instance()->m_maxWorkgroupSize; - properties.default_largest_tile_size = 16; - properties.default_tile_size = 2; - properties.max_total_tile_size = properties.max_threads; - - auto device = space.sycl_queue().get_device(); + properties.max_total_tile_size = properties.max_threads; + auto device = space.sycl_queue().get_device(); auto max_work_item_sizes = device.get_info>(); properties.max_threads_dimensions[0] = 
max_work_item_sizes[0]; @@ -42,6 +39,55 @@ inline TileSizeProperties get_tile_size_properties( return properties; } +template <> +struct TileSizeRecommended { + template + static auto get(Policy const&) { + constexpr auto InnerDirection = Policy::inner_direction; + constexpr int Rank = Policy::rank; + + using tile_type = typename Policy::tile_type; + + if constexpr (InnerDirection == Iterate::Left) { + if constexpr (Rank == 2) { + return tile_type{32, 8}; + } else if constexpr (Rank == 3) { + return tile_type{32, 2, 4}; + } else if constexpr (Rank == 4) { + return tile_type{16, 2, 2, 2}; + } else if constexpr (Rank == 5) { + return tile_type{16, 2, 2, 2, 2}; + } else if constexpr (Rank == 6) { + return tile_type{16, 2, 2, 2, 2, 1}; + } + tile_type tile_sizes{}; + for (int i = 0; i < Rank; ++i) { + tile_sizes[i] = 2; + } + tile_sizes[0] = 16; + return tile_sizes; + } else { + if constexpr (Rank == 2) { + return tile_type{8, 32}; + } else if constexpr (Rank == 3) { + return tile_type{4, 2, 32}; + } else if constexpr (Rank == 4) { + return tile_type{2, 2, 2, 16}; + } else if constexpr (Rank == 5) { + return tile_type{2, 2, 2, 2, 16}; + } else if constexpr (Rank == 6) { + return tile_type{1, 2, 2, 2, 2, 16}; + } + tile_type tile_sizes{}; + for (int i = 0; i < Rank; ++i) { + tile_sizes[i] = 2; + } + tile_sizes[Rank - 1] = 16; + return tile_sizes; + } + } +}; + // Settings for TeamMDRangePolicy template struct ThreadAndVectorNestLevel diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index 7e390ef71c6..f88752e58e1 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -25,133 +25,35 @@ class Kokkos::Impl::ParallelFor, using index_type = typename Policy::index_type; using WorkTag = typename Policy::work_tag; using MaxGridSize = Kokkos::Array; + using array_type = typename Policy::point_type; const 
FunctorType m_functor; - // MDRangePolicy is not trivially copyable. Hence, replicate the data we - // really need in DeviceIterateTile in a trivially copyable struct. - const struct BarePolicy { - using index_type = typename Policy::index_type; - - BarePolicy(const Policy& policy) - : m_lower(policy.m_lower), - m_upper(policy.m_upper), - m_tile(policy.m_tile), - m_tile_end(policy.m_tile_end), - m_num_tiles(policy.m_num_tiles) {} - - const typename Policy::point_type m_lower; - const typename Policy::point_type m_upper; - const typename Policy::tile_type m_tile; - const typename Policy::point_type m_tile_end; - const typename Policy::index_type m_num_tiles; - static constexpr Iterate inner_direction = Policy::inner_direction; - } m_policy; + const Policy m_policy; const MaxGridSize m_max_grid_size; - const Kokkos::SYCL& m_space; - - sycl::nd_range<3> compute_ranges() const { - const auto& m_tile = m_policy.m_tile; - const auto& m_tile_end = m_policy.m_tile_end; - - if constexpr (Policy::rank == 2) { - // id0 to threadIdx.x; id1 to threadIdx.y - sycl::range<3> local_sizes(m_tile[0], m_tile[1], 1); - - sycl::range<3> global_sizes( - std::min(m_tile_end[0], m_max_grid_size[0]) * - m_tile[0], - std::min(m_tile_end[1], m_max_grid_size[1]) * - m_tile[1], - 1); - - return {global_sizes, local_sizes}; - } - if constexpr (Policy::rank == 3) { - // id0 to threadIdx.x; id1 to threadIdx.y; id2 to threadIdx.z - sycl::range<3> local_sizes(m_tile[0], m_tile[1], m_tile[2]); - - sycl::range<3> global_sizes( - std::min(m_tile_end[0], m_max_grid_size[0]) * - m_tile[0], - std::min(m_tile_end[1], m_max_grid_size[1]) * - m_tile[1], - std::min(m_tile_end[2], m_max_grid_size[2]) * - m_tile[2]); - - return {global_sizes, local_sizes}; - } - if constexpr (Policy::rank == 4) { - // id0,id1 encoded within first index; id2 to second index; id3 to third - // index - sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2], m_tile[3]); - - sycl::range<3> global_sizes( - std::min(m_tile_end[0] 
* m_tile_end[1], - m_max_grid_size[0]) * - m_tile[0] * m_tile[1], - std::min(m_tile_end[2], m_max_grid_size[1]) * - m_tile[2], - std::min(m_tile_end[3], m_max_grid_size[2]) * - m_tile[3]); - return {global_sizes, local_sizes}; - } - if constexpr (Policy::rank == 5) { - // id0,id1 encoded within first index; id2,id3 to second index; id4 to - // third index - sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2] * m_tile[3], - m_tile[4]); - - sycl::range<3> global_sizes( - std::min(m_tile_end[0] * m_tile_end[1], - m_max_grid_size[0]) * - m_tile[0] * m_tile[1], - std::min(m_tile_end[2] * m_tile_end[3], - m_max_grid_size[1]) * - m_tile[2] * m_tile[3], - std::min(m_tile_end[4], m_max_grid_size[2]) * - m_tile[4]); - - return {global_sizes, local_sizes}; - } - if constexpr (Policy::rank == 6) { - // id0,id1 encoded within first index; id2,id3 to second index; id4,id5 to - // third index - sycl::range<3> local_sizes(m_tile[0] * m_tile[1], m_tile[2] * m_tile[3], - m_tile[4] * m_tile[5]); - - sycl::range<3> global_sizes( - std::min(m_tile_end[0] * m_tile_end[1], - m_max_grid_size[0]) * - m_tile[0] * m_tile[1], - std::min(m_tile_end[2] * m_tile_end[3], - m_max_grid_size[1]) * - m_tile[2] * m_tile[3], - std::min(m_tile_end[4] * m_tile_end[5], - m_max_grid_size[2]) * - m_tile[4] * m_tile[5]); - - return {global_sizes, local_sizes}; - } - static_assert(Policy::rank > 1 && Policy::rank < 7, - "Kokkos::MDRange Error: Exceeded rank bounds with SYCL\n"); - } + array_type m_lower; + array_type m_upper; + array_type m_extent; // tile_size * num_tiles template sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - sycl::queue& q = m_space.sycl_queue(); + const Kokkos::SYCL& space = m_policy.space(); + sycl::queue& q = space.sycl_queue(); if (m_policy.m_num_tiles == 0) return {}; - const BarePolicy bare_policy(m_policy); - const auto& max_grid_size = m_max_grid_size; + const auto lower_bound = 
m_lower; + const auto upper_bound = m_upper; + const auto extent = m_extent; desul::ensure_sycl_lock_arrays_on_device(q); - auto cgh_lambda = [&](sycl::handler& cgh) { - const auto range = compute_ranges(); + const auto range = + Kokkos::Impl::compute_device_launch_params(m_policy, m_max_grid_size); + + auto cgh_lambda = [&, range](sycl::handler& cgh) { const sycl::range<3> global_range = range.get_global_range(); const sycl::range<3> local_range = range.get_local_range(); const sycl::nd_range sycl_swapped_range{ @@ -163,13 +65,16 @@ class Kokkos::Impl::ParallelFor, #else (void)memcpy_event; #endif - cgh.parallel_for(sycl_swapped_range, [functor_wrapper, bare_policy, - max_grid_size]( + cgh.parallel_for(sycl_swapped_range, [lower_bound, upper_bound, extent, + functor_wrapper]( sycl::nd_item<3> item) { // swap back for correct index calculations in DeviceIterateTile const index_type local_x = item.get_local_id(2); const index_type local_y = item.get_local_id(1); const index_type local_z = item.get_local_id(0); + const index_type n_local_x = item.get_local_range(2); + const index_type n_local_y = item.get_local_range(1); + const index_type n_local_z = item.get_local_range(0); const index_type global_x = item.get_group(2); const index_type global_y = item.get_group(1); const index_type global_z = item.get_group(0); @@ -177,11 +82,13 @@ class Kokkos::Impl::ParallelFor, const index_type n_global_y = item.get_group_range(1); const index_type n_global_z = item.get_group_range(0); - Kokkos::Impl::DeviceIterateTile( - bare_policy, functor_wrapper.get_functor(), max_grid_size, + Kokkos::Impl::DeviceIterate( + lower_bound, upper_bound, extent, functor_wrapper.get_functor(), {n_global_x, n_global_y, n_global_z}, - {global_x, global_y, global_z}, {local_x, local_y, local_z}) + {n_local_x, n_local_y, n_local_z}, {global_x, global_y, global_z}, + {local_x, local_y, local_z}) .exec_range(); }); }; @@ -239,8 +146,9 @@ class Kokkos::Impl::ParallelFor, } void execute() const { + auto 
space_instance = m_policy.space().impl_internal_space_instance(); Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = - m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); + space_instance->get_indirect_kernel_mem(); auto functor_wrapper = Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); @@ -252,8 +160,22 @@ class Kokkos::Impl::ParallelFor, ParallelFor(const FunctorType& arg_functor, const Policy& arg_policy) : m_functor(arg_functor), m_policy(arg_policy), - m_max_grid_size(get_max_grid_size(arg_policy)), - m_space(arg_policy.space()) {} + m_max_grid_size(get_max_grid_size(arg_policy)) { + // Initialize begins and ends based on layout + // Swap the fastest indexes to x dimension + for (array_index_type i = 0; i < Policy::rank; ++i) { + if constexpr (Policy::inner_direction == Iterate::Left) { + m_lower[i] = m_policy.m_lower[i]; + m_upper[i] = m_policy.m_upper[i]; + m_extent[i] = m_policy.m_tile[i] * m_policy.m_tile_end[i]; + } else { + m_lower[i] = m_policy.m_lower[Policy::rank - 1 - i]; + m_upper[i] = m_policy.m_upper[Policy::rank - 1 - i]; + m_extent[i] = m_policy.m_tile[Policy::rank - 1 - i] * + m_policy.m_tile_end[Policy::rank - 1 - i]; + } + } + } }; #endif // KOKKOS_SYCL_PARALLEL_FOR_MDRANGE_HPP_ diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index f92d9c6f934..c316f3c3531 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -35,9 +35,10 @@ class Kokkos::Impl::ParallelFor, size_t m_scratch_size[2]; template - sycl::event sycl_direct_launch(const sycl_device_ptr global_scratch_ptr, - const FunctorWrapper& functor_wrapper, - const sycl::event& memcpy_event) const { + sycl::event sycl_direct_launch( + const sycl::global_ptr global_scratch_ptr, + const FunctorWrapper& functor_wrapper, + const sycl::event& memcpy_event) const { // Convenience references 
const Kokkos::SYCL& space = m_policy.space(); sycl::queue& q = space.sycl_queue(); @@ -130,8 +131,8 @@ class Kokkos::Impl::ParallelFor, // Functor's reduce memory, team scan memory, and team shared memory depend // upon team size. int scratch_pool_id = instance.acquire_team_scratch_space(); - const sycl_device_ptr global_scratch_ptr = - static_cast>(instance.resize_team_scratch_space( + const sycl::global_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); @@ -170,22 +171,32 @@ class Kokkos::Impl::ParallelFor, const auto& instance = *m_policy.space().impl_internal_space_instance(); if (static_cast(instance.m_maxShmemPerBlock) < - m_shmem_size - m_shmem_begin) { + m_shmem_size + m_shmem_begin) { std::stringstream out; - out << "Kokkos::Impl::ParallelFor insufficient shared memory! " - "Requested " - << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << instance.m_maxShmemPerBlock << '\n'; + out << "Kokkos::parallel_for Requested too much scratch memory on " + "level 0. Requested: " + << m_shmem_size + m_shmem_begin + << ", Maximum: " << instance.m_maxShmemPerBlock; + Kokkos::Impl::throw_runtime_exception(out.str()); + } + + if (m_scratch_size[1] > static_cast(m_policy.scratch_size_max(1))) { + std::stringstream out; + out << "Kokkos::parallel_for Requested too much scratch memory on " + "level 1. Requested: " + << m_scratch_size[1] << ", Maximum: " << m_policy.scratch_size_max(1); Kokkos::Impl::throw_runtime_exception(out.str()); } const auto max_team_size = m_policy.team_size_max(arg_functor, ParallelForTag{}); - if (m_team_size > m_policy.team_size_max(arg_functor, ParallelForTag{})) - Kokkos::Impl::throw_runtime_exception( - "Kokkos::Impl::ParallelFor requested too large team size. 
The " - "maximal team_size is " + - std::to_string(max_team_size) + '!'); + if (m_team_size > max_team_size) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too large team size. " + "Requested: " + << m_team_size << ", Maximum: " << max_team_size; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } } }; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 4ab1403899c..8c84d6fb5eb 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -79,10 +79,10 @@ class Kokkos::Impl::ParallelReduce results_ptr; + sycl::global_ptr results_ptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -99,7 +99,7 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto device_accessible_result_ptr = m_result_ptr_device_accessible @@ -149,13 +149,13 @@ class Kokkos::Impl::ParallelReduce>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count * n_wgroups)); auto device_accessible_result_ptr = m_result_ptr_device_accessible ? 
static_cast>(m_result_ptr) : static_cast>(host_result_ptr); - auto scratch_flags = static_cast>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto cgh_lambda = [&](sycl::handler& cgh) { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp index 81ddb4aca73..1aa2df6c4e7 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Range.hpp @@ -53,10 +53,10 @@ class Kokkos::Impl::ParallelReduce< std::size_t size = policy.end() - policy.begin(); const unsigned int value_count = m_functor_reducer.get_reducer().value_count(); - sycl_device_ptr results_ptr = nullptr; + sycl::global_ptr results_ptr = nullptr; auto host_result_ptr = (m_result_ptr && !m_result_ptr_device_accessible) - ? static_cast>( + ? static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; auto device_accessible_result_ptr = @@ -72,7 +72,7 @@ class Kokkos::Impl::ParallelReduce< // working with the global scratch memory but don't copy back to // m_result_ptr yet. if (size <= 1) { - results_ptr = static_cast>( + results_ptr = static_cast>( instance.scratch_space(sizeof(value_type) * value_count)); auto cgh_lambda = [&](sycl::handler& cgh) { @@ -117,13 +117,13 @@ class Kokkos::Impl::ParallelReduce< // workgroups separately, write the workgroup results back to global // memory and recurse until only one workgroup does the reduction and thus // gets the final value. 
- auto scratch_flags = static_cast>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto reduction_lambda_factory = [&](sycl::local_accessor local_mem, sycl::local_accessor num_teams_done, - sycl_device_ptr results_ptr, int values_per_thread) { + sycl::global_ptr results_ptr, int values_per_thread) { const auto begin = policy.begin(); auto lambda = [=](sycl::nd_item<1> item) { @@ -285,7 +285,7 @@ class Kokkos::Impl::ParallelReduce< } results_ptr = - static_cast>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * value_count * n_wgroups)); sycl::local_accessor local_mem( diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index bc593bd98e0..501ca88d128 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -49,7 +49,7 @@ class Kokkos::Impl::ParallelReduce sycl::event sycl_direct_launch( - const sycl_device_ptr global_scratch_ptr, + const sycl::global_ptr global_scratch_ptr, const CombinedFunctorReducerWrapper& functor_reducer_wrapper, const sycl::event& memcpy_event) const { // Convenience references @@ -64,7 +64,7 @@ class Kokkos::Impl::ParallelReduce>( + ? 
static_cast>( instance.scratch_host(sizeof(value_type) * value_count)) : nullptr; @@ -77,7 +77,7 @@ class Kokkos::Impl::ParallelReduce>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u))); auto device_accessible_result_ptr = m_result_ptr_device_accessible @@ -144,7 +144,7 @@ class Kokkos::Impl::ParallelReduce>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); auto cgh_lambda = [&](sycl::handler& cgh) { // FIXME_SYCL accessors seem to need a size greater than zero at least @@ -162,7 +162,7 @@ class Kokkos::Impl::ParallelReduce local_mem, - sycl_device_ptr results_ptr) { + sycl::global_ptr results_ptr) { auto device_accessible_result_ptr = m_result_ptr_device_accessible ? static_cast>(m_result_ptr) @@ -320,7 +320,7 @@ class Kokkos::Impl::ParallelReduce((size + wgroup_size - 1) / wgroup_size, 1); results_ptr = - static_cast>(instance.scratch_space( + static_cast>(instance.scratch_space( sizeof(value_type) * std::max(value_count, 1u) * init_size)); size_t max_work_groups = @@ -399,8 +399,8 @@ class Kokkos::Impl::ParallelReduce global_scratch_ptr = - static_cast>(instance.resize_team_scratch_space( + const sycl::global_ptr global_scratch_ptr = + static_cast>(instance.resize_team_scratch_space( scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); @@ -456,21 +456,35 @@ class Kokkos::Impl::ParallelReduce(instance.m_maxShmemPerBlock) < - m_shmem_size - m_shmem_begin) { + m_shmem_size + m_shmem_begin) { + std::stringstream out; + out << "Kokkos::parallel_reduce Requested too much scratch memory " + "on level 0. Requested: " + << m_shmem_size + m_shmem_begin << ", Maximum " + << instance.m_maxShmemPerBlock; + Kokkos::Impl::throw_runtime_exception(out.str()); + } + + if (m_scratch_size[1] > static_cast(m_policy.scratch_size_max(1))) { std::stringstream out; - out << "Kokkos::Impl::ParallelFor insufficient shared memory! 
" - "Requested " - << m_shmem_size - m_shmem_begin << " bytes but maximum is " - << instance.m_maxShmemPerBlock << '\n'; + out << "Kokkos::parallel_reduce Requested too much scratch memory " + "on level 1. Requested: " + << m_scratch_size[1] << ", Maximum " << m_policy.scratch_size_max(1); Kokkos::Impl::throw_runtime_exception(out.str()); } - if (m_team_size > m_policy.team_size_max(m_functor_reducer.get_functor(), - m_functor_reducer.get_reducer(), - ParallelReduceTag{})) - Kokkos::Impl::throw_runtime_exception( - "Kokkos::Impl::ParallelFor requested too large team size."); + const auto max_team_size = m_policy.team_size_max( + m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), + ParallelReduceTag{}); + if (m_team_size > max_team_size) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too large team size. " + "Requested: " + << m_team_size << ", Maximum: " << max_team_size; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } } }; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index 7324b6aa38b..06fe8b962bb 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -139,7 +139,7 @@ class ParallelScanSYCLBase { const CombinedFunctorReducer m_functor_reducer; const Policy m_policy; - sycl_host_ptr m_scratch_host = nullptr; + sycl::global_ptr m_scratch_host = nullptr; pointer_type m_result_ptr; const bool m_result_ptr_device_accessible; @@ -155,93 +155,95 @@ class ParallelScanSYCLBase { const auto size = m_policy.end() - m_policy.begin(); - auto scratch_flags = static_cast>( + auto scratch_flags = static_cast>( instance.scratch_flags(sizeof(unsigned int))); const auto begin = m_policy.begin(); // Initialize global memory - auto scan_lambda_factory = [&](sycl::local_accessor local_mem, - sycl::local_accessor - num_teams_done, - sycl_device_ptr 
global_mem_, - sycl_device_ptr group_results_) { - auto lambda = [=](sycl::nd_item<1> item) { - auto global_mem = global_mem_; - auto group_results = group_results_; - - const CombinedFunctorReducer& - functor_reducer = functor_wrapper.get_functor(); - const FunctorType& functor = functor_reducer.get_functor(); - const typename Analysis::Reducer& reducer = - functor_reducer.get_reducer(); - - const auto n_wgroups = item.get_group_range()[0]; - const int wgroup_size = item.get_local_range()[0]; - - const int local_id = item.get_local_linear_id(); - const index_type global_id = item.get_global_linear_id(); - - // Initialize local memory - value_type local_value; - reducer.init(&local_value); - if (global_id < size) { - if constexpr (std::is_void_v) - functor(global_id + begin, local_value, false); - else - functor(WorkTag(), global_id + begin, local_value, false); - } + auto scan_lambda_factory = + [&](sycl::local_accessor local_mem, + sycl::local_accessor num_teams_done, + sycl::global_ptr global_mem_, + sycl::global_ptr group_results_) { + auto lambda = [=](sycl::nd_item<1> item) { + auto global_mem = global_mem_; + auto group_results = group_results_; - workgroup_scan<>(item, reducer, local_mem, local_value, wgroup_size); + const CombinedFunctorReducer< + FunctorType, typename Analysis::Reducer>& functor_reducer = + functor_wrapper.get_functor(); + const FunctorType& functor = functor_reducer.get_functor(); + const typename Analysis::Reducer& reducer = + functor_reducer.get_reducer(); - // Write results to global memory - if (global_id < size) global_mem[global_id] = local_value; + const auto n_wgroups = item.get_group_range()[0]; + const int wgroup_size = item.get_local_range()[0]; - if (local_id == wgroup_size - 1) { - group_results[item.get_group_linear_id()] = - local_mem[item.get_sub_group().get_group_range()[0] - 1]; + const int local_id = item.get_local_linear_id(); + const index_type global_id = item.get_global_linear_id(); - sycl::atomic_ref - 
scratch_flags_ref(*scratch_flags); - num_teams_done[0] = ++scratch_flags_ref; - } - sycl::group_barrier(item.get_group()); - if (num_teams_done[0] == n_wgroups) { - if (local_id == 0) *scratch_flags = 0; - value_type total; - reducer.init(&total); - - for (unsigned int offset = 0; offset < n_wgroups; - offset += wgroup_size) { - index_type id = local_id + offset; - if (id < static_cast(n_wgroups)) - local_value = group_results[id]; - else - reducer.init(&local_value); - workgroup_scan<>( - item, reducer, local_mem, local_value, - std::min(n_wgroups - offset, wgroup_size)); - if (id < static_cast(n_wgroups)) { - reducer.join(&local_value, &total); - group_results[id] = local_value; + // Initialize local memory + value_type local_value; + reducer.init(&local_value); + if (global_id < size) { + if constexpr (std::is_void_v) + functor(global_id + begin, local_value, false); + else + functor(WorkTag(), global_id + begin, local_value, false); } - reducer.join( - &total, - &local_mem[item.get_sub_group().get_group_range()[0] - 1]); - if (offset + wgroup_size < n_wgroups) - sycl::group_barrier(item.get_group()); - } - } - }; - return lambda; - }; + + workgroup_scan<>(item, reducer, local_mem, local_value, + wgroup_size); + + // Write results to global memory + if (global_id < size) global_mem[global_id] = local_value; + + if (local_id == wgroup_size - 1) { + group_results[item.get_group_linear_id()] = + local_mem[item.get_sub_group().get_group_range()[0] - 1]; + + sycl::atomic_ref + scratch_flags_ref(*scratch_flags); + num_teams_done[0] = ++scratch_flags_ref; + } + sycl::group_barrier(item.get_group()); + if (num_teams_done[0] == n_wgroups) { + if (local_id == 0) *scratch_flags = 0; + value_type total; + reducer.init(&total); + + for (unsigned int offset = 0; offset < n_wgroups; + offset += wgroup_size) { + index_type id = local_id + offset; + if (id < static_cast(n_wgroups)) + local_value = group_results[id]; + else + reducer.init(&local_value); + workgroup_scan<>( + 
item, reducer, local_mem, local_value, + std::min(n_wgroups - offset, wgroup_size)); + if (id < static_cast(n_wgroups)) { + reducer.join(&local_value, &total); + group_results[id] = local_value; + } + reducer.join( + &total, + &local_mem[item.get_sub_group().get_group_range()[0] - 1]); + if (offset + wgroup_size < n_wgroups) + sycl::group_barrier(item.get_group()); + } + } + }; + return lambda; + }; size_t wgroup_size; size_t n_wgroups; - sycl_device_ptr global_mem; - sycl_device_ptr group_results; + sycl::global_ptr global_mem; + sycl::global_ptr group_results; desul::ensure_sycl_lock_arrays_on_device(q); @@ -276,9 +278,9 @@ class ParallelScanSYCLBase { // FIXME_SYCL consider only storing one value per block and recreate // initial results in the end before doing the final pass global_mem = - static_cast>(instance.scratch_space( + static_cast>(instance.scratch_space( n_wgroups * (wgroup_size + 1) * sizeof(value_type))); - m_scratch_host = static_cast>( + m_scratch_host = static_cast>( instance.scratch_host(sizeof(value_type))); group_results = global_mem + n_wgroups * wgroup_size; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 16b0b62a757..34069f614b2 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -20,8 +20,7 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n) { - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + Impl::SYCLInternal::default_instance->m_queue.memcpy(dst, src, n); } void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index d10be8135cf..3839aa386aa 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -357,7 
+357,7 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION SYCLTeamMember(sycl::local_ptr shared, const std::size_t shared_begin, const std::size_t shared_size, - sycl_device_ptr scratch_level_1_ptr, + sycl::global_ptr scratch_level_1_ptr, const std::size_t scratch_level_1_size, const sycl::nd_item<2> item, const int arg_league_rank, const int arg_league_size) @@ -611,7 +611,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< * less than N) and a scan operation is performed. The last call to closure has * final == true. */ -// This is the same code as in CUDA and largely the same as in OpenMPTarget +// This is the same code as in CUDA. template KOKKOS_INLINE_FUNCTION void parallel_scan( const Impl::TeamThreadRangeBoundariesStruct& diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp index 23cb58c1ff3..03e1147d24e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp @@ -106,9 +106,22 @@ class Kokkos::Impl::TeamPolicyInternal public: static int scratch_size_max(int level) { - return level == 0 ? 1024 * 32 - : // FIXME_SYCL arbitrarily setting this to 32kB - 20 * 1024 * 1024; // FIXME_SYCL arbitrarily setting this to 20MB + const auto& sycl_instance = *SYCL{}.impl_internal_space_instance(); + // FIXME_SYCL Avoid requesting too many registers on NVIDIA GPUs. +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + const size_t max_possible_team_size = 256; +#else + const size_t max_possible_team_size = sycl_instance.m_maxWorkgroupSize; +#endif + const size_t max_reserved_shared_mem_per_team = + (max_possible_team_size + 2) * sizeof(double); + // arbitrarily setting level 1 scratch limit to 20MB + constexpr size_t max_l1_scratch_size = + static_cast(20) * 1024 * 1024; + + size_t max_shmem = sycl_instance.m_maxShmemPerBlock; + return (level == 0 ? 
max_shmem - max_reserved_shared_mem_per_team + : max_l1_scratch_size); } inline void impl_set_vector_length(size_t size) { m_vector_length = size; } inline void impl_set_team_size(size_t size) { m_team_size = size; } @@ -164,13 +177,13 @@ class Kokkos::Impl::TeamPolicyInternal if (m_team_size * m_vector_length > static_cast( m_space.impl_internal_space_instance()->m_maxWorkgroupSize)) { - Impl::throw_runtime_exception( - std::string("Kokkos::TeamPolicy the team size is too large. " - "Team size x vector length is " + - std::to_string(m_team_size * m_vector_length) + - " but must be smaller than ") + - std::to_string( - m_space.impl_internal_space_instance()->m_maxWorkgroupSize)); + std::stringstream error; + error << "Kokkos::TeamPolicy: Requested too large team size. " + "Requested: " + << m_team_size << ", Maximum: " + << m_space.impl_internal_space_instance()->m_maxWorkgroupSize / + m_vector_length; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); } } diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index 29effd6dbd3..18466a76a55 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -32,18 +32,6 @@ class UniqueToken { explicit UniqueToken(execution_space const& = execution_space()) : m_locks(Kokkos::Impl::sycl_global_unique_token_locks()) {} - KOKKOS_DEFAULTED_FUNCTION - UniqueToken(const UniqueToken&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken(UniqueToken&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken& operator=(const UniqueToken&) = default; - - KOKKOS_DEFAULTED_FUNCTION - UniqueToken& operator=(UniqueToken&&) = default; - /// \brief upper bound for acquired values, i.e. 
0 <= value < size() KOKKOS_INLINE_FUNCTION size_type size() const noexcept { return m_locks.extent(0); } diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp index 46253c18ee5..49f2dfa6e04 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_WorkgroupReduction.hpp @@ -62,7 +62,7 @@ inline constexpr bool use_shuffle_based_algorithm = false; template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - sycl_device_ptr results_ptr, + sycl::global_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const unsigned int value_count_, const ReducerType& final_reducer, bool final, unsigned int max_size) { @@ -133,7 +133,7 @@ std::enable_if_t> workgroup_reduction( template std::enable_if_t> workgroup_reduction( sycl::nd_item& item, sycl::local_accessor local_mem, - ValueType local_value, sycl_device_ptr results_ptr, + ValueType local_value, sycl::global_ptr results_ptr, sycl::global_ptr device_accessible_result_ptr, const ReducerType& final_reducer, bool final, unsigned int max_size) { const auto local_id = item.get_local_linear_id(); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp index f4884aa680d..69171d89c36 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.cpp @@ -13,10 +13,11 @@ import kokkos.core; #endif #include -#include +#include #include #include #include +#include #include #include @@ -30,15 +31,11 @@ namespace Impl { std::vector SerialInternal::all_instances; std::mutex SerialInternal::all_instances_mutex; -bool SerialInternal::is_initialized() { return m_is_initialized; } - -void SerialInternal::initialize() { - if (is_initialized()) return; +HostSharedPtr SerialInternal::default_instance; +SerialInternal::SerialInternal() { 
Impl::SharedAllocationRecord::tracking_enable(); - m_is_initialized = true; - // guard pushing to all_instances { std::scoped_lock lock(all_instances_mutex); @@ -46,7 +43,27 @@ void SerialInternal::initialize() { } } -void SerialInternal::finalize() { +void SerialInternal::fence(const std::string& name) { +#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS + auto fence = []() {}; +#else + auto fence = [this]() { std::lock_guard lock(m_instance_mutex); }; +#endif + if (Kokkos::Tools::profileLibraryLoaded()) { + Kokkos::Tools::Experimental::Impl::profile_fence_event( + name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, + fence); // TODO: correct device ID + } else { + fence(); + } +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS + Kokkos::memory_fence(); +#endif +} + +SerialInternal::~SerialInternal() { + fence("Kokkos::SerialInternal: fence on destruction"); + if (m_thread_team_data.scratch_buffer()) { m_thread_team_data.disband_team(); m_thread_team_data.disband_pool(); @@ -59,8 +76,6 @@ void SerialInternal::finalize() { m_thread_team_data.scratch_assign(nullptr, 0, 0, 0, 0, 0); } - m_is_initialized = false; - // guard erasing from all_instances { std::scoped_lock lock(all_instances_mutex); @@ -73,11 +88,6 @@ void SerialInternal::finalize() { } } -SerialInternal& SerialInternal::singleton() { - static SerialInternal self; - return self; -} - // Resize thread team data scratch memory void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, @@ -145,17 +155,19 @@ void SerialInternal::resize_thread_team_data(size_t pool_reduce_bytes, } } // namespace Impl +Serial::~Serial() { + Impl::check_execution_space_destructor_precondition(name()); +} + Serial::Serial() - : m_space_instance(&Impl::SerialInternal::singleton(), - [](Impl::SerialInternal*) {}) {} + : m_space_instance( + (Impl::check_execution_space_constructor_precondition(name()), + Impl::SerialInternal::default_instance)) {} Serial::Serial(NewInstance) - : m_space_instance(new 
Impl::SerialInternal, [](Impl::SerialInternal* ptr) { - ptr->finalize(); - delete ptr; - }) { - m_space_instance->initialize(); -} + : m_space_instance( + (Impl::check_execution_space_constructor_precondition(name()), + new Impl::SerialInternal)) {} void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { os << "Host Serial Execution Space:\n"; @@ -169,10 +181,13 @@ void Serial::print_configuration(std::ostream& os, bool /*verbose*/) const { } void Serial::impl_initialize(InitializationSettings const&) { - Impl::SerialInternal::singleton().initialize(); + Impl::SerialInternal::default_instance = + Impl::HostSharedPtr(new Impl::SerialInternal); } -void Serial::impl_finalize() { Impl::SerialInternal::singleton().finalize(); } +void Serial::impl_finalize() { + Impl::SerialInternal::default_instance = nullptr; +} const char* Serial::name() { return "Serial"; } diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp index c936480804e..91fc1f664e1 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -36,21 +36,20 @@ namespace Kokkos { namespace Impl { class SerialInternal { public: - SerialInternal() = default; + SerialInternal(); + ~SerialInternal(); - bool is_initialized(); - - void initialize(); - - void finalize(); - - static SerialInternal& singleton(); + SerialInternal(SerialInternal const&) = delete; + SerialInternal& operator=(SerialInternal const&) = delete; std::mutex m_instance_mutex; + static HostSharedPtr default_instance; static std::vector all_instances; static std::mutex all_instances_mutex; + void fence(const std::string& name); + // Resize thread team data scratch memory void resize_thread_team_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, @@ -58,7 +57,6 @@ class SerialInternal { size_t thread_local_bytes); HostThreadTeamData m_thread_team_data; - bool m_is_initialized = false; }; } // namespace Impl @@ -100,29 
+98,18 @@ class Serial { //@} + KOKKOS_DEFAULTED_FUNCTION Serial(const Serial&) = default; + KOKKOS_FUNCTION Serial(Serial&& other) noexcept + : Serial(static_cast(other)) {} + KOKKOS_DEFAULTED_FUNCTION Serial& operator=(const Serial&) = default; + KOKKOS_FUNCTION Serial& operator=(Serial&& other) noexcept { + return *this = static_cast(other); + } + ~Serial(); Serial(); explicit Serial(NewInstance); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - template - KOKKOS_DEPRECATED_WITH_COMMENT( - "Serial execution space should be constructed explicitly.") - Serial(NewInstance) - : Serial(NewInstance{}) {} -#endif - - /// \brief True if and only if this method is being called in a - /// thread-parallel function. - /// - /// For the Serial device, this method always returns false, - /// because parallel_for or parallel_reduce with the Serial device - /// always execute sequentially. - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED inline static int in_parallel() { return false; } -#endif - /// \brief Wait until all dispatched functors complete. /// /// The parallel_for or parallel_reduce dispatch of a functor may @@ -158,32 +145,11 @@ class Serial { void fence(const std::string& name = "Kokkos::Serial::fence: Unnamed Instance Fence") const { -#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS - auto fence = []() {}; -#else - auto fence = [this]() { - auto* internal_instance = this->impl_internal_space_instance(); - std::lock_guard lock(internal_instance->m_instance_mutex); - }; -#endif - if (Kokkos::Tools::profileLibraryLoaded()) { - Kokkos::Tools::Experimental::Impl::profile_fence_event( - name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{1}, - fence); // TODO: correct device ID - } else { - fence(); - } -#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS - Kokkos::memory_fence(); -#endif + this->impl_internal_space_instance()->fence(name); } /** \brief Return the maximum amount of concurrency. 
*/ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency() { return 1; } -#else int concurrency() const { return 1; } -#endif //! Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index 2bdb61649c2..d5f5946f293 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -109,8 +109,13 @@ class TeamPolicyInternal m_league_size(league_size_request), m_chunk_size(32), m_space(space) { - if (team_size_request > 1) - Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); + if (team_size_request > 1) { + std::stringstream error; + error << "Kokkos::TeamPolicy: Requested too large team size. " + "Requested: " + << team_size_request << ", Maximum: 1"; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } } TeamPolicyInternal(const execution_space& space, int league_size_request, @@ -264,7 +269,30 @@ class ParallelFor, m_policy(arg_policy), m_league(m_policy.league_size()), m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + - FunctorTeamShmemSize::value(m_functor, 1)) {} + FunctorTeamShmemSize::value( + m_functor, m_policy.team_size())) { + if ((m_policy.scratch_size(0) + FunctorTeamShmemSize::value( + m_functor, m_policy.team_size())) > + static_cast(TeamPolicy::scratch_size_max(0))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch " + "memory on level 0. 
Requested: " + << m_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + m_functor, m_policy.team_size()) + << ", Maximum: " << TeamPolicy::scratch_size_max(0); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + if (m_policy.scratch_size(1) > + static_cast(TeamPolicy::scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch " + "memory on level 1. Requested: " + << m_policy.scratch_size(1) + << ", Maximum: " << TeamPolicy::scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + } }; /*--------------------------------------------------------------------------*/ @@ -357,7 +385,7 @@ class ParallelReduce::value( - m_functor_reducer.get_functor(), 1)) { + m_functor_reducer.get_functor(), m_policy.team_size())) { static_assert(Kokkos::is_view::value, "Reduction result on Kokkos::Serial must be a Kokkos::View"); @@ -366,6 +394,28 @@ class ParallelReduce::accessible, "Kokkos::Serial reduce result must be a View accessible from " "HostSpace"); + if ((arg_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + m_functor_reducer.get_functor(), arg_policy.team_size())) > + static_cast(TeamPolicy::scratch_size_max(0))) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too much scratch " + "memory on level 0. Requested: " + << arg_policy.scratch_size(0) + + FunctorTeamShmemSize::value( + m_functor_reducer.get_functor(), arg_policy.team_size()) + << ", Maximum: " << TeamPolicy::scratch_size_max(0); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + if (arg_policy.scratch_size(1) > + static_cast(TeamPolicy::scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too much scratch " + "memory on level 1. 
Requested: " + << arg_policy.scratch_size(1) + << ", Maximum: " << TeamPolicy::scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } } }; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads.hpp index 7311c7a8aae..e9ff3b828f5 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads.hpp @@ -20,8 +20,9 @@ static_assert(false, #include #include #include -#include +#include #include +#include /*--------------------------------------------------------------------------*/ @@ -49,12 +50,6 @@ class Threads { //! \name Static functions that all Kokkos devices must implement. //@{ - /// \brief True if and only if this method is being called in a - /// thread-parallel function. -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static int in_parallel(); -#endif - /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; @@ -70,11 +65,7 @@ class Threads { "Kokkos::Threads::fence: Unnamed Instance Fence") const; /** \brief Return the maximum amount of concurrency. */ -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency(); -#else int concurrency() const; -#endif /// \brief Free any resources being consumed by the device. 
/// @@ -110,6 +101,18 @@ class Threads { uint32_t impl_instance_id() const noexcept { return 1; } + KOKKOS_DEFAULTED_FUNCTION Threads(const Threads&) = default; + KOKKOS_FUNCTION Threads(Threads&& other) noexcept + : Threads(static_cast(other)) {} + KOKKOS_DEFAULTED_FUNCTION Threads& operator=(const Threads&) = default; + KOKKOS_FUNCTION Threads& operator=(Threads&& other) noexcept { + return *this = static_cast(other); + } + + ~Threads() { Impl::check_execution_space_destructor_precondition(name()); } + + Threads() { Impl::check_execution_space_constructor_precondition(name()); } + static const char* name(); //@} //---------------------------------------- diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index 64f36b658d1..703792db7c9 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -167,7 +167,8 @@ ThreadsInternal::~ThreadsInternal() { const unsigned entry = m_pool_size - (m_pool_rank + 1); if (m_scratch) { - Kokkos::kokkos_free(m_scratch); + Kokkos::HostSpace{}.deallocate("Kokkos::thread_scratch", m_scratch, + m_scratch_thread_end); m_scratch = nullptr; } @@ -235,15 +236,6 @@ void ThreadsInternal::verify_is_process(const std::string &name, } } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -KOKKOS_DEPRECATED int ThreadsInternal::in_parallel() { - // A thread function is in execution and - // the function argument is not the special threads process argument and - // the master process is a worker or is not the master process. 
- return s_current_function && (&s_threads_process != s_current_function_arg) && - (s_threads_process.m_pool_base || !is_process()); -} -#endif void ThreadsInternal::fence() { fence("Kokkos::ThreadsInternal::fence: Unnamed Instance Fence"); } @@ -306,7 +298,8 @@ void ThreadsInternal::execute_resize_scratch_in_serial() { auto deallocate_scratch_memory = [](ThreadsInternal &exec) { if (exec.m_scratch) { - Kokkos::kokkos_free(exec.m_scratch); + Kokkos::HostSpace{}.deallocate("Kokkos::thread_scratch", exec.m_scratch, + exec.m_scratch_thread_end); exec.m_scratch = nullptr; } }; @@ -358,8 +351,8 @@ void ThreadsInternal::first_touch_allocate_thread_private_scratch( if (s_threads_process.m_scratch_thread_end) { // Allocate tracked memory: { - exec.m_scratch = Kokkos::kokkos_malloc( - "Kokkos::thread_scratch", s_threads_process.m_scratch_thread_end); + exec.m_scratch = Kokkos::HostSpace{}.allocate("Kokkos::thread_scratch", + exec.m_scratch_thread_end); } unsigned *ptr = reinterpret_cast(exec.m_scratch); @@ -617,7 +610,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { void ThreadsInternal::finalize() { verify_is_process("ThreadsInternal::finalize", false); - fence(); + fence("Kokkos::ThreadsInternal::finalize: fence on destruction"); resize_scratch(0, 0); @@ -666,11 +659,7 @@ void ThreadsInternal::finalize() { namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -int Threads::concurrency() { return impl_thread_pool_size(0); } -#else int Threads::concurrency() const { return impl_thread_pool_size(0); } -#endif void Threads::fence(const std::string &name) const { Impl::ThreadsInternal::fence(name); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index a7193257084..81f5b8d74e7 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -66,9 +66,6 @@ class ThreadsInternal { static void 
first_touch_allocate_thread_private_scratch(ThreadsInternal &, const void *); - ThreadsInternal(const ThreadsInternal &); - ThreadsInternal &operator=(const ThreadsInternal &); - static void execute_resize_scratch_in_serial(); public: @@ -90,6 +87,8 @@ class ThreadsInternal { static void driver(void); + ThreadsInternal(const ThreadsInternal &) = delete; + ThreadsInternal &operator=(const ThreadsInternal &) = delete; ~ThreadsInternal(); ThreadsInternal(); @@ -389,9 +388,6 @@ class ThreadsInternal { */ static void start(void (*)(ThreadsInternal &, const void *), const void *); -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int in_parallel(); -#endif static void fence(); static void fence(const std::string &); static void internal_fence(); @@ -533,12 +529,6 @@ class ThreadsInternal { namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -KOKKOS_DEPRECATED inline int Threads::in_parallel() { - return Impl::ThreadsInternal::in_parallel(); -} -#endif - inline void Threads::impl_initialize(InitializationSettings const &settings) { Impl::ThreadsInternal::initialize( settings.has_num_threads() ? settings.get_num_threads() : -1); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index d527b9f2248..d6331e7d355 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -100,7 +100,30 @@ class ParallelFor, m_policy(fix_policy(arg_policy)), m_shared(m_policy.scratch_size(0) + m_policy.scratch_size(1) + FunctorTeamShmemSize::value( - m_functor, m_policy.team_size())) {} + m_functor, m_policy.team_size())) { + if ((m_policy.scratch_size(0, m_policy.team_size()) + + FunctorTeamShmemSize::value(m_functor, + m_policy.team_size())) > + static_cast(m_policy.scratch_size_max(0))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch " + "memory on level 0. 
Requested: " + << m_policy.scratch_size(0, m_policy.team_size()) + + FunctorTeamShmemSize::value( + m_functor, m_policy.team_size()) + << ", Maximum: " << m_policy.scratch_size_max(0); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + if (m_policy.scratch_size(1, m_policy.team_size()) > + static_cast(m_policy.scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_for: Requested too much scratch " + "memory on level 1. Requested: " + << m_policy.scratch_size(1, m_policy.team_size()) + << ", Maximum: " << m_policy.scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + } }; } // namespace Impl diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index dde7fd68540..109eb4100af 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -119,6 +119,29 @@ class ParallelReduce::accessible, "Kokkos::Threads reduce result must be a View accessible from " "HostSpace"); + + if ((arg_policy.scratch_size(0, m_policy.team_size()) + + FunctorTeamShmemSize::value( + m_functor_reducer.get_functor(), arg_policy.team_size())) > + static_cast(m_policy.scratch_size_max(0))) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too much scratch " + "memory on level 0. Requested: " + << arg_policy.scratch_size(0, m_policy.team_size()) + + FunctorTeamShmemSize::value( + m_functor_reducer.get_functor(), arg_policy.team_size()) + << ", Maximum: " << m_policy.scratch_size_max(0); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } + if (arg_policy.scratch_size(1, m_policy.team_size()) > + static_cast(m_policy.scratch_size_max(1))) { + std::stringstream error; + error << "Kokkos::parallel_reduce: Requested too much scratch " + "memory on level 1. 
Requested: " + << arg_policy.scratch_size(1, m_policy.team_size()) + << ", Maximum: " << m_policy.scratch_size_max(1); + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } } }; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index 51d56158acd..ab1f4825305 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -63,36 +64,43 @@ class ThreadsExecTeamMember { } public: + // clang-format off + // Fan-in and wait until the matching fan-out is called. // The root thread which does not wait will return true. // All other threads will return false during the fan-out. KOKKOS_INLINE_FUNCTION bool team_fan_in() const { - int n, j; + KOKKOS_IF_ON_HOST(( + int n, j; - // Wait for fan-in threads - for (n = 1; - (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); - n <<= 1) { - spinwait_while_equal(m_team_base[j]->state(), ThreadState::Active); - } + // Wait for fan-in threads + for (n = 1; + (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); + n <<= 1) { + spinwait_while_equal(m_team_base[j]->state(), ThreadState::Active); + } - // If not root then wait for release - if (m_team_rank_rev) { - m_instance->state() = ThreadState::Rendezvous; - spinwait_while_equal(m_instance->state(), ThreadState::Rendezvous); - } + // If not root then wait for release + if (m_team_rank_rev) { + m_instance->state() = ThreadState::Rendezvous; + spinwait_while_equal(m_instance->state(), ThreadState::Rendezvous); + } + )) return !m_team_rank_rev; } KOKKOS_INLINE_FUNCTION void team_fan_out() const { - int n, j; - for (n = 1; - (!(m_team_rank_rev & n)) && ((j = m_team_rank_rev + n) < m_team_size); - n <<= 1) { - m_team_base[j]->state() = ThreadState::Active; - } + KOKKOS_IF_ON_HOST(( + int n, j; + for (n = 1; + (!(m_team_rank_rev & n)) && ((j = 
m_team_rank_rev + n) < m_team_size); + n <<= 1) { + m_team_base[j]->state() = ThreadState::Active; + } + )) } + // clang-format on public: KOKKOS_INLINE_FUNCTION static int team_reduce_size() { @@ -527,6 +535,8 @@ class TeamPolicyInternal bool m_tune_team_size; bool m_tune_vector_length; + Threads m_space; + inline void init(const int league_size_request, const int team_size_request) { const int pool_size = traits::execution_space::impl_thread_pool_size(0); const int max_host_team_size = Impl::HostThreadTeamData::max_team_members; @@ -536,8 +546,13 @@ class TeamPolicyInternal m_league_size = league_size_request; - if (team_size_request > team_max) - Kokkos::abort("Kokkos::abort: Requested Team Size is too large!"); + if (team_size_request > team_max) { + std::stringstream error; + error << "Kokkos::TeamPolicy: Requested too large team size. " + "Requested: " + << team_size_request << ", Maximum: " << team_max; + Kokkos::Impl::throw_runtime_exception(error.str().c_str()); + } m_team_size = team_size_request < team_max ? team_size_request : team_max; @@ -563,10 +578,7 @@ class TeamPolicyInternal using traits = PolicyTraits; - const typename traits::execution_space& space() const { - static typename traits::execution_space m_space; - return m_space; - } + const typename traits::execution_space& space() const { return m_space; } template friend class TeamPolicyInternal; @@ -585,6 +597,7 @@ class TeamPolicyInternal m_chunk_size = p.m_chunk_size; m_tune_team_size = p.m_tune_team_size; m_tune_vector_length = p.m_tune_vector_length; + m_space = p.m_space; } //---------------------------------------- diff --git a/lib/kokkos/core/src/View/Kokkos_BasicView.hpp b/lib/kokkos/core/src/View/Kokkos_BasicView.hpp index a67ce315d35..65c659364e8 100644 --- a/lib/kokkos/core/src/View/Kokkos_BasicView.hpp +++ b/lib/kokkos/core/src/View/Kokkos_BasicView.hpp @@ -597,7 +597,8 @@ class BasicView { // Explicit cast is needed because submdspan_mapping may return a different // layout type. 
using sub_accessor_t = typename OtherAccessorPolicy::offset_policy; - m_ptr = src_view.m_acc.offset(src_view.m_ptr, sub_mapping_result.offset); + m_ptr = static_cast( + src_view.m_acc.offset(src_view.m_ptr, sub_mapping_result.offset)); m_map = mapping_type(sub_mapping_result.mapping); m_acc = sub_accessor_t(src_view.m_acc); diff --git a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp index 3814888ee1c..e21188fabf2 100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -11,6 +11,7 @@ static_assert(false, #define KOKKOS_VIEW_ALLOC_HPP #include +#include #include #include #include @@ -59,9 +60,7 @@ struct ViewValueFunctor { #endif } - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; + ViewValueFunctor() = default; ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, size_t const arg_n, std::string arg_name) @@ -100,9 +99,9 @@ struct ViewValueFunctor { } #ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); + if constexpr (std::is_same::value) { + Kokkos::Impl::cuda_prefetch_pointer(space.cuda_stream(), ptr, + sizeof(ValueType) * n, true); } #endif const Kokkos::Impl::ParallelFor closure( @@ -169,7 +168,7 @@ struct ViewValueFunctor { // when the function is queried with cudaFuncGetAttributes void functor_instantiate_workaround() { #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) || \ - defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_OPENMPTARGET) + defined(KOKKOS_ENABLE_SYCL) if (false) { parallel_for_implementation(); } diff --git a/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp index 23e7fbaf33c..bfd8fc10d2e 100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp +++ 
b/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp @@ -4,6 +4,8 @@ #ifndef KOKKOS_EXPERIMENTAL_IMPL_VIEW_CTOR_PROP_HPP #define KOKKOS_EXPERIMENTAL_IMPL_VIEW_CTOR_PROP_HPP +#include "impl/Kokkos_Traits.hpp" + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -57,6 +59,9 @@ struct is_view_label : public std::true_type {}; template constexpr bool is_view_label_v = is_view_label::value; +template +concept ViewLabel = is_view_label_v; + //---------------------------------------------------------------------------- template diff --git a/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp b/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp index bcafe936902..d17f76986c0 100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp @@ -47,9 +47,7 @@ struct rank_dynamic { static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? 
V : 1); \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ }; \ template \ constexpr size_t ViewDimension##R::ArgN##R; \ @@ -59,9 +57,7 @@ struct rank_dynamic { struct ViewDimension##R<0u, RD> { \ static constexpr size_t ArgN##R = 0; \ std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ - ViewDimension##R& operator=(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ }; \ template \ @@ -136,9 +132,7 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension static constexpr unsigned rank = sizeof...(Vals); static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; - ViewDimension& operator=(const ViewDimension&) = default; + ViewDimension() = default; KOKKOS_INLINE_FUNCTION constexpr ViewDimension(size_t n0, size_t n1, size_t n2, size_t n3, size_t n4, diff --git a/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp b/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp index 97c8bddb646..ac5ab32af19 100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp @@ -338,10 +338,6 @@ class View : public ViewTraits { static constexpr Impl::integral_constant rank_dynamic = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = - map_type::Rank}; -#endif template KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, @@ -908,8 +904,11 @@ class View : public ViewTraits { // FIXME_NVCC: nvcc 12.2 and 12.3 view these as ambiguous even though they have // exclusive requirements clauses. 
12.6 Also has some issues though it manifests -// differently -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_COMPILER_NVHPC) +// differently. Clang with CUDA also had segfaults in CI +// Define the workaround here since this condition will be re-used. +// We undef KOKKOS_IMPL_VIEW_HOOKS_NVCC_WORKAROUND later. +#if defined(KOKKOS_COMPILER_NVCC) || defined(KOKKOS_COMPILER_NVHPC) || \ + (defined(KOKKOS_COMPILER_CLANG) && defined(KOKKOS_ENABLE_CUDA)) #define KOKKOS_IMPL_VIEW_HOOKS_NVCC_WORKAROUND 1 #endif #ifdef KOKKOS_IMPL_VIEW_HOOKS_NVCC_WORKAROUND @@ -1535,22 +1534,6 @@ KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { Args...>::type(src, args...); } -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, - Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - static_assert(Kokkos::is_memory_traits::value); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} -#endif - template using Subview = decltype(subview(std::declval(), std::declval()...)); diff --git a/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp index 26f029af130..9052148f01b 100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp @@ -674,21 +674,7 @@ struct ViewOffset< //---------------------------------------- - // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions - // correct and errors out during compilation. Same for the other places where - // I changed this. 
-#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() : m_dim(dimension_type()) {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { m_dim = src.m_dim; } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - return *this; - } -#else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; - ViewOffset& operator=(const ViewOffset&) = default; -#endif + ViewOffset() = default; template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1025,26 +1011,7 @@ struct ViewOffset< }; public: - // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions - // correct and errors out during compilation. Same for the other places where - // I changed this. -#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() : m_dim(dimension_type()), m_stride(0) {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { - m_dim = src.m_dim; - m_stride = src.m_stride; - } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - m_stride = src.m_stride; - return *this; - } -#else - - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; - ViewOffset& operator=(const ViewOffset&) = default; -#endif + ViewOffset() = default; /* Enable padding for trivial scalar types with non-zero trivial scalar size */ @@ -1367,24 +1334,7 @@ struct ViewOffset< s[dimension_type::rank] = stride_fill(s); } - //---------------------------------------- - // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions - // correct and errors out during compilation. Same for the other places where - // I changed this. 
- -#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() : m_dim(dimension_type()) {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { m_dim = src.m_dim; } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - return *this; - } -#else - - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; - ViewOffset& operator=(const ViewOffset&) = default; -#endif + ViewOffset() = default; template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1711,27 +1661,7 @@ struct ViewOffset< }; public: - // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions - // correct and errors out during compilation. Same for the other places where - // I changed this. - -#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() : m_dim(dimension_type()), m_stride(0) {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { - m_dim = src.m_dim; - m_stride = src.m_stride; - } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - m_stride = src.m_stride; - return *this; - } -#else - - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; - ViewOffset& operator=(const ViewOffset&) = default; -#endif + ViewOffset() = default; /* Enable padding for trivial scalar types with non-zero trivial scalar size. 
*/ @@ -1853,9 +1783,7 @@ struct ViewStride<0> { static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t, size_t, size_t, size_t, size_t, size_t, size_t, @@ -1868,9 +1796,7 @@ struct ViewStride<1> { static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t aS0, size_t, size_t, size_t, size_t, size_t, @@ -1883,9 +1809,7 @@ struct ViewStride<2> { size_t S0, S1; static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t aS0, size_t aS1, size_t, size_t, size_t, size_t, @@ -1898,9 +1822,7 @@ struct ViewStride<3> { size_t S0, S1, S2; static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t, size_t, @@ -1913,9 +1835,7 @@ struct ViewStride<4> { size_t S0, S1, S2, S3; static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, size_t, @@ -1928,9 +1848,7 @@ struct ViewStride<5> { size_t S0, S1, S2, S3, S4; static constexpr size_t S5 = 0, S6 = 0, S7 = 
0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, @@ -1943,9 +1861,7 @@ struct ViewStride<6> { size_t S0, S1, S2, S3, S4, S5; static constexpr size_t S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, @@ -1958,9 +1874,7 @@ struct ViewStride<7> { size_t S0, S1, S2, S3, S4, S5, S6; static constexpr size_t S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, @@ -1972,9 +1886,7 @@ template <> struct ViewStride<8> { size_t S0, S1, S2, S3, S4, S5, S6, S7; - ViewStride() = default; - ViewStride(const ViewStride&) = default; - ViewStride& operator=(const ViewStride&) = default; + ViewStride() = default; KOKKOS_INLINE_FUNCTION constexpr ViewStride(size_t aS0, size_t aS1, size_t aS2, size_t aS3, @@ -2231,29 +2143,7 @@ struct ViewOffset { s[dimension_type::rank] = stride_fill(s); } - //---------------------------------------- - // MSVC (16.5.5) + CUDA (10.2) did not generate the defaulted functions - // correct and errors out during compilation. Same for the other places where - // I changed this. 
- -#ifdef KOKKOS_IMPL_WINDOWS_CUDA - KOKKOS_FUNCTION ViewOffset() - : m_dim(dimension_type()), m_stride(stride_type()) {} - KOKKOS_FUNCTION ViewOffset(const ViewOffset& src) { - m_dim = src.m_dim; - m_stride = src.m_stride; - } - KOKKOS_FUNCTION ViewOffset& operator=(const ViewOffset& src) { - m_dim = src.m_dim; - m_stride = src.m_stride; - return *this; - } -#else - - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; - ViewOffset& operator=(const ViewOffset&) = default; -#endif + ViewOffset() = default; KOKKOS_INLINE_FUNCTION constexpr ViewOffset(std::integral_constant const&, @@ -2732,16 +2622,8 @@ class ViewMapping< //---------------------------------------- - KOKKOS_DEFAULTED_FUNCTION ~ViewMapping() = default; KOKKOS_INLINE_FUNCTION ViewMapping() : m_impl_handle(), m_impl_offset() {} - KOKKOS_DEFAULTED_FUNCTION ViewMapping(const ViewMapping&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(const ViewMapping&) = - default; - - KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(ViewMapping&&) = default; - //---------------------------------------- /**\brief Span, in bytes, of the required memory */ diff --git a/lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp b/lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp index 2ae4f655eb2..0de7b99469a 100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp @@ -36,6 +36,10 @@ struct ViewTracker { ViewTracker(const ViewTracker& vt) noexcept : m_tracker(vt.m_tracker, !view_traits::memory_traits::is_unmanaged) {} + KOKKOS_INLINE_FUNCTION + ViewTracker(ViewTracker&& vt) noexcept + : m_tracker(vt.m_tracker, !view_traits::memory_traits::is_unmanaged) {} + KOKKOS_INLINE_FUNCTION explicit ViewTracker(const ParentView& vt) noexcept : m_tracker() { assign(vt); @@ -48,6 +52,8 @@ struct ViewTracker { assign(vt); } + ~ViewTracker() = default; + template KOKKOS_INLINE_FUNCTION void 
assign(const View& vt) { if (this == reinterpret_cast(&vt.m_track)) return; @@ -74,6 +80,11 @@ struct ViewTracker { return *this; } + // NOLINTNEXTLINE(bugprone-exception-escape) + KOKKOS_INLINE_FUNCTION ViewTracker& operator=(ViewTracker&& rhs) { + return *this = static_cast(rhs); + } + KOKKOS_INLINE_FUNCTION explicit ViewTracker(const track_type& tt) noexcept : m_tracker(tt, !view_traits::memory_traits::is_unmanaged) {} diff --git a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp index 86033447e88..37265edaf52 100644 --- a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp +++ b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Header.hpp @@ -17,11 +17,6 @@ static_assert(false, #include -#ifdef KOKKOS_ENABLE_OPENMPTARGET -#include -#include -#endif - namespace Kokkos { namespace detail { template @@ -47,21 +42,6 @@ struct index_pair_like, IndexType> { } // namespace detail } // namespace Kokkos -// FIXME_OPENMPTARGET We need to inject our own error handler as the default -// mdspan one cannot be called from device code -#ifdef KOKKOS_ENABLE_OPENMPTARGET -namespace Kokkos::detail { -KOKKOS_INLINE_FUNCTION void openmptarget_precondition_handler(const char *cond, - const char *file, - unsigned line) { - ::printf("%s:%u: precondition failure: `%s`\n", file, line, cond); - assert(0); -} -} // namespace Kokkos::detail -#define MDSPAN_IMPL_PRECONDITION_VIOLATION_HANDLER(cond, file, line) \ - Kokkos::detail::openmptarget_precondition_handler(cond, file, line) -#endif - #include #endif // KOKKOS_EXPERIMENTAL_MDSPAN_HPP diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp index 1f763c4996d..a0b223f2795 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_HIP.hpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git 
a/lib/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp deleted file mode 100644 index 61f6aafe783..00000000000 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_OPENMPTARGET.hpp +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_DECLARE_OPENMPTARGET_HPP -#define KOKKOS_DECLARE_OPENMPTARGET_HPP - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif - -#endif diff --git a/lib/kokkos/core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp b/lib/kokkos/core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp deleted file mode 100644 index 3c611205544..00000000000 --- a/lib/kokkos/core/src/fwd/Kokkos_Fwd_OPENMPTARGET.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project - -#ifndef KOKKOS_OPENMPTARGET_FWD_HPP_ -#define KOKKOS_OPENMPTARGET_FWD_HPP_ - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) -namespace Kokkos { -namespace Experimental { -class OpenMPTarget; ///< OpenMPTarget execution space. 
-class OpenMPTargetSpace; -} // namespace Experimental -} // namespace Kokkos -#endif -#endif diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index fa7595a82e5..8c941157caf 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -21,82 +21,6 @@ namespace Kokkos { namespace Impl { -// Temporary, for testing new loop macros -#define KOKKOS_ENABLE_NEW_LOOP_MACROS 1 - -#define KOKKOS_IMPL_LOOP_1L(type, tile) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = 0; i0 < static_cast(tile[0]); ++i0) - -#define KOKKOS_IMPL_LOOP_2L(type, tile) \ - for (type i1 = 0; i1 < static_cast(tile[1]); ++i1) \ - KOKKOS_IMPL_LOOP_1L(type, tile) - -#define KOKKOS_IMPL_LOOP_3L(type, tile) \ - for (type i2 = 0; i2 < static_cast(tile[2]); ++i2) \ - KOKKOS_IMPL_LOOP_2L(type, tile) - -#define KOKKOS_IMPL_LOOP_4L(type, tile) \ - for (type i3 = 0; i3 < static_cast(tile[3]); ++i3) \ - KOKKOS_IMPL_LOOP_3L(type, tile) - -#define KOKKOS_IMPL_LOOP_5L(type, tile) \ - for (type i4 = 0; i4 < static_cast(tile[4]); ++i4) \ - KOKKOS_IMPL_LOOP_4L(type, tile) - -#define KOKKOS_IMPL_LOOP_6L(type, tile) \ - for (type i5 = 0; i5 < static_cast(tile[5]); ++i5) \ - KOKKOS_IMPL_LOOP_5L(type, tile) - -#define KOKKOS_IMPL_LOOP_7L(type, tile) \ - for (type i6 = 0; i6 < static_cast(tile[6]); ++i6) \ - KOKKOS_IMPL_LOOP_6L(type, tile) - -#define KOKKOS_IMPL_LOOP_8L(type, tile) \ - for (type i7 = 0; i7 < static_cast(tile[7]); ++i7) \ - KOKKOS_IMPL_LOOP_7L(type, tile) - -#define KOKKOS_IMPL_LOOP_1R(type, tile) \ - KOKKOS_ENABLE_IVDEP_MDRANGE \ - for (type i0 = 0; i0 < static_cast(tile[0]); ++i0) - -#define KOKKOS_IMPL_LOOP_2R(type, tile) \ - KOKKOS_IMPL_LOOP_1R(type, tile) \ - for (type i1 = 0; i1 < static_cast(tile[1]); ++i1) - -#define KOKKOS_IMPL_LOOP_3R(type, tile) \ - KOKKOS_IMPL_LOOP_2R(type, tile) \ - for (type i2 = 0; i2 < static_cast(tile[2]); ++i2) - -#define 
KOKKOS_IMPL_LOOP_4R(type, tile) \ - KOKKOS_IMPL_LOOP_3R(type, tile) \ - for (type i3 = 0; i3 < static_cast(tile[3]); ++i3) - -#define KOKKOS_IMPL_LOOP_5R(type, tile) \ - KOKKOS_IMPL_LOOP_4R(type, tile) \ - for (type i4 = 0; i4 < static_cast(tile[4]); ++i4) - -#define KOKKOS_IMPL_LOOP_6R(type, tile) \ - KOKKOS_IMPL_LOOP_5R(type, tile) \ - for (type i5 = 0; i5 < static_cast(tile[5]); ++i5) - -#define KOKKOS_IMPL_LOOP_7R(type, tile) \ - KOKKOS_IMPL_LOOP_6R(type, tile) \ - for (type i6 = 0; i6 < static_cast(tile[6]); ++i6) - -#define KOKKOS_IMPL_LOOP_8R(type, tile) \ - KOKKOS_IMPL_LOOP_7R(type, tile) \ - for (type i7 = 0; i7 < static_cast(tile[7]); ++i7) - -#define KOKKOS_IMPL_LOOP_ARGS_1 i0 + m_offset[0] -#define KOKKOS_IMPL_LOOP_ARGS_2 KOKKOS_IMPL_LOOP_ARGS_1, i1 + m_offset[1] -#define KOKKOS_IMPL_LOOP_ARGS_3 KOKKOS_IMPL_LOOP_ARGS_2, i2 + m_offset[2] -#define KOKKOS_IMPL_LOOP_ARGS_4 KOKKOS_IMPL_LOOP_ARGS_3, i3 + m_offset[3] -#define KOKKOS_IMPL_LOOP_ARGS_5 KOKKOS_IMPL_LOOP_ARGS_4, i4 + m_offset[4] -#define KOKKOS_IMPL_LOOP_ARGS_6 KOKKOS_IMPL_LOOP_ARGS_5, i5 + m_offset[5] -#define KOKKOS_IMPL_LOOP_ARGS_7 KOKKOS_IMPL_LOOP_ARGS_6, i6 + m_offset[6] -#define KOKKOS_IMPL_LOOP_ARGS_8 KOKKOS_IMPL_LOOP_ARGS_7, i7 + m_offset[7] - // New Loop Macros... // parallel_for, non-tagged #define KOKKOS_IMPL_APPLY(func, ...) func(__VA_ARGS__); @@ -1639,7 +1563,6 @@ struct HostIterateTile inline void operator()(IType tile_idx) const { point_type m_offset; @@ -1668,280 +1591,74 @@ struct HostIterateTile - inline void operator()(IType tile_idx) const { - operator_impl(tile_idx, RankTag()); + template + std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void_v), void> + apply(Args&&... 
args) const { + m_func(args...); } - // added due to compiler error when using sfinae to choose operator based on - // rank w/ cuda+serial - - template - inline void operator_impl(IType tile_idx, const RankTag<2>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } - } // end Iterate::Right - - } // end op() rank == 2 - - template - inline void operator_impl(IType tile_idx, const RankTag<3>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // 
partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } - } // end Iterate::Right - - } // end op() rank == 3 - template - inline void operator_impl(IType tile_idx, const RankTag<4>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } - } // end Iterate::Right - - } // end op() rank == 4 - - template - inline void operator_impl(IType tile_idx, const RankTag<5>) const { - point_type 
m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + template + std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void_v), void> + apply(Args&&... args) const { + m_func(m_tag, args...); + } - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } - } // end Iterate::Right + RP const m_rp; + Functor const m_func; + std::conditional_t, int, Tag> m_tag{}; +}; - } // end op() rank == 5 +// For ParallelReduce +// ValueType - scalar: For reductions +template +struct HostIterateTile && + !std::is_array_v>> { + using index_type = typename RP::index_type; + using point_type = typename RP::point_type; - template - inline void operator_impl(IType tile_idx, const RankTag<6>) const { - point_type m_offset; - point_type m_tiledims; + using value_type = ValueType; - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = 
RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } + inline HostIterateTile(RP const& rp, Functor const& func) + : m_rp(rp) // Cuda 7.0 does not like braces... + , + m_func(func) { + // Errors due to braces rather than parenthesis for init (with cuda 7.0) + // /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: + // error: too many braces around initializer for ‘int’ [-fpermissive] + // /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: + // error: aggregate value used where an integer was expected + } - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); + inline bool check_iteration_bounds(point_type& partial_tile, + point_type& offset) const { + bool is_full_tile = true; - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } + for (int i = 0; i < RP::rank; ++i) { + if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { + partial_tile[i] = m_rp.m_tile[i]; } else { - // #pragma simd - KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } - } // end Iterate::Right - - } // end op() rank == 6 - - template - inline void operator_impl(IType tile_idx, const RankTag<7>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int 
i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; + is_full_tile = false; + partial_tile[i] = + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } - } // end Iterate::Right + return is_full_tile; + } // end check bounds - } // end op() rank == 7 + template + struct RankTag { + using type = RankTag; + enum { value = (int)Rank }; + }; template - inline void operator_impl(IType tile_idx, const RankTag<8>) const { + inline void operator()(IType tile_idx, value_type& val) const { point_type m_offset; point_type m_tiledims; @@ -1963,38 +1680,9 @@ struct HostIterateTile - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void_v), void> - apply(Args&&... args) const { - m_func(args...); - } - - template - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void_v), void> - apply(Args&&... 
args) const { - m_func(m_tag, args...); + Tile_Loop_Type::apply(val, m_func.get_functor(), full_tile, m_offset, + m_rp.m_tile, m_tiledims); } RP const m_rp; @@ -2003,26 +1691,24 @@ struct HostIterateTile struct HostIterateTile && - !std::is_array_v>> { + std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; - using value_type = ValueType; + using value_type = + std::remove_extent_t; // strip away the + // 'array-ness' [], only + // underlying type remains inline HostIterateTile(RP const& rp, Functor const& func) : m_rp(rp) // Cuda 7.0 does not like braces... , - m_func(func) { - // Errors due to braces rather than parenthesis for init (with cuda 7.0) - // /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: - // error: too many braces around initializer for ‘int’ [-fpermissive] - // /home/ndellin/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp:1216:98: - // error: aggregate value used where an integer was expected - } + m_func(func) {} inline bool check_iteration_bounds(point_type& partial_tile, point_type& offset) const { @@ -2051,421 +1737,8 @@ struct HostIterateTile - inline void operator()(IType tile_idx, value_type& val) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - Tile_Loop_Type::apply(val, m_func.get_functor(), full_tile, m_offset, - m_rp.m_tile, m_tiledims); - } - -#else - template - inline void 
operator()(IType tile_idx) const { - operator_impl(tile_idx, RankTag()); - } - // added due to compiler error when using sfinae to choose operator based on - // rank - - template - inline void operator_impl(IType tile_idx, const RankTag<2>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } - } // end Iterate::Right - - } // end op() rank == 2 - - template - inline void operator_impl(IType tile_idx, const RankTag<3>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, 
replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } - } // end Iterate::Right - - } // end op() rank == 3 - - template - inline void operator_impl(IType tile_idx, const RankTag<4>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } - } // end Iterate::Right - - } // end op() rank == 4 - - template - inline void operator_impl(IType tile_idx, const 
RankTag<5>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } - } // end Iterate::Right - - } // end op() rank == 5 - - template - inline void operator_impl(IType tile_idx, const RankTag<6>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - 
KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } - } // end Iterate::Right - - } // end op() rank == 6 - - template - inline void operator_impl(IType tile_idx, const RankTag<7>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } - } // end Iterate::Right - - } // end op() rank == 7 - - template - inline void operator_impl(IType tile_idx, const RankTag<8>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * 
m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } - } - } // end Iterate::Right - - } // end op() rank == 8 - - template - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), - void> - apply(Args&&... args) const { - m_func(args..., m_v); - } - - template - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), - void> - apply(Args&&... args) const { - m_func(m_tag, args..., m_v); - } -#endif - - RP const m_rp; - Functor const m_func; - std::conditional_t, int, Tag> m_tag{}; -}; - -// For ParallelReduce -// Extra specialization for array reductions -// ValueType[]: For array reductions -template -struct HostIterateTile && - std::is_array_v>> { - using index_type = typename RP::index_type; - using point_type = typename RP::point_type; - - using value_type = - std::remove_extent_t; // strip away the - // 'array-ness' [], only - // underlying type remains - - inline HostIterateTile(RP const& rp, Functor const& func) - : m_rp(rp) // Cuda 7.0 does not like braces... 
- , - m_func(func) {} - - inline bool check_iteration_bounds(point_type& partial_tile, - point_type& offset) const { - bool is_full_tile = true; - - for (int i = 0; i < RP::rank; ++i) { - if ((offset[i] + m_rp.m_tile[i]) <= m_rp.m_upper[i]) { - partial_tile[i] = m_rp.m_tile[i]; - } else { - is_full_tile = false; - partial_tile[i] = - (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 - : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range - } - } - - return is_full_tile; - } // end check bounds - - template - struct RankTag { - using type = RankTag; - enum { value = (int)Rank }; - }; - -#if KOKKOS_ENABLE_NEW_LOOP_MACROS - template - inline void operator()(IType tile_idx, value_type* val) const { + inline void operator()(IType tile_idx, value_type* val) const { point_type m_offset; point_type m_tiledims; @@ -2492,336 +1765,6 @@ struct HostIterateTile - inline void operator()(IType tile_idx) const { - operator_impl(tile_idx, RankTag()); - } - // added due to compiler error when using sfinae to choose operator based on - // rank - - template - inline void operator_impl(IType tile_idx, const RankTag<2>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } else { - 
// #pragma simd - KOKKOS_IMPL_LOOP_2L(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_2R(index_type, m_tiledims) { apply(LOOP_ARGS_2); } - } - } // end Iterate::Right - - } // end op() rank == 2 - - template - inline void operator_impl(IType tile_idx, const RankTag<3>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_3L(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_3R(index_type, m_tiledims) { apply(LOOP_ARGS_3); } - } - } // end Iterate::Right - - } // end op() rank == 3 - - template - inline void operator_impl(IType tile_idx, const RankTag<4>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for 
(int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_4L(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_4R(index_type, m_tiledims) { apply(LOOP_ARGS_4); } - } - } // end Iterate::Right - - } // end op() rank == 4 - - template - inline void operator_impl(IType tile_idx, const RankTag<5>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_5L(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } else 
{ - // #pragma simd - KOKKOS_IMPL_LOOP_5R(index_type, m_tiledims) { apply(LOOP_ARGS_5); } - } - } // end Iterate::Right - - } // end op() rank == 5 - - template - inline void operator_impl(IType tile_idx, const RankTag<6>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_6L(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_6R(index_type, m_tiledims) { apply(LOOP_ARGS_6); } - } - } // end Iterate::Right - - } // end op() rank == 6 - - template - inline void operator_impl(IType tile_idx, const RankTag<7>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with 
the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_7L(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_7R(index_type, m_tiledims) { apply(LOOP_ARGS_7); } - } - } // end Iterate::Right - - } // end op() rank == 7 - - template - inline void operator_impl(IType tile_idx, const RankTag<8>) const { - point_type m_offset; - point_type m_tiledims; - - if (RP::outer_direction == Iterate::Left) { - for (int i = 0; i < RP::rank; ++i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } else { - for (int i = RP::rank - 1; i >= 0; --i) { - m_offset[i] = - (tile_idx % m_rp.m_tile_end[i]) * m_rp.m_tile[i] + m_rp.m_lower[i]; - tile_idx /= m_rp.m_tile_end[i]; - } - } - - // Check if offset+tiledim in bounds - if not, replace tile dims with the - // partial tile dims - const bool full_tile = check_iteration_bounds(m_tiledims, m_offset); - - if (RP::inner_direction == Iterate::Left) { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_8L(index_type, m_tiledims) { apply(LOOP_ARGS_8); } - } - } // end Iterate::Left - else { - if (full_tile) { - // #pragma simd - KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } - } else { - // #pragma simd - KOKKOS_IMPL_LOOP_8R(index_type, m_tiledims) { apply(LOOP_ARGS_8); } - } - } // end Iterate::Right - - } // end op() rank == 8 - template - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), - void> 
- apply(Args&&... args) const { - m_func(args..., m_v); - } - - template - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), - void> - apply(Args&&... args) const { - m_func(m_tag, args..., m_v); - } -#endif - RP const m_rp; Functor const m_func; std::conditional_t, int, Tag> m_tag{}; @@ -2829,31 +1772,6 @@ struct HostIterateTile -struct DeviceIterateTile; - -// Rank 2 -template -struct DeviceIterateTile<2, PolicyType, Functor, MaxGridSize, Tag> { - using index_type = typename PolicyType::index_type; +// ------------------------------------------------------------------------- // +// Compute GPU launch parameters (grid/block dimensions) for MDRangePolicy +// +// Ranks 2-3: Direct mapping - each policy dimension maps to one GPU dimension. +// Ranks 4-6: Dimension packing - pairs of policy dimensions are packed +// into single GPU dimensions to fit the 3D hardware limit. +// +// Returns: CUDA/HIP: std::pair +// SYCL: sycl::nd_range<3>{global, local} +// +template +auto compute_device_launch_params( + const Kokkos::MDRangePolicy& policy, + const MaxGridSize& max_grid_size) { + using Policy = Kokkos::MDRangePolicy; + using array_index_type = typename Policy::array_index_type; #ifdef KOKKOS_ENABLE_SYCL - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_, - const EmulateCUDADim3 gridDim_, - const EmulateCUDADim3 blockIdx_, - const EmulateCUDADim3 threadIdx_) - : m_policy(policy_), - m_func(f_), - m_max_grid_size(max_grid_size_), - gridDim(gridDim_), - blockIdx(blockIdx_), - threadIdx(threadIdx_) {} + EmulateCUDADim3 block{1, 1, 1}; #else - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_) - : m_policy(policy_), m_func(f_), m_max_grid_size(max_grid_size_) {} + dim3 block{1, 1, 1}; #endif - KOKKOS_IMPL_DEVICE_FUNCTION - void exec_range() const { - // LL - if (PolicyType::inner_direction == 
Iterate::Left) { - // iterate over y blocks - for (index_type tile_id1 = static_cast(blockIdx.y); - tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) { - // compute index for dimension 1 - const index_type offset_1 = - tile_id1 * m_policy.m_tile[1] + - static_cast(threadIdx.y) + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - static_cast(threadIdx.y) < m_policy.m_tile[1]) { - // iterate over x blocks - for (index_type tile_id0 = static_cast(blockIdx.x); - tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) { - // compute index for dimension 0 - const index_type offset_0 = - tile_id0 * m_policy.m_tile[0] + - static_cast(threadIdx.x) + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && - static_cast(threadIdx.x) < m_policy.m_tile[0]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1); - } - } - } - } + array_index_type grid_0 = 1; + array_index_type grid_1 = 1; + array_index_type grid_2 = 1; + + if constexpr (Policy::inner_direction == Iterate::Left) { + if constexpr (Policy::rank == 2) { + block.x = policy.m_tile[0]; + block.y = policy.m_tile[1]; + grid_0 = policy.m_tile_end[0]; + grid_1 = policy.m_tile_end[1]; + } else if constexpr (Policy::rank == 3) { + block.x = policy.m_tile[0]; + block.y = policy.m_tile[1]; + block.z = policy.m_tile[2]; + grid_0 = policy.m_tile_end[0]; + grid_1 = policy.m_tile_end[1]; + grid_2 = policy.m_tile_end[2]; + } else if constexpr (Policy::rank == 4) { + block.x = policy.m_tile[0] * policy.m_tile[1]; + block.y = policy.m_tile[2]; + block.z = policy.m_tile[3]; + grid_0 = policy.m_tile_end[0] * policy.m_tile_end[1]; + grid_1 = policy.m_tile_end[2]; + grid_2 = policy.m_tile_end[3]; + } else if constexpr (Policy::rank == 5) { + block.x = policy.m_tile[0] * policy.m_tile[1]; + block.y = policy.m_tile[2] * policy.m_tile[3]; + block.z = 
policy.m_tile[4]; + grid_0 = policy.m_tile_end[0] * policy.m_tile_end[1]; + grid_1 = policy.m_tile_end[2] * policy.m_tile_end[3]; + grid_2 = policy.m_tile_end[4]; + } else if constexpr (Policy::rank == 6) { + block.x = policy.m_tile[0] * policy.m_tile[1]; + block.y = policy.m_tile[2] * policy.m_tile[3]; + block.z = policy.m_tile[4] * policy.m_tile[5]; + grid_0 = policy.m_tile_end[0] * policy.m_tile_end[1]; + grid_1 = policy.m_tile_end[2] * policy.m_tile_end[3]; + grid_2 = policy.m_tile_end[4] * policy.m_tile_end[5]; } - // LR - else { - // iterate over x blocks - for (index_type tile_id0 = static_cast(blockIdx.x); - tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) { - // compute index for dimension 0 - const index_type offset_0 = - tile_id0 * m_policy.m_tile[0] + - static_cast(threadIdx.x) + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && - static_cast(threadIdx.x) < m_policy.m_tile[0]) { - // iterate over y blocks - for (index_type tile_id1 = static_cast(blockIdx.y); - tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) { - // compute index for dimension 1 - const index_type offset_1 = - tile_id1 * m_policy.m_tile[1] + - static_cast(threadIdx.y) + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - static_cast(threadIdx.y) < m_policy.m_tile[1]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1); - } - } - } - } + } else { // InnerDirection == Right + if constexpr (Policy::rank == 2) { + block.x = policy.m_tile[1]; + block.y = policy.m_tile[0]; + grid_0 = policy.m_tile_end[1]; + grid_1 = policy.m_tile_end[0]; + } else if constexpr (Policy::rank == 3) { + block.x = policy.m_tile[2]; + block.y = policy.m_tile[1]; + block.z = policy.m_tile[0]; + grid_0 = policy.m_tile_end[2]; + grid_1 = policy.m_tile_end[1]; + grid_2 = policy.m_tile_end[0]; + } else if constexpr 
(Policy::rank == 4) { + block.x = policy.m_tile[3] * policy.m_tile[2]; + block.y = policy.m_tile[1]; + block.z = policy.m_tile[0]; + grid_0 = policy.m_tile_end[3] * policy.m_tile_end[2]; + grid_1 = policy.m_tile_end[1]; + grid_2 = policy.m_tile_end[0]; + } else if constexpr (Policy::rank == 5) { + block.x = policy.m_tile[4] * policy.m_tile[3]; + block.y = policy.m_tile[2] * policy.m_tile[1]; + block.z = policy.m_tile[0]; + grid_0 = policy.m_tile_end[4] * policy.m_tile_end[3]; + grid_1 = policy.m_tile_end[2] * policy.m_tile_end[1]; + grid_2 = policy.m_tile_end[0]; + } else if constexpr (Policy::rank == 6) { + block.x = policy.m_tile[5] * policy.m_tile[4]; + block.y = policy.m_tile[3] * policy.m_tile[2]; + block.z = policy.m_tile[1] * policy.m_tile[0]; + grid_0 = policy.m_tile_end[5] * policy.m_tile_end[4]; + grid_1 = policy.m_tile_end[3] * policy.m_tile_end[2]; + grid_2 = policy.m_tile_end[1] * policy.m_tile_end[0]; } - } // end exec_range + } - private: - const PolicyType& m_policy; - const Functor& m_func; - const MaxGridSize& m_max_grid_size; #ifdef KOKKOS_ENABLE_SYCL - const EmulateCUDADim3 gridDim; - const EmulateCUDADim3 blockIdx; - const EmulateCUDADim3 threadIdx; -#endif -}; - -// Rank 3 -template -struct DeviceIterateTile<3, PolicyType, Functor, MaxGridSize, Tag> { - using index_type = typename PolicyType::index_type; - -#ifdef KOKKOS_ENABLE_SYCL - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_, - const EmulateCUDADim3 gridDim_, - const EmulateCUDADim3 blockIdx_, - const EmulateCUDADim3 threadIdx_) - : m_policy(policy_), - m_func(f_), - m_max_grid_size(max_grid_size_), - gridDim(gridDim_), - blockIdx(blockIdx_), - threadIdx(threadIdx_) {} + // SYCL uses nd_range with global = grid * local sizes + sycl::range<3> local_sizes(block.x, block.y, block.z); + sycl::range<3> global_sizes( + std::min(grid_0, max_grid_size[0]) * local_sizes[0], + std::min(grid_1, max_grid_size[1]) * 
local_sizes[1], + std::min(grid_2, max_grid_size[2]) * local_sizes[2]); + return sycl::nd_range<3>(global_sizes, local_sizes); #else - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_) - : m_policy(policy_), m_func(f_), m_max_grid_size(max_grid_size_) {} + dim3 grid(std::min(grid_0, max_grid_size[0]), + std::min(grid_1, max_grid_size[1]), + std::min(grid_2, max_grid_size[2])); + return std::pair(grid, block); #endif +} - KOKKOS_IMPL_DEVICE_FUNCTION - void exec_range() const { - // LL - if (PolicyType::inner_direction == Iterate::Left) { - // iterate over z blocks - for (index_type tile_id2 = static_cast(blockIdx.z); - tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.z) { - // compute index for dimension 2 - const index_type offset_2 = - tile_id2 * m_policy.m_tile[2] + - static_cast(threadIdx.z) + - static_cast(m_policy.m_lower[2]); - // check index for dimension 2 is within range - if (offset_2 < m_policy.m_upper[2] && - static_cast(threadIdx.z) < m_policy.m_tile[2]) { - // iterate over y blocks - for (index_type tile_id1 = static_cast(blockIdx.y); - tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) { - // compute index for dimension 1 - const index_type offset_1 = - tile_id1 * m_policy.m_tile[1] + - static_cast(threadIdx.y) + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - static_cast(threadIdx.y) < m_policy.m_tile[1]) { - // iterate over x blocks - for (index_type tile_id0 = static_cast(blockIdx.x); - tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) { - // compute index for dimension 0 - const index_type offset_0 = - tile_id0 * m_policy.m_tile[0] + - static_cast(threadIdx.x) + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && - static_cast(threadIdx.x) < m_policy.m_tile[0]) { - // call kernel with computed 
indices - Impl::_tag_invoke(m_func, offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - // LR - else { - // iterate over x blocks - for (index_type tile_id0 = static_cast(blockIdx.x); - tile_id0 < m_policy.m_tile_end[0]; tile_id0 += gridDim.x) { - // compute index for dimension 0 - const index_type offset_0 = - tile_id0 * m_policy.m_tile[0] + - static_cast(threadIdx.x) + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && - static_cast(threadIdx.x) < m_policy.m_tile[0]) { - // iterate over y blocks - for (index_type tile_id1 = static_cast(blockIdx.y); - tile_id1 < m_policy.m_tile_end[1]; tile_id1 += gridDim.y) { - // compute index for dimension 1 - const index_type offset_1 = - tile_id1 * m_policy.m_tile[1] + - static_cast(threadIdx.y) + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - static_cast(threadIdx.y) < m_policy.m_tile[1]) { - // iterate over z blocks - for (index_type tile_id2 = static_cast(blockIdx.z); - tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.z) { - // compute index for dimension 2 - const index_type offset_2 = - tile_id2 * m_policy.m_tile[2] + - static_cast(threadIdx.z) + - static_cast(m_policy.m_lower[2]); - // check index for dimension 2 is within range - if (offset_2 < m_policy.m_upper[2] && - static_cast(threadIdx.z) < m_policy.m_tile[2]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1, offset_2); - } - } - } - } - } - } - } - } // end exec_range +// ------------------------------------------------------------------------- // +// ParallelFor iteration pattern - maps GPU threads to N-D iteration space +// +// For ranks 2-3: Direct mapping of hardware threads to iteration space +// dimensions. +// For ranks 4-6: Multiple logical indices are packed into single +// hardware dimensions. +// +// 1. 
Start iterating at the hardware thread identifier. +// 2. Extend the iteration space range with stride loops using grid dimensions. +// 3. Bounds check against m_upper to filter out-of-bounds iterations. +// +template +struct DeviceIterate; + +template +struct DeviceIterate { + using array_type = Kokkos::Array; private: - const PolicyType& m_policy; - const Functor& m_func; - const MaxGridSize& m_max_grid_size; + const array_type m_lower; + const array_type m_upper; + const array_type m_extent; // tile_size * num_tiles + const Functor& m_functor; + #ifdef KOKKOS_ENABLE_SYCL const EmulateCUDADim3 gridDim; + const EmulateCUDADim3 blockDim; const EmulateCUDADim3 blockIdx; const EmulateCUDADim3 threadIdx; #endif -}; - -// Rank 4 -template -struct DeviceIterateTile<4, PolicyType, Functor, MaxGridSize, Tag> { - using index_type = typename PolicyType::index_type; + public: #ifdef KOKKOS_ENABLE_SYCL - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_, + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterate( + const array_type& lower, const array_type& upper, + const array_type& extent, const Functor& functor, const EmulateCUDADim3 gridDim_, + const EmulateCUDADim3 blockDim_, const EmulateCUDADim3 blockIdx_, const EmulateCUDADim3 threadIdx_) - : m_policy(policy_), - m_func(f_), - m_max_grid_size(max_grid_size_), + : m_lower(lower), + m_upper(upper), + m_extent(extent), + m_functor(functor), gridDim(gridDim_), + blockDim(blockDim_), blockIdx(blockIdx_), threadIdx(threadIdx_) {} #else - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_) - : m_policy(policy_), m_func(f_), m_max_grid_size(max_grid_size_) {} + + KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterate(const array_type& lower, + const array_type& upper, + const array_type& extent, + const Functor& functor) + : m_lower(lower), m_upper(upper), m_extent(extent), m_functor(functor) {} 
#endif KOKKOS_IMPL_DEVICE_FUNCTION - void exec_range() const { - // LL - if (PolicyType::inner_direction == Iterate::Left) { - // number of tiles for dimension 0 - const index_type temp0 = m_policy.m_tile_end[0]; - // number of tiles for dimension 1 - const index_type temp1 = m_policy.m_tile_end[1]; - - // number of virtual blocks for dimension 0 - const index_type numbl0 = - Kokkos::min(temp0, static_cast(m_max_grid_size[0])); - // number of virtual blocks for dimension 1 - const index_type numbl1 = - (temp0 * temp1 > static_cast(m_max_grid_size[0]) - ? static_cast(m_max_grid_size[0]) / numbl0 - : Kokkos::min(temp1, - static_cast(m_max_grid_size[0]))); - - // first virtual block index for dimension 0 - const index_type tile_id0 = static_cast(blockIdx.x) % numbl0; - // first virtual block index for dimension 1 - const index_type tile_id1 = static_cast(blockIdx.x) / numbl0; + void exec_range() const { iterate(std::integral_constant()); } - // virtual thread index for dimension 0 - const index_type thr_id0 = - static_cast(threadIdx.x) % m_policy.m_tile[0]; - // virtual thread index for dimension 1 - const index_type thr_id1 = - static_cast(threadIdx.x) / m_policy.m_tile[0]; - - // iterate over z blocks - for (index_type tile_id3 = static_cast(blockIdx.z); - tile_id3 < m_policy.m_tile_end[3]; tile_id3 += gridDim.z) { - // compute index for dimension 3 - const index_type offset_3 = - tile_id3 * m_policy.m_tile[3] + - static_cast(threadIdx.z) + - static_cast(m_policy.m_lower[3]); - // check index for dimension 3 is within range - if (offset_3 < m_policy.m_upper[3] && - static_cast(threadIdx.z) < m_policy.m_tile[3]) { - // iterate over y blocks - for (index_type tile_id2 = static_cast(blockIdx.y); - tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.y) { - // compute index for dimension 2 - const index_type offset_2 = - tile_id2 * m_policy.m_tile[2] + - static_cast(threadIdx.y) + - static_cast(m_policy.m_lower[2]); - // check index for dimension 2 is within range - 
if (offset_2 < m_policy.m_upper[2] && - static_cast(threadIdx.y) < m_policy.m_tile[2]) { - // iterate over virtual blocks for dimension 1 - for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; - j += numbl1) { - // compute index for dimension 1 - const index_type offset_1 = - j * m_policy.m_tile[1] + thr_id1 + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - thr_id1 < m_policy.m_tile[1]) { - // iterate over virtual blocks for dimension 0 - for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; - i += numbl0) { - // compute index for dimension 0 - const index_type offset_0 = - i * m_policy.m_tile[0] + thr_id0 + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && - thr_id0 < m_policy.m_tile[0]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1, - offset_2, offset_3); - } - } - } - } - } - } - } + private: + // Runtime expression to determine if Dim is part of a packed pair + // Packing occurs on consecutive dimension pairs for rank > 3 + template + KOKKOS_IMPL_DEVICE_FUNCTION static consteval bool is_packed_index() { + return ((Dim == 0 || Dim == 1) && Rank > 3) || + ((Dim == 2 || Dim == 3) && Rank > 4) || + ((Dim == 4 || Dim == 5) && Rank > 5); + } + + // Packed: returns flat hardware thread ID (unpacking happens in iterate()) + // Unpacked: returns global index (lower + blockIdx * blockDim + threadIdx) + template + KOKKOS_IMPL_DEVICE_FUNCTION KOKKOS_IMPL_FORCEINLINE constexpr index_type + my_begin() const noexcept { + static_assert(R < 6); + if constexpr (is_packed_index()) { + if constexpr (R == 0 || R == 1) { + return blockIdx.x * blockDim.x + threadIdx.x; + } else if constexpr (R == 2 || R == 3) { + return blockIdx.y * blockDim.y + threadIdx.y; + } else if constexpr (R == 4 || R == 5) { + return blockIdx.z * blockDim.z + threadIdx.z; } - } - // LR - else { - // number 
of tiles for dimension 0 - const index_type temp0 = m_policy.m_tile_end[0]; - // number of tiles for dimension 1 - const index_type temp1 = m_policy.m_tile_end[1]; - - // number of virtual blocks for dimension 1 - const index_type numbl1 = - Kokkos::min(temp1, static_cast(m_max_grid_size[0])); - // number of virtual blocks for dimension 0 - const index_type numbl0 = - (temp0 * temp1 > static_cast(m_max_grid_size[0]) - ? static_cast(m_max_grid_size[0]) / numbl1 - : Kokkos::min(temp0, - static_cast(m_max_grid_size[0]))); - - // first virtual block index for dimension 0 - const index_type tile_id0 = static_cast(blockIdx.x) / numbl1; - // first virtual block index for dimension 1 - const index_type tile_id1 = static_cast(blockIdx.x) % numbl1; - - // virtual thread index for dimension 0 - const index_type thr_id0 = - static_cast(threadIdx.x) / m_policy.m_tile[1]; - // virtual thread index for dimension 1 - const index_type thr_id1 = - static_cast(threadIdx.x) % m_policy.m_tile[1]; - - // iterate over virtual blocks for dimension 0 - for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) { - // compute index for dimension 0 - const index_type offset_0 = - i * m_policy.m_tile[0] + thr_id0 + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) { - // iterate over virtual blocks for dimension 1 - for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; - j += numbl1) { - // compute index for dimension 1 - const index_type offset_1 = - j * m_policy.m_tile[1] + thr_id1 + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - thr_id1 < m_policy.m_tile[1]) { - // iterate over y blocks - for (index_type tile_id2 = static_cast(blockIdx.y); - tile_id2 < m_policy.m_tile_end[2]; tile_id2 += gridDim.y) { - // compute index for dimension 2 - const index_type offset_2 = - tile_id2 * 
m_policy.m_tile[2] + - static_cast(threadIdx.y) + - static_cast(m_policy.m_lower[2]); - // check index for dimension 2 is within range - if (offset_2 < m_policy.m_upper[2] && - static_cast(threadIdx.y) < m_policy.m_tile[2]) { - // iterate over z blocks - for (index_type tile_id3 = - static_cast(blockIdx.z); - tile_id3 < m_policy.m_tile_end[3]; - tile_id3 += gridDim.z) { - // compute index for dimension 3 - const index_type offset_3 = - tile_id3 * m_policy.m_tile[3] + - static_cast(threadIdx.z) + - static_cast(m_policy.m_lower[3]); - // check index for dimension 3 is within range - if (offset_3 < m_policy.m_upper[3] && - static_cast(threadIdx.z) < - m_policy.m_tile[3]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1, - offset_2, offset_3); - } - } - } - } - } - } + } else { + // No packed index + if constexpr (Rank < 4) { + if constexpr (R == 0) { + return m_lower[R] + blockIdx.x * blockDim.x + threadIdx.x; + } else if constexpr (R == 1) { + return m_lower[R] + blockIdx.y * blockDim.y + threadIdx.y; + } else if constexpr (R == 2) { + return m_lower[R] + blockIdx.z * blockDim.z + threadIdx.z; } - } - } - } // end exec_range - - private: - const PolicyType& m_policy; - const Functor& m_func; - const MaxGridSize& m_max_grid_size; -#ifdef KOKKOS_ENABLE_SYCL - const EmulateCUDADim3 gridDim; - const EmulateCUDADim3 blockIdx; - const EmulateCUDADim3 threadIdx; -#endif -}; - -// Rank 5 -template -struct DeviceIterateTile<5, PolicyType, Functor, MaxGridSize, Tag> { - using index_type = typename PolicyType::index_type; - -#ifdef KOKKOS_ENABLE_SYCL - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_, - const EmulateCUDADim3 gridDim_, - const EmulateCUDADim3 blockIdx_, - const EmulateCUDADim3 threadIdx_) - : m_policy(policy_), - m_func(f_), - m_max_grid_size(max_grid_size_), - gridDim(gridDim_), - blockIdx(blockIdx_), - threadIdx(threadIdx_) {} -#else - 
KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_) - : m_policy(policy_), m_func(f_), m_max_grid_size(max_grid_size_) {} -#endif - - KOKKOS_IMPL_DEVICE_FUNCTION - void exec_range() const { - // LL - if (PolicyType::inner_direction == Iterate::Left) { - // number of tiles for dimension 0 - index_type temp0 = m_policy.m_tile_end[0]; - // number of tiles for dimension 1 - index_type temp1 = m_policy.m_tile_end[1]; - - // number of virtual blocks for dimension 0 - const index_type numbl0 = - Kokkos::min(temp0, static_cast(m_max_grid_size[0])); - // number of virtual blocks for dimension 1 - const index_type numbl1 = - (temp0 * temp1 > static_cast(m_max_grid_size[0]) - ? static_cast(m_max_grid_size[0]) / numbl0 - : Kokkos::min(temp1, - static_cast(m_max_grid_size[0]))); - - // first virtual block index for dimension 0 - const index_type tile_id0 = static_cast(blockIdx.x) % numbl0; - // first virtual block index for dimension 1 - const index_type tile_id1 = static_cast(blockIdx.x) / numbl0; - - // virtual thread index for dimension 0 - const index_type thr_id0 = - static_cast(threadIdx.x) % m_policy.m_tile[0]; - // virtual thread index for dimension 1 - const index_type thr_id1 = - static_cast(threadIdx.x) / m_policy.m_tile[0]; - - // number of tiles for dimension 2 - temp0 = m_policy.m_tile_end[2]; - // number of tiles for dimension 3 - temp1 = m_policy.m_tile_end[3]; - - // number of virtual blocks for dimension 2 - const index_type numbl2 = - Kokkos::min(temp0, static_cast(m_max_grid_size[1])); - // number of virtual blocks for dimension 3 - const index_type numbl3 = - (temp0 * temp1 > static_cast(m_max_grid_size[1]) - ? 
static_cast(m_max_grid_size[1]) / numbl2 - : Kokkos::min(temp1, - static_cast(m_max_grid_size[1]))); - - // first virtual block index for dimension 2 - const index_type tile_id2 = static_cast(blockIdx.y) % numbl2; - // first virtual block index for dimension 3 - const index_type tile_id3 = static_cast(blockIdx.y) / numbl2; - - // virtual thread index for dimension 2 - const index_type thr_id2 = - static_cast(threadIdx.y) % m_policy.m_tile[2]; - // virtual thread index for dimension 3 - const index_type thr_id3 = - static_cast(threadIdx.y) / m_policy.m_tile[2]; - - // iterate over z blocks - for (index_type tile_id4 = static_cast(blockIdx.z); - tile_id4 < m_policy.m_tile_end[4]; tile_id4 += gridDim.z) { - // compute index for dimension 4 - const index_type offset_4 = - tile_id4 * m_policy.m_tile[4] + - static_cast(threadIdx.z) + - static_cast(m_policy.m_lower[4]); - // check index for dimension 4 is within range - if (offset_4 < m_policy.m_upper[4] && - static_cast(threadIdx.z) < m_policy.m_tile[4]) { - // iterate over virtual blocks for dimension 3 - for (index_type l = tile_id3; l < m_policy.m_tile_end[3]; - l += numbl3) { - // compute index for dimension 3 - const index_type offset_3 = - l * m_policy.m_tile[3] + thr_id3 + - static_cast(m_policy.m_lower[3]); - // check index for dimension 3 is within range - if (offset_3 < m_policy.m_upper[3] && - thr_id3 < m_policy.m_tile[3]) { - // iterate over virtual blocks for dimension 2 - for (index_type k = tile_id2; k < m_policy.m_tile_end[2]; - k += numbl2) { - // compute index for dimension 2 - const index_type offset_2 = - k * m_policy.m_tile[2] + thr_id2 + - static_cast(m_policy.m_lower[2]); - // check index for dimension 2 is within range - if (offset_2 < m_policy.m_upper[2] && - thr_id2 < m_policy.m_tile[2]) { - // iterate over virtual blocks for dimension 1 - for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; - j += numbl1) { - // compute index for dimension 1 - const index_type offset_1 = - j * 
m_policy.m_tile[1] + thr_id1 + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - thr_id1 < m_policy.m_tile[1]) { - // iterate over virtual blocks for dimension 0 - for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; - i += numbl0) { - // compute index for dimension 0 - const index_type offset_0 = - i * m_policy.m_tile[0] + thr_id0 + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && - thr_id0 < m_policy.m_tile[0]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1, - offset_2, offset_3, offset_4); - } - } - } - } - } - } - } - } + } else { + // Mix of packed and unpacked for Rank 4 and 5 + if constexpr (R == 2) { + return m_lower[R] + blockIdx.y * blockDim.y + threadIdx.y; + } else if constexpr (R == 3 || R == 4) { + return m_lower[R] + blockIdx.z * blockDim.z + threadIdx.z; } } } - // LR - else { - // number of tiles for dimension 0 - index_type temp0 = m_policy.m_tile_end[0]; - // number of tiles for dimension 1 - index_type temp1 = m_policy.m_tile_end[1]; - - // number of virtual blocks for dimension 1 - const index_type numbl1 = - Kokkos::min(temp1, static_cast(m_max_grid_size[0])); - // number of virtual blocks for dimension 0 - const index_type numbl0 = - (temp0 * temp1 > static_cast(m_max_grid_size[0]) - ? 
static_cast(m_max_grid_size[0]) / numbl1 - : Kokkos::min(temp0, - static_cast(m_max_grid_size[0]))); - - // first virtual block index for dimension 0 - const index_type tile_id0 = static_cast(blockIdx.x) / numbl1; - // first virtual block index for dimension 1 - const index_type tile_id1 = static_cast(blockIdx.x) % numbl1; - - // virtual thread index for dimension 0 - const index_type thr_id0 = - static_cast(threadIdx.x) / m_policy.m_tile[1]; - // virtual thread index for dimension 1 - const index_type thr_id1 = - static_cast(threadIdx.x) % m_policy.m_tile[1]; - - // number of tiles for dimension 2 - temp0 = m_policy.m_tile_end[2]; - // number of tiles for dimension 3 - temp1 = m_policy.m_tile_end[3]; - - // number of virtual blocks for dimension 3 - const index_type numbl3 = - Kokkos::min(temp1, static_cast(m_max_grid_size[1])); - // number of virtual blocks for dimension 2 - const index_type numbl2 = - (temp0 * temp1 > static_cast(m_max_grid_size[1]) - ? static_cast(m_max_grid_size[1]) / numbl3 - : Kokkos::min(temp0, - static_cast(m_max_grid_size[1]))); - - // first virtual block index for dimension 2 - const index_type tile_id2 = static_cast(blockIdx.y) / numbl3; - // first virtual block index for dimension 3 - const index_type tile_id3 = static_cast(blockIdx.y) % numbl3; - - // virtual thread index for dimension 2 - const index_type thr_id2 = - static_cast(threadIdx.y) / m_policy.m_tile[3]; - // virtual thread index for dimension 3 - const index_type thr_id3 = - static_cast(threadIdx.y) % m_policy.m_tile[3]; - - // iterate over virtual blocks for dimension 0 - for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) { - // compute index for dimension 0 - const index_type offset_0 = - i * m_policy.m_tile[0] + thr_id0 + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) { - // iterate over virtual blocks for dimension 1 - for (index_type j = 
tile_id1; j < m_policy.m_tile_end[1]; - j += numbl1) { - // compute index for dimension 1 - const index_type offset_1 = - j * m_policy.m_tile[1] + thr_id1 + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - thr_id1 < m_policy.m_tile[1]) { - // iterate over virtual blocks for dimension 2 - for (index_type k = tile_id2; k < m_policy.m_tile_end[2]; - k += numbl2) { - // compute index for dimension 2 - const index_type offset_2 = - k * m_policy.m_tile[2] + thr_id2 + - static_cast(m_policy.m_lower[2]); - // check index for dimension 2 is within range - if (offset_2 < m_policy.m_upper[2] && - thr_id2 < m_policy.m_tile[2]) { - // iterate over virtual blocks for dimension 3 - for (index_type l = tile_id3; l < m_policy.m_tile_end[3]; - l += numbl3) { - // compute index for dimension 3 - const index_type offset_3 = - l * m_policy.m_tile[3] + thr_id3 + - static_cast(m_policy.m_lower[3]); - // check index for dimension 3 is within range - if (offset_3 < m_policy.m_upper[3] && - thr_id3 < m_policy.m_tile[3]) { - // iterate over z blocks - for (index_type tile_id4 = - static_cast(blockIdx.z); - tile_id4 < m_policy.m_tile_end[4]; - tile_id4 += gridDim.z) { - // compute index for dimension 3 - const index_type offset_4 = - tile_id4 * m_policy.m_tile[4] + - static_cast(threadIdx.z) + - static_cast(m_policy.m_lower[4]); - // check index for dimension 3 is within range - if (offset_4 < m_policy.m_upper[4] && - static_cast(threadIdx.z) < - m_policy.m_tile[4]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1, - offset_2, offset_3, offset_4); - } - } - } - } - } - } - } - } - } + return m_lower[R]; + } + + // Packed: end at the product of two consecutive extents + // Unpacked: directly use m_upper + template + KOKKOS_IMPL_DEVICE_FUNCTION KOKKOS_IMPL_FORCEINLINE constexpr index_type + my_end() const noexcept { + static_assert(R < 6); + if constexpr (is_packed_index()) { + 
if constexpr (R % 2 == 0) { + return m_extent[R] * m_extent[R + 1]; + } else { + return m_extent[R] * m_extent[R - 1]; } + } else { + return m_upper[R]; } - } // end exec_range - - private: - const PolicyType& m_policy; - const Functor& m_func; - const MaxGridSize& m_max_grid_size; -#ifdef KOKKOS_ENABLE_SYCL - const EmulateCUDADim3 gridDim; - const EmulateCUDADim3 blockIdx; - const EmulateCUDADim3 threadIdx; -#endif -}; - -// Rank 6 -template -struct DeviceIterateTile<6, PolicyType, Functor, MaxGridSize, Tag> { - using index_type = typename PolicyType::index_type; - -#ifdef KOKKOS_ENABLE_SYCL - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_, - const EmulateCUDADim3 gridDim_, - const EmulateCUDADim3 blockIdx_, - const EmulateCUDADim3 threadIdx_) - : m_policy(policy_), - m_func(f_), - m_max_grid_size(max_grid_size_), - gridDim(gridDim_), - blockIdx(blockIdx_), - threadIdx(threadIdx_) {} -#else - KOKKOS_IMPL_DEVICE_FUNCTION DeviceIterateTile( - const PolicyType& policy_, const Functor& f_, - const MaxGridSize& max_grid_size_) - : m_policy(policy_), m_func(f_), m_max_grid_size(max_grid_size_) {} -#endif - - KOKKOS_IMPL_DEVICE_FUNCTION - void exec_range() const { - // LL - if (PolicyType::inner_direction == Iterate::Left) { - // number of tiles for dimension 0 - index_type temp0 = m_policy.m_tile_end[0]; - // number of tiles for dimension 1 - index_type temp1 = m_policy.m_tile_end[1]; - - // number of virtual blocks for dimension 0 - const index_type numbl0 = - Kokkos::min(temp0, static_cast(m_max_grid_size[0])); - // number of virtual blocks for dimension 1 - const index_type numbl1 = - (temp0 * temp1 > static_cast(m_max_grid_size[0]) - ? 
static_cast(m_max_grid_size[0]) / numbl0 - : Kokkos::min(temp1, - static_cast(m_max_grid_size[0]))); - - // first virtual block index for dimension 0 - const index_type tile_id0 = static_cast(blockIdx.x) % numbl0; - // first virtual block index for dimension 1 - const index_type tile_id1 = static_cast(blockIdx.x) / numbl0; - - // virtual thread index for dimension 0 - const index_type thr_id0 = - static_cast(threadIdx.x) % m_policy.m_tile[0]; - // virtual thread index for dimension 1 - const index_type thr_id1 = - static_cast(threadIdx.x) / m_policy.m_tile[0]; - - // number of tiles for dimension 2 - temp0 = m_policy.m_tile_end[2]; - // number of tiles for dimension 3 - temp1 = m_policy.m_tile_end[3]; - - // number of virtual blocks for dimension 2 - const index_type numbl2 = - Kokkos::min(temp0, static_cast(m_max_grid_size[1])); - // number of virtual blocks for dimension 3 - const index_type numbl3 = - (temp0 * temp1 > static_cast(m_max_grid_size[1]) - ? static_cast(m_max_grid_size[1]) / numbl2 - : Kokkos::min(temp1, - static_cast(m_max_grid_size[1]))); - - // first virtual block index for dimension 2 - const index_type tile_id2 = static_cast(blockIdx.y) % numbl2; - // first virtual block index for dimension 3 - const index_type tile_id3 = static_cast(blockIdx.y) / numbl2; - - // virtual thread index for dimension 2 - const index_type thr_id2 = - static_cast(threadIdx.y) % m_policy.m_tile[2]; - // virtual thread index for dimension 3 - const index_type thr_id3 = - static_cast(threadIdx.y) / m_policy.m_tile[2]; - - // number of tiles for dimension 4 - temp0 = m_policy.m_tile_end[4]; - // number of tiles for dimension 5 - temp1 = m_policy.m_tile_end[5]; - - // number of virtual blocks for dimension 4 - const index_type numbl4 = - Kokkos::min(temp0, static_cast(m_max_grid_size[2])); - // number of virtual blocks for dimension 5 - const index_type numbl5 = - (temp0 * temp1 > static_cast(m_max_grid_size[2]) - ? 
static_cast(m_max_grid_size[2]) / numbl4 - : Kokkos::min(temp1, - static_cast(m_max_grid_size[2]))); - - // first virtual block index for dimension 4 - const index_type tile_id4 = static_cast(blockIdx.z) % numbl4; - // first virtual block index for dimension 5 - const index_type tile_id5 = static_cast(blockIdx.z) / numbl4; - - // virtual thread index for dimension 4 - const index_type thr_id4 = - static_cast(threadIdx.z) % m_policy.m_tile[4]; - // virtual thread index for dimension 5 - const index_type thr_id5 = - static_cast(threadIdx.z) / m_policy.m_tile[4]; - - // iterate over virtual blocks for dimension 5 - for (index_type n = tile_id5; n < m_policy.m_tile_end[5]; n += numbl5) { - // compute index for dimension 5 - const index_type offset_5 = - n * m_policy.m_tile[5] + thr_id5 + - static_cast(m_policy.m_lower[5]); - // check index for dimension 5 is within range - if (offset_5 < m_policy.m_upper[5] && thr_id5 < m_policy.m_tile[5]) { - // iterate over virtual blocks for dimension 4 - for (index_type m = tile_id4; m < m_policy.m_tile_end[4]; - m += numbl4) { - // compute index for dimension 4 - const index_type offset_4 = - m * m_policy.m_tile[4] + thr_id4 + - static_cast(m_policy.m_lower[4]); - // check index for dimension 4 is within range - if (offset_4 < m_policy.m_upper[4] && - thr_id4 < m_policy.m_tile[4]) { - // iterate over virtual blocks for dimension 3 - for (index_type l = tile_id3; l < m_policy.m_tile_end[3]; - l += numbl3) { - // compute index for dimension 3 - const index_type offset_3 = - l * m_policy.m_tile[3] + thr_id3 + - static_cast(m_policy.m_lower[3]); - // check index for dimension 3 is within range - if (offset_3 < m_policy.m_upper[3] && - thr_id3 < m_policy.m_tile[3]) { - // iterate over virtual blocks for dimension 2 - for (index_type k = tile_id2; k < m_policy.m_tile_end[2]; - k += numbl2) { - // compute index for dimension 2 - const index_type offset_2 = - k * m_policy.m_tile[2] + thr_id2 + - static_cast(m_policy.m_lower[2]); - // 
check index for dimension 2 is within range - if (offset_2 < m_policy.m_upper[2] && - thr_id2 < m_policy.m_tile[2]) { - // iterate over virtual blocks for dimension 1 - for (index_type j = tile_id1; j < m_policy.m_tile_end[1]; - j += numbl1) { - // compute index for dimension 1 - const index_type offset_1 = - j * m_policy.m_tile[1] + thr_id1 + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - thr_id1 < m_policy.m_tile[1]) { - // iterate over virtual blocks for dimension 0 - for (index_type i = tile_id0; - i < m_policy.m_tile_end[0]; i += numbl0) { - // compute index for dimension 0 - const index_type offset_0 = - i * m_policy.m_tile[0] + thr_id0 + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && - thr_id0 < m_policy.m_tile[0]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1, - offset_2, offset_3, - offset_4, offset_5); - } - } - } - } - } - } - } - } - } - } + } + + // Stride by the total number of threads in the GPU dimension + template + KOKKOS_IMPL_DEVICE_FUNCTION KOKKOS_IMPL_FORCEINLINE constexpr index_type + my_stride() const noexcept { + static_assert(R < 6); + if constexpr (is_packed_index()) { + if constexpr (R == 0 || R == 1) { + return static_cast(blockDim.x) * + static_cast(gridDim.x); + } else if constexpr (R == 2 || R == 3) { + return static_cast(blockDim.y) * + static_cast(gridDim.y); + } else if constexpr (R == 4 || R == 5) { + return static_cast(blockDim.z) * + static_cast(gridDim.z); + } + } else { + // No packed index for all ranks + if constexpr (Rank < 4) { + if constexpr (R == 0) { + return static_cast(blockDim.x) * + static_cast(gridDim.x); + } else if constexpr (R == 1) { + return static_cast(blockDim.y) * + static_cast(gridDim.y); + } else if constexpr (R == 2) { + return static_cast(blockDim.z) * + static_cast(gridDim.z); + } + } else { + // Mix of 
packed and unpacked for Rank 4 and 5 + if constexpr (R == 2) { + return static_cast(blockDim.y) * + static_cast(gridDim.y); + } else if constexpr (R == 3 || R == 4) { + return static_cast(blockDim.z) * + static_cast(gridDim.z); } } } - // LR - else { - // number of tiles for dimension 0 - index_type temp0 = m_policy.m_tile_end[0]; - // number of tiles for dimension 1 - index_type temp1 = m_policy.m_tile_end[1]; - - // number of virtual blocks for dimension 1 - const index_type numbl1 = - Kokkos::min(temp1, static_cast(m_max_grid_size[0])); - // number of virtual blocks for dimension 0 - const index_type numbl0 = - (temp0 * temp1 > static_cast(m_max_grid_size[0]) - ? static_cast(m_max_grid_size[0]) / numbl1 - : Kokkos::min(temp0, - static_cast(m_max_grid_size[0]))); - - // first virtual block index for dimension 0 - const index_type tile_id0 = static_cast(blockIdx.x) / numbl1; - // first virtual block index for dimension 1 - const index_type tile_id1 = static_cast(blockIdx.x) % numbl1; - - // virtual thread index for dimension 0 - const index_type thr_id0 = - static_cast(threadIdx.x) / m_policy.m_tile[1]; - // virtual thread index for dimension 1 - const index_type thr_id1 = - static_cast(threadIdx.x) % m_policy.m_tile[1]; - - // number of tiles for dimension 2 - temp0 = m_policy.m_tile_end[2]; - // number of tiles for dimension 3 - temp1 = m_policy.m_tile_end[3]; - - // number of virtual blocks for dimension 3 - const index_type numbl3 = - Kokkos::min(temp1, static_cast(m_max_grid_size[1])); - // number of virtual blocks for dimension 2 - const index_type numbl2 = - (temp0 * temp1 > static_cast(m_max_grid_size[1]) - ? 
static_cast(m_max_grid_size[1]) / numbl3 - : Kokkos::min(temp0, - static_cast(m_max_grid_size[1]))); - - // first virtual block index for dimension 2 - const index_type tile_id2 = static_cast(blockIdx.y) / numbl3; - // first virtual block index for dimension 3 - const index_type tile_id3 = static_cast(blockIdx.y) % numbl3; - - // virtual thread index for dimension 2 - const index_type thr_id2 = - static_cast(threadIdx.y) / m_policy.m_tile[3]; - // virtual thread index for dimension 3 - const index_type thr_id3 = - static_cast(threadIdx.y) % m_policy.m_tile[3]; - - // number of tiles for dimension 4 - temp0 = m_policy.m_tile_end[4]; - // number of tiles for dimension 5 - temp1 = m_policy.m_tile_end[5]; - - // number of virtual blocks for dimension 5 - const index_type numbl5 = - Kokkos::min(temp1, static_cast(m_max_grid_size[2])); - // number of virtual blocks for dimension 3 - const index_type numbl4 = - (temp0 * temp1 > static_cast(m_max_grid_size[2]) - ? static_cast(m_max_grid_size[2]) / numbl5 - : Kokkos::min(temp0, - static_cast(m_max_grid_size[2]))); - - // first virtual block index for dimension 4 - const index_type tile_id4 = static_cast(blockIdx.z) / numbl5; - // first virtual block index for dimension 5 - const index_type tile_id5 = static_cast(blockIdx.z) % numbl5; - - // virtual thread index for dimension 4 - const index_type thr_id4 = - static_cast(threadIdx.z) / m_policy.m_tile[5]; - // virtual thread index for dimension 5 - const index_type thr_id5 = - static_cast(threadIdx.z) % m_policy.m_tile[5]; - - // iterate over virtual blocks for dimension 0 - for (index_type i = tile_id0; i < m_policy.m_tile_end[0]; i += numbl0) { - // compute index for dimension 0 - const index_type offset_0 = - i * m_policy.m_tile[0] + thr_id0 + - static_cast(m_policy.m_lower[0]); - // check index for dimension 0 is within range - if (offset_0 < m_policy.m_upper[0] && thr_id0 < m_policy.m_tile[0]) { - // iterate over virtual blocks for dimension 1 - for (index_type j = 
tile_id1; j < m_policy.m_tile_end[1]; - j += numbl1) { - // compute index for dimension 1 - const index_type offset_1 = - j * m_policy.m_tile[1] + thr_id1 + - static_cast(m_policy.m_lower[1]); - // check index for dimension 1 is within range - if (offset_1 < m_policy.m_upper[1] && - thr_id1 < m_policy.m_tile[1]) { - // iterate over virtual blocks for dimension 2 - for (index_type k = tile_id2; k < m_policy.m_tile_end[2]; - k += numbl2) { - // compute index for dimension 2 - const index_type offset_2 = - k * m_policy.m_tile[2] + thr_id2 + - static_cast(m_policy.m_lower[2]); - // check index for dimension 2 is within range - if (offset_2 < m_policy.m_upper[2] && - thr_id2 < m_policy.m_tile[2]) { - // iterate over virtual blocks for dimension 3 - for (index_type l = tile_id3; l < m_policy.m_tile_end[3]; - l += numbl3) { - // compute index for dimension 3 - const index_type offset_3 = - l * m_policy.m_tile[3] + thr_id3 + - static_cast(m_policy.m_lower[3]); - // check index for dimension 3 is within range - if (offset_3 < m_policy.m_upper[3] && - thr_id3 < m_policy.m_tile[3]) { - // iterate over virtual blocks for dimension 4 - for (index_type m = tile_id4; m < m_policy.m_tile_end[4]; - m += numbl4) { - // compute index for dimension 4 - const index_type offset_4 = - m * m_policy.m_tile[4] + thr_id4 + - static_cast(m_policy.m_lower[4]); - // check index for dimension 4 is within range - if (offset_4 < m_policy.m_upper[4] && - thr_id4 < m_policy.m_tile[4]) { - // iterate over virtual blocks for dimension 5 - for (index_type n = tile_id5; - n < m_policy.m_tile_end[5]; n += numbl5) { - // compute index for dimension 5 - const index_type offset_5 = - n * m_policy.m_tile[5] + thr_id5 + - static_cast(m_policy.m_lower[5]); - // check index for dimension 5 is within range - if (offset_5 < m_policy.m_upper[5] && - thr_id5 < m_policy.m_tile[5]) { - // call kernel with computed indices - Impl::_tag_invoke(m_func, offset_0, offset_1, - offset_2, offset_3, - offset_4, offset_5); - } 
- } - } - } - } - } - } - } - } + return index_type{1}; + } + + // ----------------------------------------------------------------------- // + // Nested loops with recursive template instantiation + // + // Accumulates indices in parameter pack Idxs... + // The fastest changing index is always i0 (innermost loop). + // + // Functor call order depends on the Layout: + // Layout::Left: + // functor(i0, i1, i2, ..., iR) + // Layout::Right: + // functor(iR, ..., i2, i1, i0) + // + // For Layout::Right, bounds were previously swapped during ParallelFor + // construction, so i0 correctly iterates over the range of iR while + // remaining the "fastest-changing" index. + // + template + KOKKOS_IMPL_DEVICE_FUNCTION inline void iterate( + std::integral_constant, Idxs... idxs) const { + constexpr unsigned rankIdx = R - 1; + const index_type start = my_begin(); + const index_type end = my_end(); + const index_type stride = my_stride(); + + for (index_type idx = start; idx < end; idx += stride) { + if constexpr (is_packed_index()) { + static_assert(R >= 2); + // Unpack two consecutive indices + constexpr unsigned idx1 = (rankIdx % 2 == 0) ? rankIdx : (rankIdx - 1); + constexpr unsigned idx2 = (rankIdx % 2 == 0) ? 
(rankIdx + 1) : rankIdx; + + const index_type id_1 = idx % m_extent[idx1] + m_lower[idx1]; + const index_type id_2 = idx / m_extent[idx1] + m_lower[idx2]; + + if (id_1 < m_upper[idx1] && id_2 < m_upper[idx2]) { + if constexpr (Layout == Iterate::Left) { + iterate(std::integral_constant(), id_1, id_2, + idxs...); + } else { + iterate(std::integral_constant(), idxs..., id_2, + id_1); } } + } else { + if constexpr (Layout == Iterate::Left) { + iterate(std::integral_constant(), idx, idxs...); + } else { + iterate(std::integral_constant(), idxs..., idx); + } } } - } // end exec_range + } - private: - const PolicyType& m_policy; - const Functor& m_func; - const MaxGridSize& m_max_grid_size; -#ifdef KOKKOS_ENABLE_SYCL - const EmulateCUDADim3 gridDim; - const EmulateCUDADim3 blockIdx; - const EmulateCUDADim3 threadIdx; -#endif + template + KOKKOS_IMPL_DEVICE_FUNCTION inline void iterate( + std::integral_constant, Idxs... idxs) const { + Impl::_tag_invoke(m_functor, idxs...); + } }; // ---------------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp b/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp index ab317a6dc36..47ac0c7ae95 100644 --- a/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_CPUDiscovery.cpp @@ -17,6 +17,7 @@ int Kokkos::Impl::mpi_ranks_per_node() { "MPI_LOCALNRANKS", // MPICH // SLURM??? 
"PMI_LOCAL_SIZE", // PMI + "PALS_LOCAL_SIZE", // PALS }) { char const* str = std::getenv(env_var); if (str) { @@ -33,6 +34,7 @@ int Kokkos::Impl::mpi_local_rank_on_node() { "MPI_LOCALRANKID", // MPICH "SLURM_LOCALID", // SLURM "PMI_LOCAL_RANK", // PMI + "PALS_LOCAL_RANKID", // PALS }) { char const* str = std::getenv(env_var); if (str) { diff --git a/lib/kokkos/core/src/impl/Kokkos_CStyleMemoryManagement.hpp b/lib/kokkos/core/src/impl/Kokkos_CStyleMemoryManagement.hpp index 6d00d764ccf..82095f5eac2 100644 --- a/lib/kokkos/core/src/impl/Kokkos_CStyleMemoryManagement.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_CStyleMemoryManagement.hpp @@ -20,10 +20,6 @@ namespace Kokkos::Impl { inline void check_init_final([[maybe_unused]] char const* func_name) { -// FIXME_THREADS: Checking for calls to kokkos_malloc, kokkos_realloc, -// kokkos_free before initialize or after finalize is currently disabled -// for the Threads backend. Refer issue #7944. -#if !defined(KOKKOS_ENABLE_THREADS) if (is_finalized()) { std::stringstream ss; ss << "Kokkos ERROR: attempting to perform C-style memory management " @@ -37,7 +33,6 @@ inline void check_init_final([[maybe_unused]] char const* func_name) { ss << func_name << "() **before** Kokkos::initialize() was called\n"; Kokkos::abort(ss.str().c_str()); } -#endif } } // namespace Kokkos::Impl diff --git a/lib/kokkos/core/src/Kokkos_CheckUsage.hpp b/lib/kokkos/core/src/impl/Kokkos_CheckUsage.hpp similarity index 75% rename from lib/kokkos/core/src/Kokkos_CheckUsage.hpp rename to lib/kokkos/core/src/impl/Kokkos_CheckUsage.hpp index fb3f450d0fa..1de0bb0306a 100644 --- a/lib/kokkos/core/src/Kokkos_CheckUsage.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_CheckUsage.hpp @@ -4,21 +4,19 @@ #ifndef KOKKOS_CHECK_USAGE_HPP #define KOKKOS_CHECK_USAGE_HPP -#include -#include - #include #include +#include #include +#include +#include + // FIXME: Obtain file and line number information via std::source_location // (since C++20) which requires GCC 11 etc. 
namespace Kokkos { -[[nodiscard]] bool is_initialized() noexcept; -[[nodiscard]] bool is_finalized() noexcept; - template class RangePolicy; @@ -100,6 +98,37 @@ struct CheckUsage { } }; +// NOLINTBEGIN(bugprone-exception-escape) +inline void check_execution_space_constructor_precondition( + char const* name) noexcept { + if (Kokkos::is_finalized()) { + std::stringstream err; + err << "Kokkos ERROR: " << name + << " execution space is being constructed" + " after finalize() has been called"; + Kokkos::abort(err.str().c_str()); + } + if (!Kokkos::is_initialized()) { + std::stringstream err; + err << "Kokkos ERROR: " << name + << " execution space is being constructed" + " before initialize() has been called"; + Kokkos::abort(err.str().c_str()); + } +} + +inline void check_execution_space_destructor_precondition( + char const* name) noexcept { + if (Kokkos::is_finalized()) { + std::stringstream err; + err << "Kokkos ERROR: " << name + << " execution space is being destructed" + " after finalize() has been called"; + Kokkos::abort(err.str().c_str()); + } +} +// NOLINTEND(bugprone-exception-escape) + } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp index b662eb27273..a0ed9965094 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -7,9 +7,6 @@ #include #include #include -#ifdef KOKKOS_ENABLE_OPENMPTARGET -#include -#endif // To use OpenCL(TM) built-in intrinsics inside kernels, we have to // forward-declare their prototype, also see @@ -50,10 +47,6 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline uint64_t clock_tic_device() noexcept { return intel_get_cycle_counter(); -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - - return omp_get_wtime() * 1.e9; - #else return 0; diff --git a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp index 4e6dc1b64ad..b2894ac445e 100644 --- 
a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp @@ -30,16 +30,6 @@ struct CombinedReducerValueItemImpl { public: KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl() = default; - KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl( - CombinedReducerValueItemImpl const&) = default; - KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl( - CombinedReducerValueItemImpl&&) = default; - KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl& operator=( - CombinedReducerValueItemImpl const&) = default; - KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueItemImpl& operator=( - CombinedReducerValueItemImpl&&) = default; - KOKKOS_DEFAULTED_FUNCTION - ~CombinedReducerValueItemImpl() = default; explicit KOKKOS_FUNCTION CombinedReducerValueItemImpl(value_type arg_value) : m_value(std::move(arg_value)) {} @@ -51,28 +41,25 @@ struct CombinedReducerValueItemImpl { //============================================================================== +// Dummy struct used to align CombinedReducerValueImpl to at least alignof(int). +// CombinedReducerValueImpl has to be aligned to at least alignof(int) and its +// sizeof must be a multiple of sizeof(int), as we might access it through an +// int* in the CUDA and HIP reduction kernels. +struct alignas(int) AlignmentHelper {}; + template struct CombinedReducerValueImpl; template struct CombinedReducerValueImpl, - ValueTypes...> - : CombinedReducerValueItemImpl... { + ValueTypes...> : +#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) + AlignmentHelper, +#endif + CombinedReducerValueItemImpl... 
{ public: KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerValueImpl() = default; - KOKKOS_DEFAULTED_FUNCTION - constexpr CombinedReducerValueImpl(CombinedReducerValueImpl const&) = default; - KOKKOS_DEFAULTED_FUNCTION - constexpr CombinedReducerValueImpl(CombinedReducerValueImpl&&) = default; - KOKKOS_DEFAULTED_FUNCTION - constexpr CombinedReducerValueImpl& operator=( - CombinedReducerValueImpl const&) = default; - KOKKOS_DEFAULTED_FUNCTION - constexpr CombinedReducerValueImpl& operator=(CombinedReducerValueImpl&&) = - default; - KOKKOS_DEFAULTED_FUNCTION - ~CombinedReducerValueImpl() = default; KOKKOS_FUNCTION explicit CombinedReducerValueImpl(ValueTypes... arg_values) @@ -165,16 +152,6 @@ struct CombinedReducerImpl, Space, public: KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl() = default; - KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( - CombinedReducerImpl const&) = default; - KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( - CombinedReducerImpl&&) = default; - KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( - CombinedReducerImpl const&) = default; - KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( - CombinedReducerImpl&&) = default; - - KOKKOS_DEFAULTED_FUNCTION ~CombinedReducerImpl() = default; template KOKKOS_FUNCTION constexpr explicit CombinedReducerImpl( @@ -300,20 +277,6 @@ struct CombinedReductionFunctorWrapperImpl< KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReductionFunctorWrapperImpl() noexcept = default; - KOKKOS_DEFAULTED_FUNCTION - constexpr CombinedReductionFunctorWrapperImpl( - CombinedReductionFunctorWrapperImpl const&) = default; - KOKKOS_DEFAULTED_FUNCTION - constexpr CombinedReductionFunctorWrapperImpl( - CombinedReductionFunctorWrapperImpl&&) = default; - KOKKOS_DEFAULTED_FUNCTION - constexpr CombinedReductionFunctorWrapperImpl& operator=( - CombinedReductionFunctorWrapperImpl const&) = default; - KOKKOS_DEFAULTED_FUNCTION - constexpr CombinedReductionFunctorWrapperImpl& 
operator=( - CombinedReductionFunctorWrapperImpl&&) = default; - KOKKOS_DEFAULTED_FUNCTION - ~CombinedReductionFunctorWrapperImpl() = default; KOKKOS_INLINE_FUNCTION constexpr explicit CombinedReductionFunctorWrapperImpl(Functor arg_functor) diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index 11c0aa68bc4..08b67cad3d1 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -127,8 +127,6 @@ int get_device_count() { #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - return omp_get_num_devices(); #else Kokkos::abort("implementation bug"); return -1; @@ -166,8 +164,6 @@ std::vector const& Kokkos::Impl::get_visible_devices() { int device = HIP().hip_device(); #elif defined(KOKKOS_ENABLE_OPENACC) int device = Experimental::OpenACC().acc_device_number(); -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) int device = Impl::SYCLInternal::m_syclDev; #else @@ -687,15 +683,19 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_AMPERE86) declare_configuration_metadata("architecture", "GPU architecture", "AMPERE86"); #elif defined(KOKKOS_ARCH_AMPERE87) - declare_configuration_metadata("architecture", "GPU architecture", "AMPERE87"); + declare_configuration_metadata("architecture", "GPU architecture", "AMPERE87"); #elif defined(KOKKOS_ARCH_ADA89) declare_configuration_metadata("architecture", "GPU architecture", "ADA89"); #elif defined(KOKKOS_ARCH_HOPPER90) declare_configuration_metadata("architecture", "GPU architecture", "HOPPER90"); #elif defined(KOKKOS_ARCH_BLACKWELL100) declare_configuration_metadata("architecture", "GPU architecture", "BLACKWELL100"); +#elif defined(KOKKOS_ARCH_BLACKWELL103) + 
declare_configuration_metadata("architecture", "GPU architecture", "BLACKWELL103"); #elif defined(KOKKOS_ARCH_BLACKWELL120) declare_configuration_metadata("architecture", "GPU architecture", "BLACKWELL120"); +#elif defined(KOKKOS_ARCH_BLACKWELL121) + declare_configuration_metadata("architecture", "GPU architecture", "BLACKWELL121"); #elif defined(KOKKOS_ARCH_AMD_GFX906) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX906"); #elif defined(KOKKOS_ARCH_AMD_GFX908) diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 4863ddec6fa..2d0f947bc93 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -25,7 +25,8 @@ namespace Impl { // {{{1 template -struct GraphImpl : private InstanceStorage { +struct GraphImpl + : private InstanceStorage> { public: using root_node_impl_t = GraphNodeImpl { using aggregate_impl_t = GraphNodeAggregateDefaultImpl; private: - using execution_space_instance_storage_base_t = - InstanceStorage; + using device_handle_t = Kokkos::Impl::DeviceHandle; + + using device_handle_storage_base_t = + InstanceStorage>; using node_details_t = GraphNodeBackendSpecificDetails; std::set> m_sinks; @@ -53,14 +56,14 @@ struct GraphImpl : private InstanceStorage { GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl() = default; - explicit GraphImpl(ExecutionSpace arg_space) - : execution_space_instance_storage_base_t(std::move(arg_space)) {} + explicit GraphImpl(const device_handle_t& device_handle) + : device_handle_storage_base_t(device_handle) {} // end Constructors, destructor, and assignment }}}2 //---------------------------------------------------------------------------- - ExecutionSpace const& get_execution_space() const { - return this->execution_space_instance_storage_base_t::instance(); + device_handle_t const& get_device_handle() const { + return 
this->device_handle_storage_base_t::instance(); } //---------------------------------------------------------------------------- @@ -110,13 +113,13 @@ struct GraphImpl : private InstanceStorage { GraphNodeImpl; return GraphAccess::make_node_shared_ptr( - this->get_execution_space(), _graph_node_kernel_ctor_tag{}, + this->get_device_handle(), _graph_node_kernel_ctor_tag{}, aggregate_impl_t{}); } auto create_root_node_ptr() { auto rv = Kokkos::Impl::GraphAccess::make_node_shared_ptr( - get_execution_space(), _graph_node_is_root_ctor_tag{}); + get_device_handle(), _graph_node_is_root_ctor_tag{}); m_sinks.insert(rv); return rv; } diff --git a/lib/kokkos/core/src/impl/Kokkos_DeviceHandle.hpp b/lib/kokkos/core/src/impl/Kokkos_DeviceHandle.hpp new file mode 100644 index 00000000000..4315d4c4ff0 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_DeviceHandle.hpp @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project + +#ifndef KOKKOS_IMPL_KOKKOS_DEVICEHANDLE_HPP +#define KOKKOS_IMPL_KOKKOS_DEVICEHANDLE_HPP + +#include "View/Kokkos_ViewCtor.hpp" + +namespace Kokkos::Impl { +template +struct DeviceHandle { + // A device is implicitly contained in an execution space instance. + static DeviceHandle from(Exec exec) { + return DeviceHandle{.m_exec = std::move(exec)}; + } + + auto operator<=>(const DeviceHandle&) const = default; + + // For now, let's store the execution space instance. + // It is the best portable way to ensure that the device handle has enough + // information. It is an implementation detail, let's keep this reference + // counted member extractible easily (no need to make it private yet). + // In the future, it could be treated as the default execution queue for the + // device. 
+ Exec m_exec; +}; + +template +struct is_device_handle : public std::false_type {}; + +template +struct is_device_handle> : public std::true_type {}; + +template +constexpr bool is_device_handle_v = is_device_handle::value; +} // namespace Kokkos::Impl + +namespace Kokkos::Experimental { +// The user shall treat the return type as an opaque type. +template + requires Kokkos::ExecutionSpace> +auto get_device_handle(Exec&& exec) { + return Kokkos::Impl::DeviceHandle>::from( + std::forward(exec)); +} +} // namespace Kokkos::Experimental + +#endif // KOKKOS_IMPL_KOKKOS_DEVICEHANDLE_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp index 5ed2aa6d263..b7883fd4156 100644 --- a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp @@ -53,21 +53,6 @@ struct EBOBaseImpl { long> = 0> inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {} - KOKKOS_DEFAULTED_FUNCTION - constexpr EBOBaseImpl(EBOBaseImpl const&) = default; - - KOKKOS_DEFAULTED_FUNCTION - constexpr EBOBaseImpl(EBOBaseImpl&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - constexpr EBOBaseImpl& operator=(EBOBaseImpl const&) = default; - - KOKKOS_DEFAULTED_FUNCTION - constexpr EBOBaseImpl& operator=(EBOBaseImpl&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - ~EBOBaseImpl() = default; - KOKKOS_INLINE_FUNCTION constexpr T& _ebo_data_member() & { return *reinterpret_cast(this); } @@ -116,23 +101,6 @@ struct EBOBaseImpl { noexcept(T(std::forward(args)...))) : m_ebo_object(std::forward(args)...) {} - // TODO @tasking @minor DSH noexcept in the right places? 
- - KOKKOS_DEFAULTED_FUNCTION - constexpr EBOBaseImpl(EBOBaseImpl const&) = default; - - KOKKOS_DEFAULTED_FUNCTION - constexpr EBOBaseImpl(EBOBaseImpl&&) noexcept = default; - - KOKKOS_DEFAULTED_FUNCTION - constexpr EBOBaseImpl& operator=(EBOBaseImpl const&) = default; - - KOKKOS_DEFAULTED_FUNCTION - constexpr EBOBaseImpl& operator=(EBOBaseImpl&&) = default; - - KOKKOS_DEFAULTED_FUNCTION - ~EBOBaseImpl() = default; - KOKKOS_INLINE_FUNCTION T& _ebo_data_member() & { return m_ebo_object; } diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp index c4bded91ddc..0f7c1a4f4f5 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp @@ -29,7 +29,9 @@ Impl::PerThreadValue PerThread(const size_t& arg) { return Impl::PerThreadValue(arg); } -void team_policy_check_valid_storage_level_argument(int level) { +} // namespace Kokkos + +void Kokkos::Impl::team_policy_check_valid_storage_level_argument(int level) { if (!(level == 0 || level == 1)) { std::stringstream ss; ss << "TeamPolicy::set_scratch_size(/*level*/ " << level @@ -37,5 +39,3 @@ void team_policy_check_valid_storage_level_argument(int level) { abort(ss.str().c_str()); } } - -} // namespace Kokkos diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp index e34e4a25e9a..cc65a9a97f7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -5,6 +5,7 @@ #define KOKKOS_FUNCTORANALYSIS_HPP #include +#include #include #include diff --git a/lib/kokkos/core/src/impl/Kokkos_GraphNodeCtorProps.hpp b/lib/kokkos/core/src/impl/Kokkos_GraphNodeCtorProps.hpp new file mode 100644 index 00000000000..75039528eb9 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_GraphNodeCtorProps.hpp @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: 
Copyright Contributors to the Kokkos project + +#ifndef KOKKOS_IMPL_KOKKOS_GRAPHNODECTORPROPS_HPP +#define KOKKOS_IMPL_KOKKOS_GRAPHNODECTORPROPS_HPP + +#include "impl/Kokkos_DeviceHandle.hpp" +#include "View/Kokkos_ViewCtor.hpp" + +namespace Kokkos::Impl { + +template +concept ValidNodeProperty = is_view_label_v || is_device_handle_v; + +// Transform a type into a valid property. +template +struct NodeCtorPropTransform { + using type = T; +}; + +template +struct NodeCtorPropTransform { + using type = std::string; +}; + +// A single property. +template +struct NodeCtorProp { + using value_type = typename NodeCtorPropTransform::type; + + explicit NodeCtorProp(value_type value) : m_value(std::move(value)) {} + + value_type m_value; +}; + +// Aggregated properties. +template +struct NodeCtorProps : public NodeCtorProp... { + using properties_value_type_list_t = + Kokkos::Impl::type_list::value_type...>; + + static_assert(Kokkos::Impl::type_list_size_v>> <= 1, + "Only one label allowed."); + static_assert(Kokkos::Impl::type_list_size_v>> <= + 1, + "Only one device handle allowed."); + + using uniform_type = + NodeCtorProps::value_type...>; + + template + static constexpr bool has = + Kokkos::Impl::type_list_contains_v; + + NodeCtorProps() = default; + + // NOLINTBEGIN(modernize-type-traits) + template + requires(std::constructible_from, Args &&> && ...) + // NOLINTEND(modernize-type-traits) + explicit NodeCtorProps(Args&&... args) + : NodeCtorProp{std::forward(args)}... 
{} +}; + +template +struct is_node_props : public std::false_type {}; + +template +struct is_node_props> : public std::true_type {}; + +template +constexpr bool is_node_props_v = is_node_props::value; + +template +concept NodeProperties = is_node_props_v; + +template + requires Props::template +has [[nodiscard]] constexpr decltype(auto) get_property( + const Props& props) { + return static_cast&>(props).m_value; +} + +template + requires Props::template +has [[nodiscard]] constexpr decltype(auto) extract_property( + Props& props) { + return std::move(static_cast&>(props).m_value); +} + +struct WithProperty { + template + requires(sizeof...(Props) > 0) + [[nodiscard]] static constexpr decltype(auto) set( + NodeCtorProps props, Property&& property) { + using NewNodeCtorProps = + typename NodeCtorProps>::uniform_type; + return NewNodeCtorProps{extract_property(props)..., + std::forward(property)}; + } + + template + [[nodiscard]] static constexpr decltype(auto) set(NodeCtorProps<>, + Property&& prop) { + return typename NodeCtorProps>::uniform_type{ + std::forward(prop)}; + } +}; + +template +[[nodiscard]] constexpr decltype(auto) with_properties_if_unset( + Props node_props) noexcept { + return node_props; +} + +template +[[nodiscard]] constexpr decltype(auto) with_properties_if_unset( + Props node_props, [[maybe_unused]] Property&& property, + Properties&&... properties) { + if constexpr (!Props::template has>::value_type>) { + return with_properties_if_unset( + WithProperty::set(std::move(node_props), + std::forward(property)), + std::forward(properties)...); + } else { + return with_properties_if_unset(std::move(node_props), + std::forward(properties)...); + } +} + +} // namespace Kokkos::Impl + +namespace Kokkos::Experimental { +template +[[nodiscard]] constexpr auto node_props(Args&&... 
args) { + using return_t = typename Kokkos::Impl::NodeCtorProps< + std::remove_cvref_t...>::uniform_type; + return return_t{std::forward(args)...}; +} + +} // namespace Kokkos::Experimental + +#endif // KOKKOS_IMPL_KOKKOS_GRAPHNODECTORPROPS_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp b/lib/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp index e8e6b11de8c..a7dada4a6a1 100644 --- a/lib/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_GraphNodeImpl.hpp @@ -28,7 +28,7 @@ template struct GraphNodeImpl : GraphNodeBackendSpecificDetails, - InstanceStorage { + InstanceStorage> { public: using node_ref_t = Kokkos::Experimental::GraphNodeRef; - using execution_space_storage_base_t = InstanceStorage; - - public: - virtual ~GraphNodeImpl() = default; + using device_handle_t = Kokkos::Impl::DeviceHandle; + using device_handle_storage_base_t = InstanceStorage; protected: //---------------------------------------------------------------------------- // {{{2 - explicit GraphNodeImpl(ExecutionSpace const& ex) noexcept - : implementation_base_t(), execution_space_storage_base_t(ex) {} + explicit GraphNodeImpl(device_handle_t const& device_handle) noexcept + : implementation_base_t(), device_handle_storage_base_t(device_handle) {} // end protected ctors and destructors }}}2 //---------------------------------------------------------------------------- @@ -57,10 +55,10 @@ struct GraphNodeImpl {{{2 template - GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, - Args&&... args) noexcept + GraphNodeImpl(device_handle_t const& device_handle, + _graph_node_is_root_ctor_tag, Args&&... 
args) noexcept : implementation_base_t(_graph_node_is_root_ctor_tag{}, (Args&&)args...), - execution_space_storage_base_t(ex) {} + device_handle_storage_base_t(device_handle) {} // end public(-ish) constructors }}}2 //---------------------------------------------------------------------------- @@ -73,12 +71,13 @@ struct GraphNodeImpl end no other constructors }}}2 //---------------------------------------------------------------------------- - ExecutionSpace const& execution_space_instance() const { - return this->execution_space_storage_base_t::instance(); + device_handle_t get_device_handle() const { + return this->device_handle_storage_base_t::instance(); } }; @@ -100,6 +99,9 @@ struct GraphNodeImpl; + protected: + using typename base_t::device_handle_t; + public: //---------------------------------------------------------------------------- // {{{2 @@ -130,13 +132,15 @@ struct GraphNodeImpl || std::is_same_v || std::is_same_v>> - GraphNodeImpl(ExecutionSpace const& ex, Tag, KernelDeduced&& arg_kernel) - : base_t(ex), m_kernel{(KernelDeduced&&)arg_kernel} {} + GraphNodeImpl(device_handle_t const& device_handle, Tag, + KernelDeduced&& arg_kernel) + : base_t(device_handle), m_kernel{(KernelDeduced&&)arg_kernel} {} template - GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, - Args&&... args) - : base_t(ex, _graph_node_is_root_ctor_tag{}, (Args&&)args...) {} + GraphNodeImpl(device_handle_t const& device_handle, + _graph_node_is_root_ctor_tag, Args&&... args) + : base_t(device_handle, _graph_node_is_root_ctor_tag{}, (Args&&)args...) 
{ + } //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // {{{3 @@ -191,6 +195,8 @@ struct GraphNodeImpl GraphNodeImpl; + using typename base_t::device_handle_t; + public: //---------------------------------------------------------------------------- // {{{2 @@ -228,24 +234,25 @@ struct GraphNodeImpl std::is_same_v || std::is_same_v || std::is_same_v>> - GraphNodeImpl(ExecutionSpace const& ex, Tag, KernelDeduced&& arg_kernel, - _graph_node_predecessor_ctor_tag, + GraphNodeImpl(device_handle_t const& device_handle, Tag, + KernelDeduced&& arg_kernel, _graph_node_predecessor_ctor_tag, PredecessorPtrDeduced&& arg_predecessor) - : base_t(ex, Tag{}, (KernelDeduced&&)arg_kernel), + : base_t(device_handle, Tag{}, (KernelDeduced&&)arg_kernel), // The backend gets the ability to store (weak, non-owning) references // to the kernel in it's final resting place here if it wants. The // predecessor is already a pointer, so it doesn't matter that it isn't // already at its final address - backend_details_base_t(ex, this->base_t::get_kernel(), arg_predecessor, - *this), + backend_details_base_t(device_handle.m_exec, this->base_t::get_kernel(), + arg_predecessor, *this), m_predecessor_ref((PredecessorPtrDeduced&&)arg_predecessor) {} // Root-tagged constructor template - GraphNodeImpl(ExecutionSpace const& ex, _graph_node_is_root_ctor_tag, - Args&&... args) - : base_t(ex, _graph_node_is_root_ctor_tag{}, (Args&&)args...), - backend_details_base_t(ex, _graph_node_is_root_ctor_tag{}, *this), + GraphNodeImpl(device_handle_t const& device_handle, + _graph_node_is_root_ctor_tag, Args&&... 
args) + : base_t(device_handle, _graph_node_is_root_ctor_tag{}, (Args&&)args...), + backend_details_base_t(device_handle.m_exec, + _graph_node_is_root_ctor_tag{}, *this), m_predecessor_ref() {} // end Ctors, destructors, and assignment }}}2 diff --git a/lib/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp b/lib/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp index 87c4137af03..2921957c56c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Half_FloatingPointWrapper.hpp @@ -27,9 +27,15 @@ struct is_bfloat16 : std::false_type {}; // KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH: A macro to select which // floating_pointer_wrapper operator paths should be used. For CUDA, let the -// compiler conditionally select when device ops are used For SYCL, we have a -// full half type on both host and device -#if defined(__CUDA_ARCH__) || defined(KOKKOS_ENABLE_SYCL) +// compiler conditionally select when device ops are used. For SYCL, we have a +// full half type on both host and device. For HIP, we have a full half type on +// host and device only for ROCm 6.4 and later. 
+#if defined(__CUDA_ARCH__) || \ + (defined(KOKKOS_ENABLE_HIP) && \ + ((HIP_VERSION_MAJOR > 6 || \ + (HIP_VERSION_MAJOR == 6 && HIP_VERSION_MINOR >= 4)) || \ + defined(__HIP_DEVICE_COMPILE__))) || \ + defined(KOKKOS_ENABLE_SYCL) #define KOKKOS_HALF_IS_FULL_TYPE_ON_ARCH #endif @@ -278,27 +284,6 @@ class alignas(FloatType) floating_point_wrapper { KOKKOS_FUNCTION floating_point_wrapper() : val(0.0F) {} -// Copy constructors -// Getting "C2580: multiple versions of a defaulted special -// member function are not allowed" with VS 16.11.3 and CUDA 11.4.2 -#if defined(_WIN32) && defined(KOKKOS_ENABLE_CUDA) - KOKKOS_FUNCTION - floating_point_wrapper(const floating_point_wrapper& rhs) : val(rhs.val) {} - - KOKKOS_FUNCTION - floating_point_wrapper& operator=(const floating_point_wrapper& rhs) { - val = rhs.val; - return *this; - } -#else - KOKKOS_DEFAULTED_FUNCTION - floating_point_wrapper(const floating_point_wrapper&) noexcept = default; - - KOKKOS_DEFAULTED_FUNCTION - floating_point_wrapper& operator=(const floating_point_wrapper&) noexcept = - default; -#endif - KOKKOS_FUNCTION floating_point_wrapper(bit_comparison_type rhs) { val = Kokkos::bit_cast(rhs); diff --git a/lib/kokkos/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp b/lib/kokkos/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp index c90097b48e0..73d28dd1bbf 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Half_MathematicalFunctions.hpp @@ -17,6 +17,10 @@ #include #endif +#ifdef KOKKOS_ENABLE_HIP +#include +#endif + #ifdef KOKKOS_ENABLE_SYCL #include #endif @@ -25,25 +29,25 @@ namespace Kokkos { // BEGIN macro definitions #if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT - #define KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC) \ - MACRO(FUNC, Kokkos::Experimental::half_t) + #define KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC, /*MAYBE_RET*/...) 
\ + MACRO(FUNC, Kokkos::Experimental::half_t __VA_OPT__(,) __VA_ARGS__) #else - #define KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC) + #define KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC, ...) #endif #if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT - #define KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC) \ - MACRO(FUNC, Kokkos::Experimental::bhalf_t) + #define KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC, /*MAYBE_RET*/...) \ + MACRO(FUNC, Kokkos::Experimental::bhalf_t __VA_OPT__(,) __VA_ARGS__) #else - #define KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC) + #define KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC, ...) #endif -#define KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(MACRO, FUNC) \ - KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC) \ - KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC) +#define KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(MACRO, FUNC, /*MAYBE_RETURN_TYPE*/...) \ + KOKKOS_IMPL_MATH_H_FUNC_WRAPPER(MACRO, FUNC __VA_OPT__(,) __VA_ARGS__) \ + KOKKOS_IMPL_MATH_B_FUNC_WRAPPER(MACRO, FUNC __VA_OPT__(,) __VA_ARGS__) #define KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE(FUNC, HALF_TYPE) \ - namespace Impl { \ + namespace Impl { \ template \ KOKKOS_INLINE_FUNCTION HALF_TYPE impl_##FUNC(HALF_TYPE x) { \ return static_cast(Kokkos::FUNC(static_cast(x))); \ @@ -53,6 +57,17 @@ namespace Kokkos { return Kokkos::Impl::impl_##FUNC(x); \ } +#define KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE_RETURN_INT(FUNC, HALF_TYPE, INT_TYPE) \ + namespace Impl { \ + template \ + KOKKOS_INLINE_FUNCTION INT_TYPE impl_##FUNC(HALF_TYPE x) { \ + return Kokkos::FUNC(static_cast(x)); \ + } \ + } /* namespace Impl */ \ + KOKKOS_INLINE_FUNCTION INT_TYPE FUNC(HALF_TYPE x) { \ + return Kokkos::Impl::impl_##FUNC(x); \ + } + #define KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, MIXED_TYPE) \ namespace Impl { \ template \ @@ -106,6 +121,59 @@ namespace Kokkos { KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, long long) \ KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, 
unsigned long long) +#define KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, MIXED_TYPE) \ + namespace Impl { \ + template \ + KOKKOS_INLINE_FUNCTION double impl_##FUNC(HALF_TYPE x, MIXED_TYPE y, int* z) { \ + return Kokkos::FUNC(static_cast(x), static_cast(y), z); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION double impl_##FUNC(MIXED_TYPE x, HALF_TYPE y, int* z) { \ + return Kokkos::FUNC(static_cast(x), static_cast(y), z); \ + } \ + } /* namespace Impl */ \ + KOKKOS_INLINE_FUNCTION double FUNC(HALF_TYPE x, MIXED_TYPE y, int* z) { \ + return Kokkos::Impl::impl_##FUNC(x, y, z); \ + } \ + KOKKOS_INLINE_FUNCTION double FUNC(MIXED_TYPE x, HALF_TYPE y, int* z) { \ + return Kokkos::Impl::impl_##FUNC(x, y, z); \ + } + +#define KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF(FUNC, HALF_TYPE) \ + namespace Impl { \ + template \ + KOKKOS_INLINE_FUNCTION HALF_TYPE impl_##FUNC(HALF_TYPE x, HALF_TYPE y, int* z) { \ + return static_cast( \ + Kokkos::FUNC(static_cast(x), static_cast(y), z)); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION float impl_##FUNC(float x, HALF_TYPE y, int* z) { \ + return Kokkos::FUNC(static_cast(x), static_cast(y), z); \ + } \ + template \ + KOKKOS_INLINE_FUNCTION float impl_##FUNC(HALF_TYPE x, float y, int* z) { \ + return Kokkos::FUNC(static_cast(x), static_cast(y), z); \ + } \ + } /* namespace Impl */ \ + KOKKOS_INLINE_FUNCTION HALF_TYPE FUNC(HALF_TYPE x, HALF_TYPE y, int* z) { \ + return Kokkos::Impl::impl_##FUNC(x, y, z); \ + } \ + KOKKOS_INLINE_FUNCTION float FUNC(float x, HALF_TYPE y, int* z) { \ + return Kokkos::Impl::impl_##FUNC(x, y, z); \ + } \ + KOKKOS_INLINE_FUNCTION float FUNC(HALF_TYPE x, float y, int* z) { \ + return Kokkos::Impl::impl_##FUNC(x, y, z); \ + } \ + KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, double) \ + KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, short) \ + KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned short) \ + 
KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, int) \ + KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned int) \ + KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, long) \ + KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned long) \ + KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, long long) \ + KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF_MIXED(FUNC, HALF_TYPE, unsigned long long) + #define KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF(FUNC, HALF_TYPE) \ namespace Impl { \ @@ -123,7 +191,7 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, ab KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, fabs) KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, fmod) KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, remainder) -// remquo +KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF, remquo) // fma KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, fmax) KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF, fmin) @@ -167,15 +235,18 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, ce KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, floor) KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, trunc) KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, round) -// lround -// llround -// FIXME_SYCL not available as of current SYCL 2020 specification (revision 4) -#ifndef KOKKOS_ENABLE_SYCL // FIXME_SYCL +// FIXME_SYCL not available as of current SYCL 2020 specification (revision 11) +#ifndef KOKKOS_ENABLE_SYCL +KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE_RETURN_INT, lround, long) 
+KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE_RETURN_INT, llround, long long) KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, nearbyint) #endif -// rint -// lrint -// llrint +KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, rint) +#ifndef KOKKOS_ENABLE_SYCL +// FIXME_SYCL not available as of current SYCL 2020 specification (revision 11) +KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE_RETURN_INT, lrint, long ) +KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE_RETURN_INT, llrint, long long) +#endif // Floating point manipulation functions // frexp // ldexp @@ -183,6 +254,7 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, ne // scalbn // scalbln // ilog +KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE_RETURN_INT, ilogb, int) KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, logb) // FIXME nextafter for fp16 is unavailable for MSVC CUDA builds @@ -391,8 +463,69 @@ KOKKOS_INLINE_FUNCTION Kokkos::Experimental::bhalf_t nextafter(Kokkos::Experimen #endif #endif // !(defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_COMPILER_MSVC)) -// isnormal -KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, signbit) +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isnormal(Kokkos::Experimental::half_t x) { +#if defined(KOKKOS_ENABLE_HIP) + // FIXME_HIP + // Workaround for NaN with HIP + if (x != x) { return false; } +#endif + auto abs = Kokkos::abs(x); + return (abs >= Kokkos::Experimental::norm_min_v)&&( + abs <= Kokkos::Experimental::finite_max_v); +} +#endif + +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool isnormal(Kokkos::Experimental::bhalf_t x) { +#if defined(KOKKOS_ENABLE_HIP) + // FIXME_HIP + // Workaround for NaN with HIP + if (x != 
x) { return false; } +#endif + auto abs = Kokkos::abs(x); + return (abs >= Kokkos::Experimental::norm_min_v)&&( + abs <= Kokkos::Experimental::finite_max_v); +} +#endif + +#define KOKKOS_IMPL_HALF_MATH_FPCLASSIFY(TYPE) \ + KOKKOS_INLINE_FUNCTION int fpclassify(TYPE x) { \ + if (x != x) { \ + return FP_NAN; \ + } else if (x == 0) { \ + return FP_ZERO; \ + } else if (Kokkos::abs(x) < Kokkos::Experimental::norm_min_v) { \ + return FP_SUBNORMAL; \ + } else if (Kokkos::abs(x) == Kokkos::Experimental::infinity_v) { \ + return FP_INFINITE; \ + } else { \ + return FP_NORMAL; \ + } \ + } + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_IMPL_HALF_MATH_FPCLASSIFY(Kokkos::Experimental::half_t) +#endif + +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_IMPL_HALF_MATH_FPCLASSIFY(Kokkos::Experimental::bhalf_t) +#endif + +#undef KOKKOS_IMPL_HALF_MATH_FPCLASSIFY + +#if defined(KOKKOS_HALF_T_IS_FLOAT) && !KOKKOS_HALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool signbit(Kokkos::Experimental::half_t x) { + constexpr std::uint16_t sign_mask = 1u<<15; + return (Kokkos::bit_cast(x) & sign_mask) != 0; +} +#endif +#if defined(KOKKOS_BHALF_T_IS_FLOAT) && !KOKKOS_BHALF_T_IS_FLOAT +KOKKOS_INLINE_FUNCTION bool signbit(Kokkos::Experimental::bhalf_t x) { + constexpr std::uint16_t sign_mask = 1u<<15; + return (Kokkos::bit_cast(x) & sign_mask) != 0; +} +#endif // isgreater // isgreaterequal // isless @@ -400,6 +533,10 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF, signbi // islessgreater // isunordered +// Non-standard functions +KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, rsqrt) +KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE, rcp) + // Implementation test function: check if fallback for half and bhalf type are used namespace Impl { template @@ -433,7 +570,9 @@ KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER(KOKKOS_IMPL_MATH_COMPLEX_IMAG_HALF, imag) #undef 
KOKKOS_IMPL_MATH_COMPLEX_IMAG_HALF #undef KOKKOS_IMPL_MATH_UNARY_PREDICATE_HALF #undef KOKKOS_IMPL_MATH_BINARY_FUNCTION_HALF +#undef KOKKOS_IMPL_MATH_TERNARY_INT_PTR_FUNCTION_HALF #undef KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE +#undef KOKKOS_IMPL_MATH_UNARY_FUNCTION_HALF_TYPE_RETURN_INT #undef KOKKOS_IMPL_MATH_HALF_FUNC_WRAPPER #undef KOKKOS_IMPL_MATH_B_FUNC_WRAPPER #undef KOKKOS_IMPL_MATH_H_FUNC_WRAPPER diff --git a/lib/kokkos/core/src/impl/Kokkos_Half_NumericTraits.hpp b/lib/kokkos/core/src/impl/Kokkos_Half_NumericTraits.hpp index 74be48ac9f1..fce775d3c6e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Half_NumericTraits.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Half_NumericTraits.hpp @@ -142,6 +142,20 @@ struct Kokkos::Experimental::Impl::norm_min_helper< static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'00001'0000000000}; // 0.00006103515625 }; +/// \brief: Smallest positive non-zero value +/// +/// Smallest positive non-zero value +/// [s e e e e e f f f f f f f f f f] +/// [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1] +/// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +/// +/// and in base10: 2**-24 = 5.96046e-08 +template <> +struct Kokkos::Experimental::Impl::denorm_min_helper< + Kokkos::Experimental::half_t> { + static constexpr Kokkos::Experimental::half_t::bit_comparison_type value{0b0'00000'0000000001}; +}; + /// \brief: Quiet not a half precision number /// /// IEEE 754 defines this as all exponent bits and the first fraction bit high. @@ -204,7 +218,7 @@ struct Kokkos::Experimental::Impl::radix_helper { /// \brief: This is the smallest possible exponent value /// -/// Stdc defines this as the smallest possible exponent value for type binary16. +/// Stdc defines this as the smallest possible exponent value for type binary16. /// More precisely, it is the minimum negative integer such that the value min_exponent_helper /// raised to this power minus 1 can be represented as a normalized floating point number of type float. 
/// @@ -212,10 +226,10 @@ struct Kokkos::Experimental::Impl::radix_helper { /// [s e e e e e f f f f f f f f f f] /// [0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0] /// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -/// +/// /// and in base10: 1 * 2**(2**0 - 15) * (1 + 0) /// = 2**-14 -/// +/// /// with a bias of one from (C11 5.2.4.2.2), gives -13; template <> struct Kokkos::Experimental::Impl::min_exponent_helper< @@ -229,11 +243,11 @@ struct Kokkos::Experimental::Impl::min_exponent_helper< /// [s e e e e e f f f f f f f f f f] /// [0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0] /// bit index: 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 -/// +/// /// and in base10: 1 * 2**(2**4 + 2**3 + 2**2 + 2**1 - 15) * (1 + 0) /// = 2**(30 - 15) /// = 2**15 -/// +/// /// with a bias of one from (C11 5.2.4.2.2), gives 16; template <> struct Kokkos::Experimental::Impl::max_exponent_helper< @@ -286,6 +300,12 @@ struct Kokkos::Experimental::Impl::norm_min_helper< Kokkos::Experimental::bhalf_t> { static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'00000001'0000000}; // 1.175494351e-38 }; +/// Smallest positive non-zero bhalf +template <> +struct Kokkos::Experimental::Impl::denorm_min_helper< + Kokkos::Experimental::bhalf_t> { + static constexpr Kokkos::Experimental::bhalf_t::bit_comparison_type value{0b0'00000000'0000001}; // 2^(-133)=9.18355e-41 +}; // Quiet not a bhalf number template <> struct Kokkos::Experimental::Impl::quiet_NaN_helper< diff --git a/lib/kokkos/core/src/impl/Kokkos_HostBarrier.hpp b/lib/kokkos/core/src/impl/Kokkos_HostBarrier.hpp index 32b519124ea..16090010cfc 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostBarrier.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostBarrier.hpp @@ -152,6 +152,7 @@ class HostBarrier { HostBarrier(const HostBarrier&) = delete; HostBarrier& operator=(const HostBarrier&) = delete; + ~HostBarrier() = default; private: KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp 
b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp index d0fdca7f9b5..cb1ed19576b 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -27,18 +28,11 @@ namespace Kokkos { -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -KOKKOS_DEPRECATED HostSpace::HostSpace(const HostSpace::AllocationMechanism &) - : HostSpace() {} -#endif - void *HostSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } void *HostSpace::allocate(const char *arg_label, const size_t arg_alloc_size, - const size_t - - arg_logical_size) const { + const size_t arg_logical_size) const { return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); } void *HostSpace::impl_allocate( diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index 2c677f4e6b8..b27d654a1f0 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -434,13 +434,6 @@ class HostThreadTeamMember { m_league_rank(arg_league_rank), m_league_size(arg_league_size) {} - ~HostThreadTeamMember() = default; - HostThreadTeamMember() = delete; - HostThreadTeamMember(HostThreadTeamMember&&) = default; - HostThreadTeamMember(HostThreadTeamMember const&) = default; - HostThreadTeamMember& operator=(HostThreadTeamMember&&) = default; - HostThreadTeamMember& operator=(HostThreadTeamMember const&) = default; - //---------------------------------------- KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp b/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp index 49c9f336adf..1c69417c9ec 100644 --- a/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_InitializationSettings.hpp @@ -27,34 +27,10 @@ class InitializationSettings { } \ static_assert(true, 
"no-op to require trailing semicolon") -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ - private: \ - std::optional m_##NAME; \ - \ - public: \ - KOKKOS_DEPRECATED InitializationSettings& set_##NAME(TYPE NAME) { \ - m_##NAME = NAME; \ - return *this; \ - } \ - KOKKOS_DEPRECATED bool has_##NAME() const noexcept { \ - return static_cast(m_##NAME); \ - } \ - KOKKOS_DEPRECATED TYPE get_##NAME() const noexcept { \ - return *m_##NAME; /* NOLINT(bugprone-unchecked-optional-access) */ \ - } \ - static_assert(true, "no-op to require trailing semicolon") -#else -#define KOKKOS_IMPL_DECLARE_DEPRECATED(TYPE, NAME) \ - static_assert(true, "no-op to require trailing semicolon") -#endif - public: KOKKOS_IMPL_DECLARE(int, num_threads); KOKKOS_IMPL_DECLARE(int, device_id); KOKKOS_IMPL_DECLARE(std::string, map_device_id_by); - KOKKOS_IMPL_DECLARE_DEPRECATED(int, num_devices); - KOKKOS_IMPL_DECLARE_DEPRECATED(int, skip_device); KOKKOS_IMPL_DECLARE(bool, disable_warnings); KOKKOS_IMPL_DECLARE(bool, print_configuration); KOKKOS_IMPL_DECLARE(bool, tune_internals); @@ -62,8 +38,6 @@ class InitializationSettings { KOKKOS_IMPL_DECLARE(std::string, tools_libs); KOKKOS_IMPL_DECLARE(std::string, tools_args); -#undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER_TYPE -#undef KOKKOS_IMPL_INIT_ARGS_DATA_MEMBER #undef KOKKOS_IMPL_DECLARE }; diff --git a/lib/kokkos/core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp b/lib/kokkos/core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp index 00d9cbb5752..950a813b4f9 100644 --- a/lib/kokkos/core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_NvidiaGpuArchitectures.hpp @@ -32,11 +32,15 @@ #define KOKKOS_IMPL_ARCH_NVIDIA_GPU 90 #elif defined(KOKKOS_ARCH_BLACKWELL100) #define KOKKOS_IMPL_ARCH_NVIDIA_GPU 100 +#elif defined(KOKKOS_ARCH_BLACKWELL103) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 103 #elif defined(KOKKOS_ARCH_BLACKWELL120) #define KOKKOS_IMPL_ARCH_NVIDIA_GPU 120 +#elif 
defined(KOKKOS_ARCH_BLACKWELL121) +#define KOKKOS_IMPL_ARCH_NVIDIA_GPU 121 #elif defined(KOKKOS_ENABLE_CUDA) // do not raise an error on other backends that may run on NVIDIA GPUs such as -// OpenACC, OpenMPTarget, or SYCL +// OpenACC, or SYCL #error NVIDIA GPU arch not recognized #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h b/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h index 6a21961d8f0..6e54d153062 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_C_Interface.h @@ -1,7 +1,7 @@ /* -# SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -*/ + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * SPDX-FileCopyrightText: Copyright Contributors to the Kokkos project + */ #ifndef KOKKOS_PROFILING_C_INTERFACE_HPP #define KOKKOS_PROFILING_C_INTERFACE_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index 88c5ca4efe1..87462ab7cc4 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -29,7 +29,7 @@ enum struct DeviceType { OpenMP, Cuda, HIP, - OpenMPTarget, + OpenMPTarget, // removed HPX, Threads, SYCL, @@ -54,7 +54,7 @@ inline DeviceType devicetype_from_uint32t(const uint32_t in) { case 1: return DeviceType::OpenMP; case 2: return DeviceType::Cuda; case 3: return DeviceType::HIP; - case 4: return DeviceType::OpenMPTarget; + case 4: return DeviceType::OpenMPTarget; // removed case 5: return DeviceType::HPX; case 6: return DeviceType::Threads; case 7: return DeviceType::SYCL; diff --git a/lib/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp b/lib/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp index 614d49e40b1..d64093bccaf 100644 --- a/lib/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp +++ 
b/lib/kokkos/core/src/impl/Kokkos_QuadPrecisionMath.hpp @@ -28,7 +28,7 @@ inline __float128 abs(__float128 x) { return ::fabsq(x); } inline __float128 fabs(__float128 x) { return ::fabsq(x); } inline __float128 fmod(__float128 x, __float128 y) { return ::fmodq(x, y); } inline __float128 remainder(__float128 x, __float128 y) { return ::remainderq(x, y); } -// remquo +inline __float128 remquo(__float128 x, __float128 y, int* quo) { return ::remquoq(x,y,quo); } inline __float128 fma(__float128 x, __float128 y, __float128 z) { return ::fmaq(x, y, z); } inline __float128 fmax(__float128 x, __float128 y) { return ::fmaxq(x, y); } inline __float128 fmin(__float128 x, __float128 y) { return ::fminq(x, y); } @@ -72,19 +72,19 @@ inline __float128 ceil(__float128 x) { return ::ceilq(x); } inline __float128 floor(__float128 x) { return ::floorq(x); } inline __float128 trunc(__float128 x) { return ::truncq(x); } inline __float128 round(__float128 x) { return ::roundq(x); } -// lround -// llround +inline long lround(__float128 x) { return ::lroundq(x); } +inline long long llround(__float128 x) { return ::llroundq(x); } inline __float128 nearbyint(__float128 x) { return ::nearbyintq(x); } -// rint -// lrint -// llrint +inline __float128 rint(__float128 x) { return ::rintq(x); } +inline long lrint(__float128 x) { return ::lrintq(x); } +inline long long llrint(__float128 x) { return ::llrintq(x); } // Floating point manipulation functions -// frexp -// ldexp -// modf -// scalbn -// scalbln -// ilog +inline __float128 frexp(__float128 num, int* exp) { return ::frexpq(num, exp); } +inline __float128 ldexp(__float128 num, int exp) { return ::ldexpq(num, exp); } +inline __float128 modf(__float128 num, __float128* iptr) { return ::modfq(num, iptr); } +inline __float128 scalbn(__float128 num, int exp) { return ::scalbnq(num, exp); } +inline __float128 scalbln(__float128 num, long exp) { return ::scalblnq(num, exp); } +inline int ilogb(__float128 x) { return ::ilogbq(x); } inline 
__float128 logb(__float128 x) { return ::logbq(x); } inline __float128 nextafter(__float128 x, __float128 y) { return ::nextafterq(x, y); } // nexttoward diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp index 391a2e5373c..fdfe2a405b7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp @@ -92,6 +92,7 @@ class SharedAllocationRecord { SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(SharedAllocationRecord&&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + virtual ~SharedAllocationRecord() = default; /**\brief Construct and insert into 'arg_root' tracking set. * use_count is zero. @@ -130,8 +131,6 @@ class SharedAllocationRecord { */ static void tracking_enable() { t_tracking_enabled = 1; } - virtual ~SharedAllocationRecord() = default; - SharedAllocationRecord() : m_alloc_ptr(nullptr), m_alloc_size(0), @@ -221,7 +220,15 @@ class SharedAllocationRecordCommon : public SharedAllocationRecord { static void deallocate(record_base_t* arg_rec); public: + SharedAllocationRecordCommon(const SharedAllocationRecordCommon&) = delete; + SharedAllocationRecordCommon(SharedAllocationRecordCommon&&) = delete; + SharedAllocationRecordCommon& operator=(const SharedAllocationRecordCommon&) = + delete; + SharedAllocationRecordCommon& operator=(SharedAllocationRecordCommon&&) = + delete; + ~SharedAllocationRecordCommon(); + template SharedAllocationRecordCommon( ExecutionSpace const& exec, MemorySpace const& space, @@ -305,7 +312,17 @@ class HostInaccessibleSharedAllocationRecordCommon static void deallocate(record_base_t* arg_rec); public: + HostInaccessibleSharedAllocationRecordCommon( + const HostInaccessibleSharedAllocationRecordCommon&) = delete; + HostInaccessibleSharedAllocationRecordCommon( + HostInaccessibleSharedAllocationRecordCommon&&) = delete; + 
HostInaccessibleSharedAllocationRecordCommon& operator=( + const HostInaccessibleSharedAllocationRecordCommon&) = delete; + HostInaccessibleSharedAllocationRecordCommon& operator=( + HostInaccessibleSharedAllocationRecordCommon&&) = delete; + ~HostInaccessibleSharedAllocationRecordCommon(); + template HostInaccessibleSharedAllocationRecordCommon( ExecutionSpace const& exec, MemorySpace const& space, @@ -468,11 +485,12 @@ class SharedAllocationRecord &Kokkos::Impl::deallocate), m_destroy() {} + public: SharedAllocationRecord() = delete; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; + ~SharedAllocationRecord() = default; - public: DestroyFunctor m_destroy; // Allocate with a zero use count. Incrementing the use count from zero to @@ -611,7 +629,7 @@ union SharedAllocationTracker { // Move: KOKKOS_FORCEINLINE_FUNCTION - SharedAllocationTracker(SharedAllocationTracker&& rhs) + SharedAllocationTracker(SharedAllocationTracker&& rhs) noexcept : m_record_bits(rhs.m_record_bits) { rhs.m_record_bits = DO_NOT_DEREF_FLAG; } diff --git a/lib/kokkos/core/src/impl/Kokkos_StringManipulation.hpp b/lib/kokkos/core/src/impl/Kokkos_StringManipulation.hpp index 03fd72b11d5..c59ce0cbd5c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_StringManipulation.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_StringManipulation.hpp @@ -148,6 +148,7 @@ struct to_chars_result { template KOKKOS_FUNCTION constexpr to_chars_result to_chars_i(char *first, char *last, Integral value) { + // NOLINTBEGIN(bugprone-invalid-enum-default-initialization) using Unsigned = std::conditional_t; Unsigned unsigned_val = value; // NOLINT(bugprone-signed-char-misuse) @@ -166,6 +167,7 @@ KOKKOS_FUNCTION constexpr to_chars_result to_chars_i(char *first, char *last, } to_chars_impl(first, len, unsigned_val); return {first + len, {}}; + // NOLINTEND(bugprone-invalid-enum-default-initialization) } // diff --git 
a/lib/kokkos/core/src/impl/Kokkos_TeamMDPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_TeamMDPolicy.hpp index 1c41ec56c63..8586f385f79 100644 --- a/lib/kokkos/core/src/impl/Kokkos_TeamMDPolicy.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_TeamMDPolicy.hpp @@ -103,7 +103,7 @@ KOKKOS_INLINE_FUNCTION auto nested_policy( TeamMDRangeMode, - TeamHandle const team, int count) { + TeamHandle const& team, int count) { return TeamThreadRange(team, count); } @@ -112,7 +112,7 @@ KOKKOS_INLINE_FUNCTION auto nested_policy( TeamMDRangeMode, - TeamHandle const team, int count) { + TeamHandle const& team, int count) { return ThreadVectorRange(team, count); } @@ -121,7 +121,7 @@ KOKKOS_INLINE_FUNCTION auto nested_policy( TeamMDRangeMode, - TeamHandle const team, int count) { + TeamHandle const& team, int count) { return TeamVectorRange(team, count); } diff --git a/lib/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp index fe261bb4688..be6676fa652 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Tools_Generic.hpp @@ -80,9 +80,21 @@ struct SimpleTeamSizeCalculator { int get_max_team_size(const Policy& policy, const FunctorReducer& functor_reducer, const Kokkos::ParallelReduceTag tag) { - auto max = policy.team_size_max(functor_reducer.get_functor(), - functor_reducer.get_reducer(), tag); - return max; + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || + std::is_same_v +#endif +#ifdef KOKKOS_ENABLE_HIP + || + std::is_same_v +#endif + ) + return policy.team_size_max_internal(functor_reducer.get_functor(), + functor_reducer.get_reducer(), tag); + else + return policy.team_size_max(functor_reducer.get_functor(), + functor_reducer.get_reducer(), tag); } template int get_recommended_team_size(const Policy& policy, const Functor& functor, @@ -119,15 +131,41 @@ struct ComplexReducerSizeCalculator { template int get_max_team_size(const Policy& policy, const FunctorReducer& functor_reducer, const 
Tag tag) { - return policy.team_size_max(functor_reducer.get_functor(), - functor_reducer.get_reducer(), tag); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || + std::is_same_v +#endif +#ifdef KOKKOS_ENABLE_HIP + || + std::is_same_v +#endif + ) + return policy.team_size_max_internal(functor_reducer.get_functor(), + functor_reducer.get_reducer(), tag); + else + return policy.team_size_max(functor_reducer.get_functor(), + functor_reducer.get_reducer(), tag); } template int get_recommended_team_size(const Policy& policy, const FunctorReducer& functor_reducer, const Tag tag) { - return policy.team_size_recommended(functor_reducer.get_functor(), - functor_reducer.get_reducer(), tag); + if constexpr (false +#ifdef KOKKOS_ENABLE_CUDA + || + std::is_same_v +#endif +#ifdef KOKKOS_ENABLE_HIP + || + std::is_same_v +#endif + ) + return policy.team_size_recommended_internal( + functor_reducer.get_functor(), functor_reducer.get_reducer(), tag); + else + return policy.team_size_recommended(functor_reducer.get_functor(), + functor_reducer.get_reducer(), tag); } template int get_mdrange_max_tile_size_product(const Policy& policy, diff --git a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp index f1a355f6bc9..b0a45ae6e6a 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Utilities.hpp @@ -160,8 +160,9 @@ struct concat_type_list> { // combine consecutive type_lists template -struct concat_type_list, type_list, Tail...> - : concat_type_list, Tail...> {}; +struct concat_type_list, type_list, Tail...> { + using type = concat_type_list_t, Tail...>; +}; // end concat_type_list }}}2 //------------------------------------------------------------------------------ @@ -175,12 +176,18 @@ template