diff --git a/docs/snippets/example/02_execution.cpp b/docs/snippets/example/02_execution.cpp new file mode 100644 index 000000000..1dd5e15fa --- /dev/null +++ b/docs/snippets/example/02_execution.cpp @@ -0,0 +1,50 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include + +#include + +using namespace alpaka; + +TEST_CASE("tutorial enumerate backends and executors", "[docs]") +{ + // BEGIN-TUTORIAL-enumerateDeviceSpec + auto deviceSpec = onHost::DeviceSpec{api::host, deviceKind::cpu}; + auto selector = onHost::makeDeviceSelector(deviceSpec); + + auto numDevices = selector.getDeviceCount(); + REQUIRE(numDevices >= 1u); + + auto properties = selector.getDeviceProperties(0u); + auto device = selector.makeDevice(0u); + // END-TUTORIAL-enumerateDeviceSpec + + CHECK(properties.warpSize >= 1u); + CHECK(!device.getName().empty()); + + size_t numVisitedBackends = 0u; + // BEGIN-TUTORIAL-enumerateBackends + onHost::executeForEachIfHasDevice( + [&](auto const& backend) + { + ++numVisitedBackends; + + auto backendDeviceSpec = backend[object::deviceSpec]; + auto backendExec = backend[object::exec]; + auto backendSelector = onHost::makeDeviceSelector(backendDeviceSpec); + auto backendDevice = backendSelector.makeDevice(0u); + auto backendQueue = backendDevice.makeQueue(); + + backendQueue.enqueueHostFn([]() noexcept {}); + onHost::wait(backendQueue); + + alpaka::unused(backendExec); + return EXIT_SUCCESS; + }, + onHost::allBackends(onHost::enabledApis, exec::enabledExecutors)); + // END-TUTORIAL-enumerateBackends + + CHECK(numVisitedBackends >= 1u); +} diff --git a/docs/snippets/example/05_device.cpp b/docs/snippets/example/05_device.cpp index 30f000abf..ba4eaf643 100644 --- a/docs/snippets/example/05_device.cpp +++ b/docs/snippets/example/05_device.cpp @@ -26,7 +26,7 @@ TEST_CASE("show host devices", "[docs]") // END-TUTORIAL-devCount // BEGIN-TUTORIAL-devHandleCount - // Always check the number of available compute devices! 
Alpaka always creates a valid DeviceSelector even for + // Always check the number of available compute devices! alpaka always creates a valid DeviceSelector even for // unsupported combinations of an api and deviceKind. if(numComputeDevs > 0) { diff --git a/docs/snippets/example/06_queue.cpp b/docs/snippets/example/06_queue.cpp index 4a9324bcb..df3aa44de 100644 --- a/docs/snippets/example/06_queue.cpp +++ b/docs/snippets/example/06_queue.cpp @@ -2,17 +2,23 @@ * SPDX-License-Identifier: MPL-2.0 */ +#include "docsTest.hpp" + #include +#include #include #include using namespace alpaka; -TEST_CASE("non blocking queue", "[docs]") +TEMPLATE_LIST_TEST_CASE("non blocking queue", "[docs]", docs::test::TestBackends) { - auto device = onHost::makeHostDevice(); + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); // BEGIN-TUTORIAL-nonBlockingQueue // Creating a non-blocking queue @@ -24,16 +30,19 @@ TEST_CASE("non blocking queue", "[docs]") // END-TUTORIAL-nonBlockingQueue } -TEST_CASE("blocking queue", "[docs]") +TEMPLATE_LIST_TEST_CASE("blocking queue", "[docs]", docs::test::TestBackends) { - auto device = onHost::makeHostDevice(); + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); // BEGIN-TUTORIAL-blockingQueue // Creating a blocking queue onHost::Queue queue = device.makeQueue(queueKind::blocking); uint32_t value = 42u; queue.enqueueHostFn([&value]() { value = 23u; }); - // no wait required, enqueue will wait untile the task is finished + // no wait required, enqueue will wait until the task is finished CHECK(value == 23u); // END-TUTORIAL-blockingQueue } diff --git a/docs/snippets/example/08_events.cpp b/docs/snippets/example/08_events.cpp new file mode 100644 index 000000000..03a4f1c48 --- /dev/null +++ b/docs/snippets/example/08_events.cpp 
@@ -0,0 +1,38 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +using namespace alpaka; + +TEMPLATE_LIST_TEST_CASE("tutorial events and synchronization", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue0 = device.makeQueue(); + auto queue1 = device.makeQueue(); + auto event = device.makeEvent(); + int value = 0; + + // BEGIN-TUTORIAL-eventCreation + queue0.enqueueHostFn([&value]() { value = 41; }); + queue0.enqueue(event); + // END-TUTORIAL-eventCreation + + // BEGIN-TUTORIAL-eventWait + queue1.waitFor(event); + queue1.enqueueHostFn([&value]() { value += 1; }); + onHost::wait(queue1); + // END-TUTORIAL-eventWait + + CHECK(event.isComplete()); + CHECK(value == 42); +} diff --git a/docs/snippets/example/10_memory.cpp b/docs/snippets/example/10_memory.cpp index 4ceb9d685..285ca72c6 100644 --- a/docs/snippets/example/10_memory.cpp +++ b/docs/snippets/example/10_memory.cpp @@ -2,8 +2,11 @@ * SPDX-License-Identifier: MPL-2.0 */ +#include "docsTest.hpp" + #include +#include #include #include @@ -80,20 +83,12 @@ TEST_CASE("memory allocations like", "[docs]") alpaka::unused(devDoubleBuffer); } -TEST_CASE("memory", "[docs]") +TEMPLATE_LIST_TEST_CASE("memory", "[docs]", docs::test::TestBackends) { - // Nvidia GPU: onHost::DeviceSpec{api::cuda, deviceKind::nvidiaGpu}; - // Amd GPU: onHost::DeviceSpec{api::hip, deviceKind::amdGpu}; - // Intel GPU: onHost::DeviceSpec{api::oneApi, deviceKind::intelGpu}; - // this call selects the host Cpu - auto computeDevSpec = onHost::DeviceSpec{api::host, deviceKind::cpu}; + auto computeDevSpec = TestType::makeDict()[object::deviceSpec]; auto computeDevSelector = alpaka::onHost::makeDeviceSelector(computeDevSpec); - auto numComputeDevs = computeDevSelector.getDeviceCount(); - - 
if(numComputeDevs == 0) - { - std::cout << "No device for " << onHost::getName(computeDevSpec) << " found." << std::endl; - } + if(!computeDevSelector.isAvailable()) + return; // using the typed interface and not concept + onHost::Device computeDev = computeDevSelector.makeDevice(0); @@ -133,20 +128,12 @@ TEST_CASE("memory", "[docs]") CHECK(v == 42); } -TEST_CASE("memory using std::vector", "[docs]") +TEMPLATE_LIST_TEST_CASE("memory using std::vector", "[docs]", docs::test::TestBackends) { - // Nvidia GPU: onHost::DeviceSpec{api::cuda, deviceKind::nvidiaGpu}; - // Amd GPU: onHost::DeviceSpec{api::hip, deviceKind::amdGpu}; - // Intel GPU: onHost::DeviceSpec{api::oneApi, deviceKind::intelGpu}; - // this call selects the host Cpu - auto computeDevSpec = onHost::DeviceSpec{api::host, deviceKind::cpu}; + auto computeDevSpec = TestType::makeDict()[object::deviceSpec]; auto computeDevSelector = alpaka::onHost::makeDeviceSelector(computeDevSpec); - auto numComputeDevs = computeDevSelector.getDeviceCount(); - - if(numComputeDevs == 0) - { - std::cout << "No device for " << onHost::getName(computeDevSpec) << " found."
<< std::endl; - } + if(!computeDevSelector.isAvailable()) + return; onHost::Device computeDev = computeDevSelector.makeDevice(0); onHost::Queue asyncComputeQueue = computeDev.makeQueue(); diff --git a/docs/snippets/example/11_views.cpp b/docs/snippets/example/11_views.cpp new file mode 100644 index 000000000..7d2b9601f --- /dev/null +++ b/docs/snippets/example/11_views.cpp @@ -0,0 +1,49 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include + +using namespace alpaka; + +TEMPLATE_LIST_TEST_CASE("tutorial views and subviews", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(); + + std::vector hostData{0, 1, 2, 3, 4, 5, 6, 7}; + + // BEGIN-TUTORIAL-viewCreation + auto hostView = makeView(hostData); + auto middleView = hostView.getSubView(size_t{2}, size_t{4}); + // END-TUTORIAL-viewCreation + + CHECK(hostView.getExtents().x() == 8u); + CHECK(middleView.getExtents().x() == 4u); + CHECK(middleView[Vec{size_t{0}}] == 2); + CHECK(middleView[Vec{size_t{3}}] == 5); + + // BEGIN-TUTORIAL-viewCopy + auto deviceBuffer = onHost::allocLike(device, hostView); + onHost::memcpy(queue, deviceBuffer, hostView); + + auto hostSlice = onHost::allocHost(4u); + onHost::memcpy(queue, hostSlice, deviceBuffer.getSubView(Vec{size_t{2}}, Vec{size_t{4}})); + onHost::wait(queue); + // END-TUTORIAL-viewCopy + + CHECK(hostSlice[Vec{0u}] == 2); + CHECK(hostSlice[Vec{1u}] == 3); + CHECK(hostSlice[Vec{2u}] == 4); + CHECK(hostSlice[Vec{3u}] == 5); +} diff --git a/docs/snippets/example/12_kernelIntro.cpp b/docs/snippets/example/12_kernelIntro.cpp new file mode 100644 index 000000000..0cf5eee30 --- /dev/null +++ b/docs/snippets/example/12_kernelIntro.cpp @@ -0,0 +1,75 @@ +/* Copyright 2026 René Widera + * 
SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-kernelStructure +struct VectorAddKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& lhs, + concepts::IDataSource auto const& rhs) const + { + ALPAKA_ASSERT_ACC(out.getExtents() == lhs.getExtents()); + ALPAKA_ASSERT_ACC(out.getExtents() == rhs.getExtents()); + + for(auto [i] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{out.getExtents()})) + { + out[i] = lhs[i] + rhs[i]; + } + } +}; + +// END-TUTORIAL-kernelStructure + +TEMPLATE_LIST_TEST_CASE("tutorial kernel intro vector add", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(); + + std::vector lhs(257u); + std::vector rhs(257u); + std::iota(lhs.begin(), lhs.end(), 0); + std::iota(rhs.begin(), rhs.end(), 1000); + std::vector result(lhs.size(), -1); + + auto lhsBuffer = onHost::alloc(device, static_cast(lhs.size())); + auto rhsBuffer = onHost::allocLike(device, lhsBuffer); + auto resultBuffer = onHost::allocLike(device, lhsBuffer); + + onHost::memcpy(queue, lhsBuffer, lhs); + onHost::memcpy(queue, rhsBuffer, rhs); + onHost::memset(queue, resultBuffer, 0x00); + + // BEGIN-TUTORIAL-kernelLaunch + // BEGIN-TUTORIAL-kernelFrameSpec + auto frameSpec = onHost::getFrameSpec(device, Vec{static_cast(result.size())}); + // END-TUTORIAL-kernelFrameSpec + + queue.enqueue(frameSpec, KernelBundle{VectorAddKernel{}, resultBuffer, lhsBuffer, rhsBuffer}); + + onHost::memcpy(queue, result, resultBuffer); + onHost::wait(queue); + // END-TUTORIAL-kernelLaunch + + for(size_t i = 0; i < result.size(); ++i) + { + CHECK(result[i] == lhs[i] + rhs[i]); + } +} diff --git 
a/docs/snippets/example/13_hierarchy.cpp b/docs/snippets/example/13_hierarchy.cpp new file mode 100644 index 000000000..5f97472fa --- /dev/null +++ b/docs/snippets/example/13_hierarchy.cpp @@ -0,0 +1,140 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-hierarchyKernel +struct ImageTileHierarchyKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IDataSource auto const& input, + concepts::IMdSpan auto mask, + concepts::IMdSpan auto rowCounts, + int threshold) const + { + auto const imageExtent = input.getExtents(); + auto const tileExtent = acc[frame::extent]; + + for(auto blockStart : + onAcc::makeIdxMap(acc, onAcc::worker::blocksInGrid, IdxRange{Vec{0u, 0u}, imageExtent, tileExtent})) + { + for(auto localIdx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, IdxRange{tileExtent})) + { + auto globalIdx = blockStart + localIdx; + if(globalIdx[0u] < imageExtent[0u] && globalIdx[1u] < imageExtent[1u]) + { + mask[globalIdx] = input[globalIdx] >= threshold ? 
1u : 0u; + } + } + + for(auto warpRow : + onAcc::makeIdxMap(acc, onAcc::worker::linearWarpsInBlock, onAcc::range::linearWarpsInBlock)) + { + auto rowStart = blockStart + Vec{warpRow.x(), 0u}; + if(rowStart[0u] >= imageExtent[0u] || warpRow.x() >= tileExtent[0u]) + { + continue; + } + + for(auto lane : + onAcc::makeIdxMap(acc, onAcc::worker::linearThreadsInWarp, onAcc::range::linearThreadsInWarp)) + { + auto globalIdx = rowStart + Vec{0u, lane.x()}; + if(lane.x() < tileExtent[1u] && globalIdx[1u] < imageExtent[1u] && input[globalIdx] >= threshold) + { + onAcc::atomicAdd(acc, &rowCounts[Vec{rowStart[0u]}], 1u); + } + } + } + } + } +}; + +// END-TUTORIAL-hierarchyKernel + +TEMPLATE_LIST_TEST_CASE("tutorial hierarchy blocks threads warps", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + auto const warpSize = device.getDeviceProperties().warpSize; + auto const imageExtent = Vec{4u, 2u * warpSize}; + auto const tileExtent = Vec{1u, warpSize}; + + auto hostInput = onHost::allocHost(imageExtent); + auto hostMask = onHost::allocHost(imageExtent); + auto hostRowCounts = onHost::allocHost(Vec{imageExtent[0u]}); + + for(auto idx : IdxRange{imageExtent}) + { + if(idx[0u] == 0u) + { + hostInput[idx] = 10; + } + else if(idx[0u] == 1u) + { + hostInput[idx] = idx[1u] < warpSize ? 0 : 10; + } + else if(idx[0u] == 2u) + { + hostInput[idx] = (idx[1u] % 2u == 0u) ? 
10 : 0; + } + else + { + hostInput[idx] = 0; + } + } + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto maskBuffer = onHost::allocLike(device, hostMask); + auto rowCountsBuffer = onHost::allocLike(device, hostRowCounts); + + onHost::memcpy(queue, inputBuffer, hostInput); + onHost::fill(queue, rowCountsBuffer, 0u); + + // BEGIN-TUTORIAL-hierarchyLaunch + auto frameSpec = onHost::FrameSpec{divExZero(imageExtent, tileExtent), tileExtent}; + queue.enqueue(frameSpec, KernelBundle{ImageTileHierarchyKernel{}, inputBuffer, maskBuffer, rowCountsBuffer, 5}); + // END-TUTORIAL-hierarchyLaunch + + onHost::memcpy(queue, hostMask, maskBuffer); + onHost::memcpy(queue, hostRowCounts, rowCountsBuffer); + onHost::wait(queue); + + for(auto idx : IdxRange{imageExtent}) + { + if(idx[0u] == 0u) + { + CHECK(hostMask[idx] == 1u); + } + else if(idx[0u] == 1u) + { + CHECK(hostMask[idx] == (idx[1u] < warpSize ? 0u : 1u)); + } + else if(idx[0u] == 2u) + { + CHECK(hostMask[idx] == (idx[1u] % 2u == 0u ? 
1u : 0u)); + } + else + { + CHECK(hostMask[idx] == 0u); + } + } + + CHECK(hostRowCounts[0u] == 2u * warpSize); + CHECK(hostRowCounts[1u] == warpSize); + CHECK(hostRowCounts[2u] == warpSize); + CHECK(hostRowCounts[3u] == 0u); +} diff --git a/docs/snippets/example/14_algorithms.cpp b/docs/snippets/example/14_algorithms.cpp new file mode 100644 index 000000000..569d569f9 --- /dev/null +++ b/docs/snippets/example/14_algorithms.cpp @@ -0,0 +1,158 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-transformFunctor +struct SquareValue +{ + ALPAKA_FN_ACC auto operator()(int const& value) const -> int + { + return value * value; + } +}; + +// END-TUTORIAL-transformFunctor + +// BEGIN-TUTORIAL-transformReduceFunctor +struct MultiplyValues +{ + ALPAKA_FN_ACC auto operator()(int const& a, int const& b) const -> int + { + return a * b; + } +}; + +// END-TUTORIAL-transformReduceFunctor + +// BEGIN-TUTORIAL-generatorFunctor +struct AddLinearIdx +{ + ALPAKA_FN_ACC auto operator()(int const& value, size_t const& linearIdx) const -> int + { + return value + static_cast(linearIdx); + } +}; + +// END-TUTORIAL-generatorFunctor + +TEMPLATE_LIST_TEST_CASE("tutorial onHost algorithms", "[docs]", docs::test::TestBackends) +{ + auto cfg = TestType::makeDict(); + auto selector = onHost::makeDeviceSelector(cfg[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + auto exec = cfg[object::exec]; + + std::array hostInput{1, 2, 3, 4, 5, 6, 7, 8}; + std::array hostIota{}; + std::array hostTransform{}; + std::array hostScan{}; + std::array hostGenerator{}; + + auto iotaBuffer = onHost::allocLike(device, hostInput); + auto inputBuffer = onHost::allocLike(device, hostInput); + auto transformBuffer = onHost::allocLike(device, hostInput); + auto 
scanBuffer = onHost::allocLike(device, hostInput); + auto generatorBuffer = onHost::allocLike(device, hostInput); + auto reduceBuffer = onHost::alloc(device, Vec{1u}); + auto transformReduceBuffer = onHost::alloc(device, Vec{1u}); + auto reduceHost = onHost::allocHostLike(reduceBuffer); + auto transformReduceHost = onHost::allocHostLike(transformReduceBuffer); + + onHost::memcpy(queue, inputBuffer, hostInput); + + // BEGIN-TUTORIAL-iota + onHost::iota(queue, exec, 10, iotaBuffer); + // END-TUTORIAL-iota + + // BEGIN-TUTORIAL-transformCall + onHost::transform(queue, exec, transformBuffer, ScalarFunc{SquareValue{}}, inputBuffer); + // END-TUTORIAL-transformCall + + // BEGIN-TUTORIAL-reduce + onHost::reduce(queue, exec, 0, reduceBuffer, std::plus{}, inputBuffer); + // END-TUTORIAL-reduce + + // BEGIN-TUTORIAL-scan + auto tmpBuffer = onHost::alloc(device, onHost::getScanBufferSize(inputBuffer.getExtents())); + onHost::inclusiveScan(queue, exec, tmpBuffer, scanBuffer, inputBuffer); + // END-TUTORIAL-scan + + // BEGIN-TUTORIAL-transformReduceCall + onHost::transformReduce( + queue, + exec, + 0, + transformReduceBuffer, + std::plus{}, + ScalarFunc{MultiplyValues{}}, + inputBuffer, + inputBuffer); + // END-TUTORIAL-transformReduceCall + + // BEGIN-TUTORIAL-generatorCall + auto generator = LinearizedIdxGenerator{inputBuffer.getExtents()}; + onHost::transform(queue, exec, generatorBuffer, ScalarFunc{AddLinearIdx{}}, inputBuffer, generator); + // END-TUTORIAL-generatorCall + + onHost::memcpy(queue, hostIota, iotaBuffer); + onHost::memcpy(queue, hostTransform, transformBuffer); + onHost::memcpy(queue, reduceHost, reduceBuffer); + onHost::memcpy(queue, hostScan, scanBuffer); + onHost::memcpy(queue, hostGenerator, generatorBuffer); + onHost::memcpy(queue, transformReduceHost, transformReduceBuffer); + onHost::wait(queue); + + CHECK(hostIota[0] == 10); + CHECK(hostIota[1] == 11); + CHECK(hostIota[2] == 12); + CHECK(hostIota[3] == 13); + CHECK(hostIota[4] == 14); + 
CHECK(hostIota[5] == 15); + CHECK(hostIota[6] == 16); + CHECK(hostIota[7] == 17); + + CHECK(hostTransform[0] == 1); + CHECK(hostTransform[1] == 4); + CHECK(hostTransform[2] == 9); + CHECK(hostTransform[3] == 16); + CHECK(hostTransform[4] == 25); + CHECK(hostTransform[5] == 36); + CHECK(hostTransform[6] == 49); + CHECK(hostTransform[7] == 64); + + CHECK(reduceHost[0] == 36); + CHECK(hostScan[0] == 1); + CHECK(hostScan[1] == 3); + CHECK(hostScan[2] == 6); + CHECK(hostScan[3] == 10); + CHECK(hostScan[4] == 15); + CHECK(hostScan[5] == 21); + CHECK(hostScan[6] == 28); + CHECK(hostScan[7] == 36); + + CHECK(hostGenerator[0] == 1); + CHECK(hostGenerator[1] == 3); + CHECK(hostGenerator[2] == 5); + CHECK(hostGenerator[3] == 7); + CHECK(hostGenerator[4] == 9); + CHECK(hostGenerator[5] == 11); + CHECK(hostGenerator[6] == 13); + CHECK(hostGenerator[7] == 15); + + CHECK(transformReduceHost[0] == 204); +} diff --git a/docs/snippets/example/15_kernel.cpp b/docs/snippets/example/15_kernel.cpp index c526f12f5..d0e63c86e 100644 --- a/docs/snippets/example/15_kernel.cpp +++ b/docs/snippets/example/15_kernel.cpp @@ -2,8 +2,11 @@ * SPDX-License-Identifier: MPL-2.0 */ +#include "docsTest.hpp" + #include +#include #include #include @@ -15,7 +18,7 @@ using namespace alpaka; struct AddOne { - ALPAKA_FN_ACC void operator()(auto const& acc, concepts::IMdSpan auto out) const + ALPAKA_FN_ACC void operator()(onAcc::concepts::Acc auto const& acc, concepts::IMdSpan auto out) const { for(auto i : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{out.getExtents()})) { @@ -24,20 +27,12 @@ struct AddOne } }; -TEST_CASE("first kernel", "[docs]") +TEMPLATE_LIST_TEST_CASE("first kernel", "[docs]", docs::test::TestBackends) { - // Nvidia GPU: onHost::DeviceSpec{api::cuda, deviceKind::nvidiaGpu}; - // Amd GPU: onHost::DeviceSpec{api::hip, deviceKind::amdGpu}; - // Intel GPU: onHost::DeviceSpec{api::oneApi, deviceKind::intelGpu}; - // this call selects the host Cpu - auto computeDevSpec = 
onHost::DeviceSpec{api::host, deviceKind::cpu}; + auto computeDevSpec = TestType::makeDict()[object::deviceSpec]; auto computeDevSelector = alpaka::onHost::makeDeviceSelector(computeDevSpec); - auto numComputeDevs = computeDevSelector.getDeviceCount(); - - if(numComputeDevs == 0) - { - std::cout << "No device for " << onHost::getName(computeDevSpec) << " found." << std::endl; - } + if(!computeDevSelector.isAvailable()) + return; onHost::Device computeDev = computeDevSelector.makeDevice(0); onHost::Queue computeQueue = computeDev.makeQueue(); @@ -70,7 +65,7 @@ TEST_CASE("first kernel", "[docs]") struct MDVectorAdd { ALPAKA_FN_ACC void operator()( - auto const& acc, + onAcc::concepts::Acc auto const& acc, concepts::IMdSpan auto out, concepts::IDataSource auto const& in0, concepts::IDataSource auto const& in1) const @@ -84,20 +79,13 @@ struct MDVectorAdd } }; -TEST_CASE("MD vector add kernel", "[docs]") +TEMPLATE_LIST_TEST_CASE("MD vector add kernel", "[docs]", docs::test::TestBackends) { - // Nvidia GPU: onHost::DeviceSpec{api::cuda, deviceKind::nvidiaGpu}; - // Amd GPU: onHost::DeviceSpec{api::hip, deviceKind::amdGpu}; - // Intel GPU: onHost::DeviceSpec{api::oneApi, deviceKind::intelGpu}; - // this call selects the host Cpu - auto computeDevSpec = onHost::DeviceSpec{api::host, deviceKind::cpu}; + auto cfg = TestType::makeDict(); + auto computeDevSpec = cfg[object::deviceSpec]; auto computeDevSelector = alpaka::onHost::makeDeviceSelector(computeDevSpec); - auto numComputeDevs = computeDevSelector.getDeviceCount(); - - if(numComputeDevs == 0) - { - std::cout << "No device for " << onHost::getName(computeDevSpec) << " found." 
<< std::endl; - } + if(!computeDevSelector.isAvailable()) + return; onHost::Device computeDev = computeDevSelector.makeDevice(0); onHost::Queue computeQueue = computeDev.makeQueue(); @@ -128,9 +116,8 @@ TEST_CASE("MD vector add kernel", "[docs]") onHost::wait(computeQueue); auto const beginT = std::chrono::high_resolution_clock::now(); - // we enforce serial execution because this executor is always available deviceKind::cpu and api::host computeQueue.enqueue( - exec::cpuSerial, + cfg[object::exec], frameSpec, KernelBundle{MDVectorAdd{}, computeBufferOut, computeBufferIn0, computeBufferIn1}); onHost::wait(computeQueue); diff --git a/docs/snippets/example/16_sharedMemory.cpp b/docs/snippets/example/16_sharedMemory.cpp new file mode 100644 index 000000000..64978fa12 --- /dev/null +++ b/docs/snippets/example/16_sharedMemory.cpp @@ -0,0 +1,292 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-sharedScalarKernel +struct BlockSumKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& in) const + { + auto& blockSum = onAcc::declareSharedVar(acc); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + if(idx.x() == 0u) + { + blockSum = 0; + } + } + + onAcc::syncBlockThreads(acc); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + onAcc::atomicAdd(acc, &blockSum, in[idx], onAcc::scope::block); + } + + onAcc::syncBlockThreads(acc); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + if(idx.x() == 0u) + { + out[0u] = blockSum; + } + } + } +}; + +// END-TUTORIAL-sharedScalarKernel + +// BEGIN-TUTORIAL-sharedKernel +struct ReverseFrameKernel +{ + ALPAKA_FN_ACC void 
operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& in) const + { + auto tile = onAcc::declareSharedMdArray(acc, acc[frame::extent]); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + tile[idx] = in[idx]; + } + + onAcc::syncBlockThreads(acc); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + auto reverseIdx = Vec{acc[frame::extent].x() - 1u - idx.x()}; + out[idx] = tile[reverseIdx]; + } + } +}; + +// END-TUTORIAL-sharedKernel + +// BEGIN-TUTORIAL-dynSharedMemberKernel +struct DynamicReverseKernel +{ + uint32_t dynSharedMemBytes; + + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& in) const + { + auto* tile = onAcc::getDynSharedMem(acc); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + tile[idx.x()] = in[idx]; + } + + onAcc::syncBlockThreads(acc); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + auto reverseIdx = acc[frame::extent].x() - 1u - idx.x(); + out[idx] = tile[reverseIdx]; + } + } +}; + +// END-TUTORIAL-dynSharedMemberKernel + +// BEGIN-TUTORIAL-dynSharedTraitKernel +struct DynamicScaleKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& in, + int factor) const + { + auto* cache = onAcc::getDynSharedMem(acc); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + cache[idx.x()] = in[idx] * factor; + } + + onAcc::syncBlockThreads(acc); + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInBlock, onAcc::range::frameExtent)) + { + out[idx] = cache[idx.x()]; + } + } +}; + +// END-TUTORIAL-dynSharedTraitKernel + +namespace 
alpaka::onHost::trait +{ + // BEGIN-TUTORIAL-dynSharedTraitSpec + template + struct BlockDynSharedMemBytes + { + BlockDynSharedMemBytes(DynamicScaleKernel const&, T_Spec const& spec) : m_spec(spec) + { + } + + uint32_t operator()(auto const executor, auto const& out, auto const& in, int factor) const + { + alpaka::unused(executor, out, in, factor); + auto const totalCachedElements = in.getExtents().product(); + auto const numBlocks = m_spec.getNumBlocks().product(); + assert(totalCachedElements % numBlocks == 0u); + auto const cachedFrameExtent = totalCachedElements / numBlocks; + return static_cast(cachedFrameExtent * sizeof(int)); + } + + private: + T_Spec m_spec; + }; + + // END-TUTORIAL-dynSharedTraitSpec +} // namespace alpaka::onHost::trait + +TEMPLATE_LIST_TEST_CASE("tutorial shared memory tile", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostInput{0, 1, 2, 3, 4, 5, 6, 7}; + std::array hostOutput{}; + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto outputBuffer = onHost::allocLike(device, hostInput); + + onHost::memcpy(queue, inputBuffer, hostInput); + + // BEGIN-TUTORIAL-sharedLaunch + auto frameSpec = onHost::FrameSpec{1u, CVec{}}; + queue.enqueue(frameSpec, KernelBundle{ReverseFrameKernel{}, outputBuffer, inputBuffer}); + // END-TUTORIAL-sharedLaunch + + onHost::memcpy(queue, hostOutput, outputBuffer); + onHost::wait(queue); + + CHECK(hostOutput[0] == 7); + CHECK(hostOutput[1] == 6); + CHECK(hostOutput[2] == 5); + CHECK(hostOutput[3] == 4); + CHECK(hostOutput[4] == 3); + CHECK(hostOutput[5] == 2); + CHECK(hostOutput[6] == 1); + CHECK(hostOutput[7] == 0); +} + +TEMPLATE_LIST_TEST_CASE("tutorial shared memory scalar value", "[docs]", docs::test::TestBackends) +{ + auto selector = 
onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostInput{1, 2, 3, 4, 5, 6, 7, 8}; + std::array hostOutput{}; + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto outputBuffer = onHost::allocLike(device, hostOutput); + + onHost::memcpy(queue, inputBuffer, hostInput); + + auto frameSpec = onHost::FrameSpec{1u, CVec{}}; + queue.enqueue(frameSpec, KernelBundle{BlockSumKernel{}, outputBuffer, inputBuffer}); + + onHost::memcpy(queue, hostOutput, outputBuffer); + onHost::wait(queue); + + CHECK(hostOutput[0] == 36); +} + +TEMPLATE_LIST_TEST_CASE("tutorial dynamic shared memory via member", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostInput{0, 1, 2, 3, 4, 5, 6, 7}; + std::array hostOutput{}; + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto outputBuffer = onHost::allocLike(device, hostInput); + + onHost::memcpy(queue, inputBuffer, hostInput); + + auto frameSpec = onHost::FrameSpec{1u, CVec{}}; + queue.enqueue( + frameSpec, + KernelBundle{ + DynamicReverseKernel{static_cast(hostInput.size() * sizeof(int))}, + outputBuffer, + inputBuffer}); + + onHost::memcpy(queue, hostOutput, outputBuffer); + onHost::wait(queue); + + CHECK(hostOutput[0] == 7); + CHECK(hostOutput[1] == 6); + CHECK(hostOutput[2] == 5); + CHECK(hostOutput[3] == 4); + CHECK(hostOutput[4] == 3); + CHECK(hostOutput[5] == 2); + CHECK(hostOutput[6] == 1); + CHECK(hostOutput[7] == 0); +} + +TEMPLATE_LIST_TEST_CASE("tutorial dynamic shared memory via trait", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + 
if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostInput{0, 1, 2, 3, 4, 5, 6, 7}; + std::array hostOutput{}; + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto outputBuffer = onHost::allocLike(device, hostInput); + + onHost::memcpy(queue, inputBuffer, hostInput); + + auto frameSpec = onHost::FrameSpec{1u, CVec{}}; + queue.enqueue(frameSpec, KernelBundle{DynamicScaleKernel{}, outputBuffer, inputBuffer, 3}); + + onHost::memcpy(queue, hostOutput, outputBuffer); + onHost::wait(queue); + + CHECK(hostOutput[0] == 0); + CHECK(hostOutput[1] == 3); + CHECK(hostOutput[2] == 6); + CHECK(hostOutput[3] == 9); + CHECK(hostOutput[4] == 12); + CHECK(hostOutput[5] == 15); + CHECK(hostOutput[6] == 18); + CHECK(hostOutput[7] == 21); +} diff --git a/docs/snippets/example/18_multidimKernel.cpp b/docs/snippets/example/18_multidimKernel.cpp new file mode 100644 index 000000000..892e7e693 --- /dev/null +++ b/docs/snippets/example/18_multidimKernel.cpp @@ -0,0 +1,81 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-multidimKernelStructure +struct FivePointAverageKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& in) const + { + auto extents = out.getExtents(); + ALPAKA_ASSERT_ACC(extents == in.getExtents()); + constexpr auto xDir = CVec{}; + constexpr auto yDir = CVec{}; + + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{extents})) + { + if(idx.y() == 0u || idx.x() == 0u || idx.y() + 1u == extents.y() || idx.x() + 1u == extents.x()) + { + out[idx] = in[idx]; + continue; + } + + out[idx] = (in[idx] + in[idx - yDir] + in[idx + yDir] + in[idx - xDir] + in[idx + xDir]) / 5; + } + } +}; + +// 
END-TUTORIAL-multidimKernelStructure + +TEMPLATE_LIST_TEST_CASE("tutorial multidimensional stencil kernel", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(); + + auto const problemExtents = Vec{5u, 5u}; + auto hostInput = onHost::allocHost(problemExtents); + auto hostOutput = onHost::allocHostLike(hostInput); + + for(auto& value : hostInput) + value = 0; + hostInput[Vec{2u, 2u}] = 100; + + auto inBuffer = onHost::allocLike(device, hostInput); + auto outBuffer = onHost::allocLike(device, hostInput); + + onHost::memcpy(queue, inBuffer, hostInput); + onHost::memset(queue, outBuffer, 0x00); + + // BEGIN-TUTORIAL-multidimFrameSpec + auto frameSpec = onHost::getFrameSpec(device, problemExtents); + // END-TUTORIAL-multidimFrameSpec + + // BEGIN-TUTORIAL-multidimKernelLaunch + queue.enqueue(frameSpec, KernelBundle{FivePointAverageKernel{}, outBuffer, inBuffer}); + + onHost::memcpy(queue, hostOutput, outBuffer); + onHost::wait(queue); + // END-TUTORIAL-multidimKernelLaunch + + for(auto const idx : IdxRange{problemExtents}) + { + auto const isCross = idx == Vec{2u, 2u} || idx == Vec{1u, 2u} || idx == Vec{2u, 1u} || idx == Vec{2u, 3u} + || idx == Vec{3u, 2u}; + CHECK(hostOutput[idx] == (isCross ? 
20 : 0)); + } +} diff --git a/docs/snippets/example/20_simdKernel.cpp b/docs/snippets/example/20_simdKernel.cpp index 89cf7d2c9..e14231bf0 100644 --- a/docs/snippets/example/20_simdKernel.cpp +++ b/docs/snippets/example/20_simdKernel.cpp @@ -2,8 +2,11 @@ * SPDX-License-Identifier: MPL-2.0 */ +#include "docsTest.hpp" + #include +#include #include #include @@ -16,7 +19,7 @@ using namespace alpaka; struct MDVectorSimdAdd { ALPAKA_FN_ACC void operator()( - auto const& acc, + onAcc::concepts::Acc auto const& acc, concepts::IMdSpan auto out, concepts::IDataSource auto const& in0, concepts::IDataSource auto const& in1) const @@ -35,20 +38,13 @@ struct MDVectorSimdAdd } }; -TEST_CASE("MD vector simd add kernel", "[docs]") +TEMPLATE_LIST_TEST_CASE("MD vector simd add kernel", "[docs]", docs::test::TestBackends) { - // Nvidia GPU: onHost::DeviceSpec{api::cuda, deviceKind::nvidiaGpu}; - // Amd GPU: onHost::DeviceSpec{api::hip, deviceKind::amdGpu}; - // Intel GPU: onHost::DeviceSpec{api::oneApi, deviceKind::intelGpu}; - // this call selects the host Cpu - auto computeDevSpec = onHost::DeviceSpec{api::host, deviceKind::cpu}; + auto cfg = TestType::makeDict(); + auto computeDevSpec = cfg[object::deviceSpec]; auto computeDevSelector = alpaka::onHost::makeDeviceSelector(computeDevSpec); - auto numComputeDevs = computeDevSelector.getDeviceCount(); - - if(numComputeDevs == 0) - { - std::cout << "No device for " << onHost::getName(computeDevSpec) << " found." 
<< std::endl; - } + if(!computeDevSelector.isAvailable()) + return; onHost::Device computeDev = computeDevSelector.makeDevice(0); onHost::Queue computeQueue = computeDev.makeQueue(); @@ -87,9 +83,8 @@ TEST_CASE("MD vector simd add kernel", "[docs]") std::cout << frameSpec << std::endl; onHost::wait(computeQueue); auto const beginT = std::chrono::high_resolution_clock::now(); - // we enforce serial execution because this executor is always available deviceKind::cpu and api::host computeQueue.enqueue( - exec::cpuSerial, + cfg[object::exec], frameSpec, KernelBundle{MDVectorSimdAdd{}, computeBufferOut, computeBufferIn0, computeBufferIn1}); onHost::wait(computeQueue); diff --git a/docs/snippets/example/22_atomics.cpp b/docs/snippets/example/22_atomics.cpp new file mode 100644 index 000000000..2742731c9 --- /dev/null +++ b/docs/snippets/example/22_atomics.cpp @@ -0,0 +1,63 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-atomicKernel +struct HistogramKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IDataSource auto const& input, + concepts::IMdSpan auto bins) const + { + for(auto [i] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{input.getExtents()})) + { + auto const bin = input[i]; + onAcc::atomicAdd(acc, &bins[Vec{bin}], 1u); + } + } +}; + +// END-TUTORIAL-atomicKernel + +TEMPLATE_LIST_TEST_CASE("tutorial atomics histogram", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(); + + std::array hostInput{0u, 1u, 0u, 2u, 3u, 0u, 1u, 2u, 2u, 3u, 3u, 3u}; + std::array hostBins{}; + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto binsBuffer = onHost::alloc(device, 
Vec{4u}); + + onHost::memcpy(queue, inputBuffer, hostInput); + onHost::memset(queue, binsBuffer, 0x00); + + // BEGIN-TUTORIAL-atomicLaunch + auto frameSpec = onHost::FrameSpec{divExZero(static_cast(hostInput.size()), 64u), 64u}; + queue.enqueue(frameSpec, KernelBundle{HistogramKernel{}, inputBuffer, binsBuffer}); + // END-TUTORIAL-atomicLaunch + + onHost::memcpy(queue, hostBins, binsBuffer); + onHost::wait(queue); + + CHECK(hostBins[0] == 3u); + CHECK(hostBins[1] == 2u); + CHECK(hostBins[2] == 3u); + CHECK(hostBins[3] == 4u); +} diff --git a/docs/snippets/example/24_math.cpp b/docs/snippets/example/24_math.cpp new file mode 100644 index 000000000..916572de1 --- /dev/null +++ b/docs/snippets/example/24_math.cpp @@ -0,0 +1,92 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include +#include + +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-mathKernel +struct TrigIdentityKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& angles) const + { + for(auto [i] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{angles.getExtents()})) + { + float sine{}; + float cosine{}; + math::sincos(angles[i], sine, cosine); + out[i] = math::fma(sine, sine, cosine * cosine); + } + } +}; + +// END-TUTORIAL-mathKernel + +// BEGIN-TUTORIAL-rsqrtKernel +struct DistanceKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& x) const + { + for(auto [i] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{x.getExtents()})) + { + auto const squaredLength = math::fma(x[i], x[i], 1.0f); + out[i] = math::rsqrt(squaredLength); + } + } +}; + +// END-TUTORIAL-rsqrtKernel + +TEMPLATE_LIST_TEST_CASE("tutorial math functions on device", "[docs]", docs::test::TestBackends) +{ + auto selector = 
onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(); + + std::array hostAngles{0.0f, 0.5f, 1.0f, 1.5f}; + std::array hostTrig{}; + std::array hostInvLen{}; + + auto angleBuffer = onHost::allocLike(device, hostAngles); + auto trigBuffer = onHost::allocLike(device, hostAngles); + auto invLenBuffer = onHost::allocLike(device, hostAngles); + + onHost::memcpy(queue, angleBuffer, hostAngles); + + auto frameSpec = onHost::FrameSpec{1u, 64u}; + queue.enqueue(frameSpec, KernelBundle{TrigIdentityKernel{}, trigBuffer, angleBuffer}); + queue.enqueue(frameSpec, KernelBundle{DistanceKernel{}, invLenBuffer, angleBuffer}); + + onHost::memcpy(queue, hostTrig, trigBuffer); + onHost::memcpy(queue, hostInvLen, invLenBuffer); + onHost::wait(queue); + + for(auto value : hostTrig) + { + CHECK(value == Catch::Approx(1.0f).margin(5e-6f)); + } + + for(size_t i = 0; i < hostAngles.size(); ++i) + { + auto expected = 1.0f / std::sqrt(hostAngles[i] * hostAngles[i] + 1.0f); + CHECK(hostInvLen[i] == Catch::Approx(expected).margin(5e-6f)); + } +} diff --git a/docs/snippets/example/26_warp.cpp b/docs/snippets/example/26_warp.cpp new file mode 100644 index 000000000..5e422e055 --- /dev/null +++ b/docs/snippets/example/26_warp.cpp @@ -0,0 +1,85 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-warpKernel +struct WarpSumKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IDataSource auto const& in, + concepts::IMdSpan auto out) const + { + auto const warpSize = onAcc::warp::getSize(acc); + auto const idxInWarp = onAcc::warp::getLaneIdx(acc); + auto const workSize = pCast(in.getExtents()); + for(auto [blockBase] : + onAcc::makeIdxMap(acc, onAcc::worker::linearWarpsInGrid, IdxRange{0u, 
workSize, warpSize})) + { + auto value = in[Vec{blockBase + idxInWarp}]; + for(uint32_t offset = warpSize / 2u; offset > 0u; offset /= 2u) + value += onAcc::warp::shflDown(acc, value, offset); + + if(onAcc::warp::getLaneIdx(acc) == 0u) + { + out[blockBase / warpSize] = value; + } + } + } +}; + +// END-TUTORIAL-warpKernel + +TEMPLATE_LIST_TEST_CASE("tutorial warp shuffle reduction", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + auto const warpSize = device.getDeviceProperties().warpSize; + + auto const blocks = 2u; + + std::vector hostInput(blocks * warpSize); + std::vector hostOutput(blocks, 0u); + std::vector expectedOutput(blocks, 0u); + + for(uint32_t blockIdx = 0; blockIdx < blocks; ++blockIdx) + { + for(uint32_t laneIdx = 0; laneIdx < warpSize; ++laneIdx) + { + auto const value = blockIdx * warpSize + laneIdx + 1u; + hostInput[blockIdx * warpSize + laneIdx] = value; + expectedOutput[blockIdx] += value; + } + } + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto outputBuffer = onHost::allocLike(device, hostOutput); + + onHost::memcpy(queue, inputBuffer, hostInput); + onHost::memset(queue, outputBuffer, 0x00); + + // BEGIN-TUTORIAL-warpLaunch + auto frameSpec = onHost::FrameSpec{Vec{blocks}, Vec{warpSize}}; + queue.enqueue(frameSpec, KernelBundle{WarpSumKernel{}, inputBuffer, outputBuffer}); + // END-TUTORIAL-warpLaunch + + onHost::memcpy(queue, hostOutput, outputBuffer); + onHost::wait(queue); + + for(uint32_t blockIdx = 0; blockIdx < blocks; ++blockIdx) + CHECK(hostOutput[blockIdx] == expectedOutput[blockIdx]); +} diff --git a/docs/snippets/example/28_chunkedFrames.cpp b/docs/snippets/example/28_chunkedFrames.cpp new file mode 100644 index 000000000..0672e73aa --- /dev/null +++ b/docs/snippets/example/28_chunkedFrames.cpp @@ -0,0 
+1,100 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-chunkedKernel +struct ChunkedVectorAddKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& in0, + concepts::IDataSource auto const& in1) const + { + auto frameExtent = acc[frame::extent]; + auto linearNumFrames = Vec{acc[frame::count].product()}; + auto linearFrameExtent = Vec{frameExtent.product()}; + + for(auto linearFrameIdx : onAcc::makeIdxMap(acc, onAcc::worker::linearBlocksInGrid, IdxRange{linearNumFrames})) + { + auto tile = onAcc::declareSharedMdArray(acc, frameExtent); + + for(auto linearFrameElem : + onAcc::makeIdxMap(acc, onAcc::worker::linearThreadsInBlock, IdxRange{linearFrameExtent})) + { + auto globalIdx = linearFrameIdx * frameExtent + linearFrameElem; + tile[linearFrameElem] = in0[globalIdx]; + } + + onAcc::syncBlockThreads(acc); + + for(auto linearFrameElem : onAcc::makeIdxMap( + acc, + onAcc::worker::linearThreadsInBlock, + IdxRange{linearFrameExtent}, + onAcc::traverse::tiled)) + { + auto globalIdx = linearFrameIdx * frameExtent + linearFrameElem; + out[globalIdx] = tile[linearFrameElem] + in1[globalIdx]; + } + + onAcc::syncBlockThreads(acc); + } + } +}; + +// END-TUTORIAL-chunkedKernel + +TEMPLATE_LIST_TEST_CASE("tutorial chunked frames kernel", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostIn0{0, 1, 2, 3, 4, 5, 6, 7}; + std::array hostIn1{10, 10, 10, 10, 10, 10, 10, 10}; + std::array hostOut{}; + + auto in0Buffer = onHost::allocLike(device, hostIn0); + auto in1Buffer = onHost::allocLike(device, hostIn1); + auto 
outBuffer = onHost::allocLike(device, hostIn0); + + onHost::memcpy(queue, in0Buffer, hostIn0); + onHost::memcpy(queue, in1Buffer, hostIn1); + + // BEGIN-TUTORIAL-chunkedLaunch + constexpr auto frameExtent = CVec{}; + auto const totalElems = static_cast(hostOut.size()); + auto const frameElementCount = frameExtent.product(); + REQUIRE(totalElems % frameElementCount == 0u); + auto numFrames = Vec{totalElems / frameElementCount}; + auto frameSpec = onHost::FrameSpec{numFrames, frameExtent}; + + queue.enqueue(frameSpec, KernelBundle{ChunkedVectorAddKernel{}, outBuffer, in0Buffer, in1Buffer}); + // END-TUTORIAL-chunkedLaunch + + onHost::memcpy(queue, hostOut, outBuffer); + onHost::wait(queue); + + CHECK(hostOut[0] == 10); + CHECK(hostOut[1] == 11); + CHECK(hostOut[2] == 12); + CHECK(hostOut[3] == 13); + CHECK(hostOut[4] == 14); + CHECK(hostOut[5] == 15); + CHECK(hostOut[6] == 16); + CHECK(hostOut[7] == 17); +} diff --git a/docs/snippets/example/30_random.cpp b/docs/snippets/example/30_random.cpp new file mode 100644 index 000000000..4963c05b0 --- /dev/null +++ b/docs/snippets/example/30_random.cpp @@ -0,0 +1,183 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-randomKernel +struct UniformRandomKernel +{ + ALPAKA_FN_ACC void operator()(onAcc::concepts::Acc auto const& acc, concepts::IMdSpan auto out, uint32_t seed) + const + { + for(auto [idx] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{out.getExtents()})) + { + rand::engine::Philox4x32x10 engine(seed + idx); + auto distribution = rand::distribution::UniformReal{0.0f, 1.0f, rand::interval::co}; + out[idx] = distribution(engine); + } + } +}; + +// END-TUTORIAL-randomKernel + +// BEGIN-TUTORIAL-randomIntervalsKernel +struct IntervalExamplesKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + 
concepts::IMdSpan auto coValues, + concepts::IMdSpan auto ocValues, + concepts::IMdSpan auto ccValues, + concepts::IMdSpan auto ooValues, + uint32_t seed) const + { + for(auto [idx] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{coValues.getExtents()})) + { + rand::engine::Philox4x32x10 engine(seed + idx); + coValues[idx] = rand::distribution::UniformReal{0.0f, 1.0f, rand::interval::co}(engine); + ocValues[idx] = rand::distribution::UniformReal{0.0f, 1.0f, rand::interval::oc}(engine); + ccValues[idx] = rand::distribution::UniformReal{0.0f, 1.0f, rand::interval::cc}(engine); + ooValues[idx] = rand::distribution::UniformReal{0.0f, 1.0f, rand::interval::oo}(engine); + } + } +}; + +// END-TUTORIAL-randomIntervalsKernel + +// BEGIN-TUTORIAL-randomNormalKernel +struct NormalNoiseKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + uint32_t seed, + float mean, + float stdDev) const + { + for(auto [idx] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{out.getExtents()})) + { + rand::engine::Philox4x32x10 engine(seed + idx); + rand::distribution::NormalReal normal(mean, stdDev); + out[idx] = normal(engine); + } + } +}; + +// END-TUTORIAL-randomNormalKernel + +TEMPLATE_LIST_TEST_CASE("tutorial random numbers", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostValues{}; + auto randomBuffer = onHost::allocLike(device, hostValues); + + // BEGIN-TUTORIAL-randomLaunch + auto frameSpec = onHost::getFrameSpec(device, randomBuffer.getExtents()); + queue.enqueue(frameSpec, KernelBundle{UniformRandomKernel{}, randomBuffer, 1234u}); + // END-TUTORIAL-randomLaunch + + onHost::memcpy(queue, hostValues, randomBuffer); + onHost::wait(queue); + + float sum = 0.0f; + 
for(auto value : hostValues) + { + CHECK(value >= 0.0f); + CHECK(value < 1.0f); + sum += value; + } + + CHECK(sum > 0.0f); + CHECK(sum < 8.0f); +} + +TEMPLATE_LIST_TEST_CASE("tutorial random intervals", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostCo{}; + std::array hostOc{}; + std::array hostCc{}; + std::array hostOo{}; + + auto coBuffer = onHost::allocLike(device, hostCo); + auto ocBuffer = onHost::allocLike(device, hostOc); + auto ccBuffer = onHost::allocLike(device, hostCc); + auto ooBuffer = onHost::allocLike(device, hostOo); + + auto frameSpec = onHost::getFrameSpec(device, coBuffer.getExtents()); + queue.enqueue(frameSpec, KernelBundle{IntervalExamplesKernel{}, coBuffer, ocBuffer, ccBuffer, ooBuffer, 999u}); + + onHost::memcpy(queue, hostCo, coBuffer); + onHost::memcpy(queue, hostOc, ocBuffer); + onHost::memcpy(queue, hostCc, ccBuffer); + onHost::memcpy(queue, hostOo, ooBuffer); + onHost::wait(queue); + + for(size_t i = 0; i < hostCo.size(); ++i) + { + CHECK(hostCo[i] >= 0.0f); + CHECK(hostCo[i] < 1.0f); + CHECK(hostOc[i] > 0.0f); + CHECK(hostOc[i] <= 1.0f); + CHECK(hostCc[i] >= 0.0f); + CHECK(hostCc[i] <= 1.0f); + CHECK(hostOo[i] > 0.0f); + CHECK(hostOo[i] < 1.0f); + } +} + +TEMPLATE_LIST_TEST_CASE("tutorial random normal distribution", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostValues{}; + auto randomBuffer = onHost::allocLike(device, hostValues); + + // BEGIN-TUTORIAL-randomNormalLaunch + auto frameSpec = onHost::getFrameSpec(device, randomBuffer.getExtents()); + queue.enqueue(frameSpec, 
KernelBundle{NormalNoiseKernel{}, randomBuffer, 2025u, 5.0f, 2.0f}); + // END-TUTORIAL-randomNormalLaunch + + onHost::memcpy(queue, hostValues, randomBuffer); + onHost::wait(queue); + + float mean = std::accumulate(hostValues.begin(), hostValues.end(), 0.0f) / static_cast(hostValues.size()); + CHECK(mean > 4.0f); + CHECK(mean < 6.0f); + + bool foundBelowMean = false; + bool foundAboveMean = false; + for(auto value : hostValues) + { + foundBelowMean = foundBelowMean || value < 5.0f; + foundAboveMean = foundAboveMean || value > 5.0f; + } + CHECK(foundBelowMean); + CHECK(foundAboveMean); +} diff --git a/docs/snippets/example/31_monteCarloPi.cpp b/docs/snippets/example/31_monteCarloPi.cpp new file mode 100644 index 000000000..464cce85f --- /dev/null +++ b/docs/snippets/example/31_monteCarloPi.cpp @@ -0,0 +1,66 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include +#include + +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-piKernel +struct MonteCarloPiKernel +{ + ALPAKA_FN_ACC void operator()(onAcc::concepts::Acc auto const& acc, concepts::IMdSpan auto hits, uint32_t seed) + const + { + for(auto [idx] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{hits.getExtents()})) + { + rand::engine::Philox4x32x10 engine(seed + idx); + auto uniform = rand::distribution::UniformReal{0.0f, 1.0f, rand::interval::co}; + auto x = uniform(engine); + auto y = uniform(engine); + hits[idx] = (x * x + y * y <= 1.0f) ? 
1u : 0u; + } + } +}; + +// END-TUTORIAL-piKernel + +TEMPLATE_LIST_TEST_CASE("tutorial monte carlo pi", "[docs]", docs::test::TestBackends) +{ + auto cfg = TestType::makeDict(); + auto selector = onHost::makeDeviceSelector(cfg[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + auto exec = cfg[object::exec]; + + constexpr uint32_t numSamples = 16384u; + auto hitBuffer = onHost::alloc(device, Vec{numSamples}); + auto hitCountBuffer = onHost::alloc(device, Vec{1u}); + auto hostHitCount = onHost::allocHostLike(hitCountBuffer); + + auto frameSpec = onHost::getFrameSpec(device, hitBuffer.getExtents()); + + // BEGIN-TUTORIAL-piLaunch + queue.enqueue(frameSpec, KernelBundle{MonteCarloPiKernel{}, hitBuffer, 2026u}); + onHost::reduce(queue, exec, 0u, hitCountBuffer, std::plus{}, hitBuffer); + // END-TUTORIAL-piLaunch + + onHost::memcpy(queue, hostHitCount, hitCountBuffer); + onHost::wait(queue); + + // BEGIN-TUTORIAL-piEstimate + auto estimatedPi = 4.0f * static_cast(hostHitCount[0]) / static_cast(numSamples); + // END-TUTORIAL-piEstimate + + CHECK(estimatedPi == Catch::Approx(3.14159f).margin(0.15f)); +} diff --git a/docs/snippets/example/32_intrinsics.cpp b/docs/snippets/example/32_intrinsics.cpp new file mode 100644 index 000000000..8753928a4 --- /dev/null +++ b/docs/snippets/example/32_intrinsics.cpp @@ -0,0 +1,79 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-intrinsicKernel +struct BitIntrinsicKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto popCounts, + concepts::IMdSpan auto firstSetBits, + concepts::IMdSpan auto leadingZeros, + concepts::IDataSource auto const& input) const + { + for(auto [idx] : onAcc::makeIdxMap(acc, 
onAcc::worker::threadsInGrid, IdxRange{input.getExtents()})) + { + auto value = input[idx]; + popCounts[idx] = popcount(value); + firstSetBits[idx] = ffs(value); + leadingZeros[idx] = clz(value); + } + } +}; + +// END-TUTORIAL-intrinsicKernel + +TEMPLATE_LIST_TEST_CASE("tutorial intrinsics", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostInput{0u, 1u, 0b1011'0000u, 0xFFFF'0000u}; + std::array hostPopCount{}; + std::array hostFfs{}; + std::array hostClz{}; + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto popCountBuffer = onHost::allocLike(device, hostPopCount); + auto ffsBuffer = onHost::allocLike(device, hostFfs); + auto clzBuffer = onHost::allocLike(device, hostClz); + + onHost::memcpy(queue, inputBuffer, hostInput); + + // BEGIN-TUTORIAL-intrinsicLaunch + auto frameSpec = onHost::getFrameSpec(device, inputBuffer.getExtents()); + queue.enqueue(frameSpec, KernelBundle{BitIntrinsicKernel{}, popCountBuffer, ffsBuffer, clzBuffer, inputBuffer}); + // END-TUTORIAL-intrinsicLaunch + + onHost::memcpy(queue, hostPopCount, popCountBuffer); + onHost::memcpy(queue, hostFfs, ffsBuffer); + onHost::memcpy(queue, hostClz, clzBuffer); + onHost::wait(queue); + + for(size_t i = 0; i < hostInput.size(); ++i) + { + auto value = hostInput[i]; + CHECK(hostPopCount[i] == std::popcount(value)); + CHECK(hostFfs[i] == (value == 0u ? 0 : static_cast(std::countr_zero(value) + 1u))); + CHECK( + hostClz[i] + == (value == 0u ? 
std::numeric_limits::digits : static_cast(std::countl_zero(value)))); + } +} diff --git a/docs/snippets/example/34_memFence.cpp b/docs/snippets/example/34_memFence.cpp new file mode 100644 index 000000000..a67244ce4 --- /dev/null +++ b/docs/snippets/example/34_memFence.cpp @@ -0,0 +1,143 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-memFenceBlockKernel +struct BlockFenceKernel +{ + uint32_t dynSharedMemBytes = 2u * sizeof(int); + + ALPAKA_FN_ACC void operator()(onAcc::concepts::Acc auto const& acc, concepts::IMdSpan auto successFlag) const + { + auto* shared = onAcc::getDynSharedMem(acc); + + for(auto [tid] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, onAcc::range::threadsInGrid)) + { + if(tid == 0u) + { + shared[0] = 1; + shared[1] = 2; + } + onAcc::syncBlockThreads(acc); + + if(tid == 0u) + { + shared[0] = 10; + onAcc::memFence(acc, onAcc::scope::block, onAcc::order::release); + shared[1] = 20; + } + + onAcc::syncBlockThreads(acc); + + auto observedB = shared[1]; + onAcc::memFence(acc, onAcc::scope::block, onAcc::order::acquire); + auto observedA = shared[0]; + + if(observedA == 1 && observedB == 20) + { + onAcc::atomicExch(acc, &successFlag[0u], 0u); + } + } + } +}; + +// END-TUTORIAL-memFenceBlockKernel + +// BEGIN-TUTORIAL-memFenceDeviceKernel +struct ProducerConsumerFenceKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto payload, + concepts::IMdSpan auto readyFlag, + concepts::IMdSpan auto mismatchCounter) const + { + auto [tid] = acc.getIdxWithin(onAcc::origin::grid, onAcc::unit::threads); + + if(!(tid == 0u || tid == 2u)) + { + return; + } + + if(tid == 0u) + { + payload[0u] = 42u; + onAcc::memFence(acc, onAcc::scope::device, onAcc::order::release); + onAcc::atomicExch(acc, &readyFlag[0u], 1u); + } + else + { + while(onAcc::atomicCas(acc, &readyFlag[0u], 
0u, 0u) == 0u) + { + } + + onAcc::memFence(acc, onAcc::scope::device, onAcc::order::acquire); + if(payload[0u] != 42u) + { + onAcc::atomicAdd(acc, &mismatchCounter[0u], 1u); + } + } + } +}; + +// END-TUTORIAL-memFenceDeviceKernel + +TEMPLATE_LIST_TEST_CASE("tutorial memFence block scope", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + auto successFlag = onHost::allocUnified(device, Vec{1u}); + successFlag[0u] = 1u; + + // BEGIN-TUTORIAL-memFenceBlockLaunch + queue.enqueue(onHost::FrameSpec{1u, 2u}, KernelBundle{BlockFenceKernel{}, successFlag}); + // END-TUTORIAL-memFenceBlockLaunch + + onHost::wait(queue); + CHECK(successFlag[0u] == 1u); +} + +TEMPLATE_LIST_TEST_CASE("tutorial memFence device scope", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + auto payload = onHost::alloc(device, Vec{1u}); + auto readyFlag = onHost::alloc(device, Vec{1u}); + auto mismatchCounter = onHost::alloc(device, Vec{1u}); + + auto readyInit = onHost::allocHostLike(readyFlag); + auto mismatchInit = onHost::allocHostLike(mismatchCounter); + readyInit[0u] = 0u; + mismatchInit[0u] = 0u; + + onHost::memcpy(queue, readyFlag, readyInit); + onHost::memcpy(queue, mismatchCounter, mismatchInit); + + // BEGIN-TUTORIAL-memFenceDeviceLaunch + queue.enqueue( + onHost::ThreadSpec{3u, 1u}, + KernelBundle{ProducerConsumerFenceKernel{}, payload, readyFlag, mismatchCounter}); + // END-TUTORIAL-memFenceDeviceLaunch + + onHost::memcpy(queue, mismatchInit, mismatchCounter); + onHost::wait(queue); + + CHECK(mismatchInit[0u] == 0u); +} diff --git 
a/docs/snippets/example/36_portingKernel.cpp b/docs/snippets/example/36_portingKernel.cpp new file mode 100644 index 000000000..4b167487f --- /dev/null +++ b/docs/snippets/example/36_portingKernel.cpp @@ -0,0 +1,70 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-portingKernel +struct SaxpyKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& x, + concepts::IDataSource auto const& y, + float a) const + { + for(auto [i] : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{out.getExtents()})) + { + out[i] = a * x[i] + y[i]; + } + } +}; + +// END-TUTORIAL-portingKernel + +TEMPLATE_LIST_TEST_CASE("tutorial porting saxpy kernel", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostX{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f}; + std::array hostY{10.f, 10.f, 10.f, 10.f, 10.f, 10.f, 10.f, 10.f}; + std::array hostOut{}; + + auto xBuffer = onHost::allocLike(device, hostX); + auto yBuffer = onHost::allocLike(device, hostY); + auto outBuffer = onHost::allocLike(device, hostOut); + + onHost::memcpy(queue, xBuffer, hostX); + onHost::memcpy(queue, yBuffer, hostY); + + // BEGIN-TUTORIAL-portingLaunch + auto frameSpec = onHost::getFrameSpec(device, outBuffer.getExtents()); + queue.enqueue(frameSpec, KernelBundle{SaxpyKernel{}, outBuffer, xBuffer, yBuffer, 2.0f}); + // END-TUTORIAL-portingLaunch + + onHost::memcpy(queue, hostOut, outBuffer); + onHost::wait(queue); + + CHECK(hostOut[0] == 12.f); + CHECK(hostOut[1] == 14.f); + CHECK(hostOut[2] == 16.f); + CHECK(hostOut[3] == 18.f); + CHECK(hostOut[4] == 20.f); + 
CHECK(hostOut[5] == 22.f); + CHECK(hostOut[6] == 24.f); + CHECK(hostOut[7] == 26.f); +} diff --git a/docs/snippets/example/38_vendorInterop.cpp b/docs/snippets/example/38_vendorInterop.cpp new file mode 100644 index 000000000..4da97a514 --- /dev/null +++ b/docs/snippets/example/38_vendorInterop.cpp @@ -0,0 +1,114 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include +#include + +using namespace alpaka; + +namespace vendorTutorial +{ + // BEGIN-TUTORIAL-vendorFunctor + struct AffineTransformOp + { + float scale; + float shift; + + ALPAKA_FN_ACC auto operator()(float const& value) const -> float + { + return scale * value + shift; + } + }; + + // END-TUTORIAL-vendorFunctor + + // BEGIN-TUTORIAL-vendorSymbol + ALPAKA_FN_SYMBOL(AffineTransform, alpaka::fn::Fallback::toAlpaka); + + // END-TUTORIAL-vendorSymbol + + // BEGIN-TUTORIAL-vendorFallback + template< + alpaka::concepts::DeviceKind T_DeviceKind, + typename T_Queue, + alpaka::concepts::IMdSpan T_Output, + alpaka::concepts::IMdSpan T_Input> + constexpr void fnDispatch( + AffineTransform::Spec, + T_Queue&& queue, + T_Output&& output, + float scale, + float shift, + T_Input&& input) + { + alpaka::onHost::transform( + ALPAKA_FORWARD(queue), + ALPAKA_FORWARD(output), + ScalarFunc{AffineTransformOp{scale, shift}}, + ALPAKA_FORWARD(input)); + } + + // END-TUTORIAL-vendorFallback + + // BEGIN-TUTORIAL-vendorHost + template + constexpr void fnDispatch( + AffineTransform::Spec, + T_Queue&& queue, + T_Output&& output, + float scale, + float shift, + T_Input&& input) + { + auto outPtr = output.data(); + queue.enqueueHostFn( + [=]() + { + std::transform( + input.data(), + input.data() + input.getExtents().x(), + outPtr, + [=](float value) { return scale * value + shift; }); + }); + } + + // END-TUTORIAL-vendorHost +} // namespace vendorTutorial + +TEMPLATE_LIST_TEST_CASE("tutorial vendor interop dispatch", "[docs]", 
docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + std::array hostInput{1.f, 2.f, 3.f, 4.f, 5.f}; + std::array hostOutput{}; + + auto inputBuffer = onHost::allocLike(device, hostInput); + auto outputBuffer = onHost::allocLike(device, hostOutput); + + onHost::memcpy(queue, inputBuffer, hostInput); + + // BEGIN-TUTORIAL-vendorCall + vendorTutorial::AffineTransform::call(queue, outputBuffer, 2.0f, 0.5f, inputBuffer); + // END-TUTORIAL-vendorCall + + onHost::memcpy(queue, hostOutput, outputBuffer); + onHost::wait(queue); + + CHECK(hostOutput[0] == 2.5f); + CHECK(hostOutput[1] == 4.5f); + CHECK(hostOutput[2] == 6.5f); + CHECK(hostOutput[3] == 8.5f); + CHECK(hostOutput[4] == 10.5f); +} diff --git a/docs/snippets/example/40_imagePipeline.cpp b/docs/snippets/example/40_imagePipeline.cpp new file mode 100644 index 000000000..398fab03d --- /dev/null +++ b/docs/snippets/example/40_imagePipeline.cpp @@ -0,0 +1,102 @@ +/* Copyright 2026 René Widera + * SPDX-License-Identifier: MPL-2.0 + */ + +#include "docsTest.hpp" + +#include + +#include +#include + +#include +#include + +using namespace alpaka; + +// BEGIN-TUTORIAL-imageThresholdKernel +struct ThresholdKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto out, + concepts::IDataSource auto const& in, + uint8_t threshold) const + { + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{out.getExtents()})) + { + out[idx] = in[idx] >= threshold ? 
uint8_t{255} : uint8_t{0}; + } + } +}; + +// END-TUTORIAL-imageThresholdKernel + +// BEGIN-TUTORIAL-imageHistogramKernel +struct BinaryHistogramKernel +{ + ALPAKA_FN_ACC void operator()( + onAcc::concepts::Acc auto const& acc, + concepts::IMdSpan auto bins, + concepts::IDataSource auto const& binaryImage) const + { + for(auto idx : onAcc::makeIdxMap(acc, onAcc::worker::threadsInGrid, IdxRange{binaryImage.getExtents()})) + { + auto bin = binaryImage[idx] == 0u ? uint32_t{0} : uint32_t{1}; + onAcc::atomicAdd(acc, &bins[Vec{bin}], uint32_t{1}, onAcc::scope::device); + } + } +}; + +// END-TUTORIAL-imageHistogramKernel + +TEMPLATE_LIST_TEST_CASE("tutorial image pipeline", "[docs]", docs::test::TestBackends) +{ + auto selector = onHost::makeDeviceSelector(TestType::makeDict()[object::deviceSpec]); + if(!selector.isAvailable()) + return; + auto device = selector.makeDevice(0); + auto queue = device.makeQueue(queueKind::blocking); + + auto const imageExtents = Vec{4u, 4u}; + auto hostImage = onHost::allocHost(imageExtents); + auto hostBinary = onHost::allocHostLike(hostImage); + auto hostBins = onHost::allocHost(Vec{2u}); + + std::array + inputValues{12u, 33u, 180u, 210u, 15u, 50u, 170u, 240u, 20u, 95u, 130u, 250u, 5u, 60u, 145u, 220u}; + + for(size_t i = 0; i < inputValues.size(); ++i) + { + hostImage[mapToND(imageExtents, static_cast(i))] = inputValues[i]; + } + + auto imageBuffer = onHost::allocLike(device, hostImage); + auto binaryBuffer = onHost::allocLike(device, hostImage); + auto binsBuffer = onHost::alloc(device, Vec{2u}); + + onHost::memcpy(queue, imageBuffer, hostImage); + onHost::memset(queue, binaryBuffer, 0x00); + onHost::memset(queue, binsBuffer, 0x00); + + // BEGIN-TUTORIAL-imagePipelineLaunch + auto frameSpec = onHost::getFrameSpec(device, imageExtents); + queue.enqueue(frameSpec, KernelBundle{ThresholdKernel{}, binaryBuffer, imageBuffer, uint8_t{128}}); + queue.enqueue(frameSpec, KernelBundle{BinaryHistogramKernel{}, binsBuffer, binaryBuffer}); + // 
END-TUTORIAL-imagePipelineLaunch + + onHost::memcpy(queue, hostBinary, binaryBuffer); + onHost::memcpy(queue, hostBins, binsBuffer); + onHost::wait(queue); + + // BEGIN-TUTORIAL-imagePipelineResult + auto brightPixels = hostBins[1]; + auto darkPixels = hostBins[0]; + // END-TUTORIAL-imagePipelineResult + + CHECK(darkPixels == 8u); + CHECK(brightPixels == 8u); + CHECK(hostBinary[Vec{0u, 0u}] == 0u); + CHECK(hostBinary[Vec{0u, 2u}] == 255u); + CHECK(hostBinary[Vec{3u, 2u}] == 255u); +} diff --git a/docs/snippets/example/CMakeLists.txt b/docs/snippets/example/CMakeLists.txt index 5b178d034..6f6cac521 100644 --- a/docs/snippets/example/CMakeLists.txt +++ b/docs/snippets/example/CMakeLists.txt @@ -46,7 +46,6 @@ function(alpaka_add_docs_target target_prefix folder) add_executable(${_test_target_name} ${source_file}) if(EXISTS "${test_folder}/include") - message(STATUS " add include folder: ${test_folder}/include") target_include_directories(${_test_target_name} PRIVATE ${test_folder}/include) endif() diff --git a/docs/snippets/example/include/docsTest.hpp b/docs/snippets/example/include/docsTest.hpp new file mode 100644 index 000000000..43ab2b303 --- /dev/null +++ b/docs/snippets/example/include/docsTest.hpp @@ -0,0 +1,9 @@ +#pragma once + +#include + +namespace docs::test +{ + using TestBackends = std::decay_t< + decltype(alpaka::onHost::allBackends(alpaka::onHost::enabledApis, alpaka::exec::enabledExecutors))>; +} // namespace docs::test diff --git a/docs/source/advanced/cmake.rst b/docs/source/advanced/cmake.rst index 1f35b5fb6..8c8fe3e81 100644 --- a/docs/source/advanced/cmake.rst +++ b/docs/source/advanced/cmake.rst @@ -1,7 +1,7 @@ CMake ===== -Alpaka configures a large part of its functionality at compile time. Therefore, a lot of compiler and link flags are needed, which are set by CMake arguments. First, we show a simple way to build alpaka for different back-ends using `CMake Presets `_. 
The second part of the documentation shows the general and back-end specific alpaka CMake flags. +alpaka configures a large part of its functionality at compile time. Therefore, a lot of compiler and link flags are needed, which are set by CMake arguments. First, we show a simple way to build alpaka for different back-ends using `CMake Presets `_. The second part of the documentation shows the general and back-end specific alpaka CMake flags. .. hint:: @@ -98,13 +98,11 @@ Arguments The executor ``exec::cpuSerial`` is always available and does not require any special CMake flags beside linking the taget ``alpaka::alpaka`` or ``alpaka::host``. - ``alpaka_CXX_STANDARD`` .. code-block:: markdown Set the C++ standard version. - ``alpaka_TESTS`` .. code-block:: markdown @@ -300,7 +298,6 @@ Numa Awareness **attention** If `CMake` is not **NOT** defining the preprocessor define `ALPAKA_HOST_MEM_PINNING_CAN_FAIL` will allow that pinning can fail without an exception. - Intel oneAPI Threading Building Blocks ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -345,7 +342,6 @@ Executors Enable the oneAPI SYCL executor `exec::oneApi`. - Available during the Installation ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -457,10 +453,10 @@ alpaka is providing CMake targets based on the optional activated dependencies ` Dependencies of ``alpaka_DEP_OMP`` and ``alpaka_DEP_TBB`` are linked into the target ``alpaka::host`` because they influence only host executors and do not provide additional alpaka API support. When enabling `alpaka_DEP_TBB`, make sure Intel oneTBB version 2021.10 or newer (including 2022.x releases) is available. -After linking Alpaka targets to your application, you should call ``alpaka_finalize(targetName)`` for each target that uses Alpaka. +After linking alpaka targets to your application, you should call ``alpaka_finalize(targetName)`` for each target that uses alpaka. 
``alpaka_finalize()`` is a CMake function that ensures all necessary compile definitions and compiler options are set for the given target. Depending on the enabled dependencies (``alpaka_DEP_*``), this call may copy your source files to a temporary directory and compile them with the appropriate compiler. -Linking non-Alpaka targets after calling ``alpaka_finalize()`` is allowed. +Linking non-alpaka targets after calling ``alpaka_finalize()`` is allowed. You should not include header files using relative paths in your source files. These relative paths may become invalid after ``alpaka_finalize()`` is called, because the source files can be copied to a different location. diff --git a/docs/source/advanced/datastorage.rst b/docs/source/advanced/datastorage.rst index c272e98a8..57cd36311 100644 --- a/docs/source/advanced/datastorage.rst +++ b/docs/source/advanced/datastorage.rst @@ -46,7 +46,7 @@ Memory Layout of multidimensional Data Storage ---------------------------------------------- There are several functions and parameters for improving the memory layout of multidimensional ``Data Storage`` to enhance application performance. -Alpaka supports ``Pitches``, which optimize memory loads, and ``Alignment``, which is required for vector operations such as AVX on CPUs. +alpaka supports ``Pitches``, which optimize memory loads, and ``Alignment``, which is required for vector operations such as AVX on CPUs. All alpaka functions automatically handle ``Pitches`` and ``Alignment`` during memory access. However, it is sometimes necessary to process raw memory, for example, when a memory pointer is passed from alpaka to non-alpaka code. The following section explains how alpaka implements ``Pitches`` and ``Alignment``. @@ -83,7 +83,6 @@ Therefore, the size of a row is ``5 elements * 4 Byte/element + 2 Byte = 22 Byte Matrix with [3, 5] elements, each element has a size of 4 bytes and 2 bytes of padding per row. - .. 
literalinclude:: ../../snippets/dataStorage/datastorage_pitch.cpp :language: cpp :start-after: BEGIN-DATASTORAGE-pitch2D-example @@ -117,3 +116,45 @@ The following example shows 3D memory and the corresponding values for the ``Ext Alignment ````````` + +Complete Source Files +--------------------- + +.. raw:: html + +
+ datastorage_interface.cpp + +.. literalinclude:: ../../snippets/dataStorage/datastorage_interface.cpp + :language: cpp + :linenos: + +.. raw:: html + +
+ +.. raw:: html + +
+ datastorage_writeable_datasource.cpp + +.. literalinclude:: ../../snippets/dataStorage/datastorage_writeable_datasource.cpp + :language: cpp + :linenos: + +.. raw:: html + +
+ +.. raw:: html + +
+ datastorage_pitch.cpp + +.. literalinclude:: ../../snippets/dataStorage/datastorage_pitch.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/basic/cheatsheet.rst b/docs/source/basic/cheatsheet.rst index c19ee26c7..88506fefc 100644 --- a/docs/source/basic/cheatsheet.rst +++ b/docs/source/basic/cheatsheet.rst @@ -377,3 +377,19 @@ Math functions :dedent: Similar for other math functions. + +Complete Source File +-------------------- + +.. raw:: html + +
+ cheatsheet.cpp + +.. literalinclude:: ../../snippets/cheatsheet/cheatsheet.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/basic/example.rst b/docs/source/basic/example.rst index 84686da7c..256413a9d 100644 --- a/docs/source/basic/example.rst +++ b/docs/source/basic/example.rst @@ -64,3 +64,19 @@ In the CMake configuration phase of the project, you must activate the accelerat .. A complete list of CMake flags for the accelerator can be found :doc:`here `. If the configuration was successful and CMake found the CUDA SDK, the C++ api `cuda` and the executor `gpuCuda` is available. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 30_elementWiseMultiplication.cpp + +.. literalinclude:: ../../snippets/example/30_elementWiseMultiplication.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/basic/install.rst b/docs/source/basic/install.rst index f88a199dd..8524b9b19 100644 --- a/docs/source/basic/install.rst +++ b/docs/source/basic/install.rst @@ -137,6 +137,7 @@ The following ``CMakeLists.txt`` demonstrates how to use ``FetchContent`` with * .. literalinclude:: ../../snippets/fetchContent/CMakeLists.txt :language: C++ :caption: CMakeLists.txt + Example Source Code ^^^^^^^^^^^^^^^^^^^ @@ -187,7 +188,6 @@ You can select different device specifications at CMake configuration time using The CUDA, HIP, or Intel backends are only working if the CUDA SDK, HIP SDK, or OneAPI SDK are available respectively - .. _tests-and-examples: Tests and Examples diff --git a/docs/source/basic/library.rst b/docs/source/basic/library.rst index ba0756607..b3f0e0eee 100644 --- a/docs/source/basic/library.rst +++ b/docs/source/basic/library.rst @@ -38,7 +38,6 @@ The interaction of the main user facing concepts can be seen in the following fi .. image:: /images/structure_assoc.png :alt: user / alpaka code interaction - For each type of ``Device`` there is a ``DeviceSelector`` for enumerating the available ``Device``s. A ``Device`` is the requirement for creating ``Queues`` and ``Events`` as it is for allocating ``SharedBuffers`` on the respective ``Device``. ``SharedBuffers`` can be copied, their memory be byte-wise set or filled with element-wise. @@ -50,7 +49,6 @@ An ``Executor`` can be enqueued into a ``Queue`` and will execute the ``Kernel`` The ``Kernel`` in turn has access to the ``Accelerator`` it is running on. The ``Accelerator`` provides the ``Kernel`` with its current index in the block or grid, their extents or other data as well as it allows to allocate shared memory, execute atomic operations and many more. - Interface Usage --------------- @@ -71,7 +69,6 @@ Therefore the accelerator has to be passed in as a templated constant reference //... 
} - Kernel Definition ````````````````` diff --git a/docs/source/basic/terms.rst b/docs/source/basic/terms.rst index f7e143b90..adcc8501b 100644 --- a/docs/source/basic/terms.rst +++ b/docs/source/basic/terms.rst @@ -3,7 +3,6 @@ Terms & Structure .. sectionauthor:: Simeon Ehrig, René Widera - Host and Accelerator -------------------- @@ -74,7 +73,7 @@ Each Data Storage object either points to physical memory and uses it to read an The physical memory used is usually the RAM of a CPU, the VRAM of a GPU, or the unified memory (RAM) of an APU. The properties of a Data Storage object are described by the interface concept that it fulfills. -Alpaka offers 4 interface concepts that complement each other. +alpaka offers 4 interface concepts that complement each other. A data storage object must fulfill at least the ``alpaka::concepts::impl::IDataSource``. The ordering is ``IDataSource -> IMdSpan -> IView -> IBuffer``. @@ -121,7 +120,6 @@ The memory is row-oriented. The fastest index is the outer right one. Memory layout of a Data Storage object with the extents [3, 5]. Access to memory at position [1, 3]. For simplicity, pitches and alignment are not shown in the figure. - IMdSpan ``````` @@ -148,3 +146,19 @@ An ``IBuffer`` Data Storage object is pointing to memory and manages its lifetim When all ``IBuffer`` Data Storage objects that are pointing to the same memory are deleted, the memory is freed. Go to the `IBuffer Interface definition `_ + +Complete Source File +-------------------- + +.. raw:: html + +
+ terms_extents.cpp + +.. literalinclude:: ../../snippets/dataStorage/terms_extents.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/conf.py b/docs/source/conf.py index 3eee45838..260ff5d9d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -125,7 +125,11 @@ def setup(app): html_css_files = ["custom.css"] html_logo = "../logo/alpaka.svg" -html_theme_options = {"logo_only": True} +html_theme_options = { + "logo_only": True, + "collapse_navigation": False, + "navigation_depth": 2, +} # -- Options for HTMLHelp output --------------------------------------------- diff --git a/docs/source/contribution/sphinx.rst b/docs/source/contribution/sphinx.rst index b0c63114e..e161eb22a 100644 --- a/docs/source/contribution/sphinx.rst +++ b/docs/source/contribution/sphinx.rst @@ -70,7 +70,6 @@ Build documentation: # chromium and other browser also works firefox ./build/html/index.html - Useful Links ------------ diff --git a/docs/source/contribution/tools.rst b/docs/source/contribution/tools.rst index 128535d22..bbab5ce93 100644 --- a/docs/source/contribution/tools.rst +++ b/docs/source/contribution/tools.rst @@ -68,7 +68,6 @@ Formatter versions :language: yaml :caption: .pre-commit-config.yaml - Code Changes with Tools ----------------------- diff --git a/docs/source/dev/logging.rst b/docs/source/dev/logging.rst index 05335b888..3e3b05de1 100644 --- a/docs/source/dev/logging.rst +++ b/docs/source/dev/logging.rst @@ -63,7 +63,6 @@ CMake Logging Options // long signature [Memory] SharedBuffer{ dim=1, api= Host, extents={123456}, pitches={4} , alignment=16 } auto alpaka::onHost::internal::Alloc::Op, T_Extents>::operator()(alpaka::onHost::cpu::Device&, const T_Extents&) const [with T_Type = unsigned int; T_Platform = alpaka::onHost::cpu::Platform; T_Extents = alpaka::Vec >] alpaka/api/host/Device.hpp:198 - * ``alpaka_LOG_INDENT=`` - where ``X`` can be ``ON`` or ``OFF`` which will format the output from ``alpaka_LOG_FUNCTIONS`` and ``alpaka_LOG_INFO`` depending on the call stack depth. .. 
code:: c++ @@ -82,7 +81,6 @@ The following CMake options requires ``alpaka_LOG=static`` * ``alpaka_LOG_STATIC_Queue`` - activate logging for static queue information * ``alpaka_LOG_STATIC_Kernel`` - activate logging for static kernel information - C++ Code logging ---------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index 24cb209f3..eed212496 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -41,6 +41,7 @@ Individual chapters are based on the information of the chapters before. .. toctree:: :caption: Basic :maxdepth: 1 + :titlesonly: basic/intro.rst basic/install.rst @@ -52,17 +53,19 @@ Individual chapters are based on the information of the chapters before. .. toctree:: :caption: Tutorial - :maxdepth: 1 + :maxdepth: 2 + :titlesonly: tutorial/intro.rst - tutorial/vector.rst - tutorial/device.rst - tutorial/queue.rst - tutorial/memory.rst + tutorial/foundations.rst + tutorial/kernels.rst + tutorial/numerics.rst + tutorial/migration.rst .. toctree:: :caption: Advanced :maxdepth: 1 + :titlesonly: advanced/cmake.rst advanced/datastorage.rst @@ -72,6 +75,7 @@ Individual chapters are based on the information of the chapters before. .. toctree:: :caption: Developer :maxdepth: 1 + :titlesonly: dev/logging.rst dev/online_tools.rst @@ -79,6 +83,7 @@ Individual chapters are based on the information of the chapters before. .. toctree:: :caption: Contribution :maxdepth: 1 + :titlesonly: contribution/sphinx.rst contribution/tools.rst @@ -86,6 +91,7 @@ Individual chapters are based on the information of the chapters before. .. 
toctree:: :maxdepth: 2 :caption: API Reference + :titlesonly: doxygen doxygen_dev diff --git a/docs/source/tutorial/algorithms.rst b/docs/source/tutorial/algorithms.rst new file mode 100644 index 000000000..79c12c3fe --- /dev/null +++ b/docs/source/tutorial/algorithms.rst @@ -0,0 +1,197 @@ +onHost Algorithms +================= + +*alpaka* provides host-side algorithms that execute on the selected backend through an *alpaka* queue and executor. +It is tempting to think of them as STL algorithms with a different namespace, but that is not quite right. +They operate on *alpaka* buffers and views, and some overloads need explicit temporary storage. + +For a human learner, it helps to see these algorithms as a small data-processing toolbox. +Imagine a simple workflow: + +- create or initialize some data, +- transform it element by element, +- summarize it with a reduction, +- build cumulative offsets with a scan, +- or combine "compute something" and "sum it" with ``transformReduce``. + +That is why this chapter is ordered the way it is. +Each algorithm adds one familiar step to the same general story. + +iota +---- + +``onHost::iota`` is the simplest algorithm in the group. +It fills one or more output buffers with a linear sequence and is useful for initialization, debugging, and synthetic test data. + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-iota + :end-before: END-TUTORIAL-iota + :dedent: + +For multidimensional buffers, the linear value increases fastest in the last dimension. +That is the same ordering used by ``LinearizedIdxGenerator`` and by the validation code in the unit tests. +In practice, this is the algorithm you use when you want deterministic toy data before moving to something more realistic. + +Transform +--------- + +``onHost::transform`` is the host-side algorithm equivalent of an element-wise kernel. 
+It applies a functor to one or more inputs and writes the result into an output buffer. + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-transformCall + :end-before: END-TUTORIAL-transformCall + :dedent: + +For simple scalar functors, wrapping the callable in ``ScalarFunc`` keeps the intent clear and matches the tested alpaka pattern. +That wrapper is especially useful when you want scalar semantics even though the algorithm may vectorize loads and stores internally. +Because CUDA/HIP-friendly tutorial code should not rely on local lambdas here, the example uses a tiny named functor instead. +``onHost::transform`` still traverses the full input range itself, so the functor only describes the per-element operation. + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-transformFunctor + :end-before: END-TUTORIAL-transformFunctor + :dedent: + +The example squares every element, but the same pattern is what you would use for brightness scaling in an image row, converting Celsius to Kelvin, or applying a threshold to a signal. + +Reduction +--------- + +Reduction writes its result into an explicit output buffer. +That is different from ``std::reduce`` and also different from some CUDA helper libraries that hide more of the storage details. + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-reduce + :end-before: END-TUTORIAL-reduce + :dedent: + +There are three details worth noticing: + +- you provide the neutral element explicitly, +- the output is a one-element buffer, +- and the input is an *alpaka* buffer or generator, not a pair of iterators. + +For a beginner, the easiest way to think about reduction is "take many values and collapse them into one summary value". 
+That summary could be: + +- the sum of all samples, +- the maximum pixel value, +- the total mass in a simulation, +- or the count of values that passed a test. + +Scan +---- + +Scan follows the same overall style and can use an explicit temporary buffer. +Unlike the other algorithms in this chapter, the current scan implementation is restricted to one-dimensional data. +That fits common prefix-sum use cases such as offsets, compaction maps, and cumulative counters, where the logical input is already a linear sequence. + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-scan + :end-before: END-TUTORIAL-scan + :dedent: + +This tutorial uses the explicit-buffer form because it makes the data flow easier to see: + +- allocate input and output buffers, +- allocate temporary storage with ``getScanBufferSize``, +- call the scan, +- then copy the result back. + +If your original problem is two- or three-dimensional, the usual approach is to decide which logical line you want to scan, linearize that line into a one-dimensional view, and then apply scan there. +The common mental picture is "running totals": +prefix sums for offsets, cumulative counts for compaction, or row-wise offsets before writing variable-length output. + +Transform-Reduce +---------------- + +``transformReduce`` combines a map step with a reduction step. +That is the natural tool for dot products, weighted sums, norms, and many “compute a value per element and then accumulate it” patterns. + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-transformReduceCall + :end-before: END-TUTORIAL-transformReduceCall + :dedent: + +The first functor is the reduction operator and the second one is the element-wise transform. +As in ``reduce``, you provide the neutral element explicitly and store the result in a one-element output buffer. 
+This is the natural dot-product pattern: +take one product per element pair, then accumulate those products into one final value. +The backend-compatible callable itself is still small enough to show directly: + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-transformReduceFunctor + :end-before: END-TUTORIAL-transformReduceFunctor + :dedent: + +Generators Instead of Input Buffers +----------------------------------- + +Several alpaka algorithms also accept generators as inputs. +That is useful when one input is synthetic, such as a linear index, and you do not want to materialize another buffer just to hold it. + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-generatorCall + :end-before: END-TUTORIAL-generatorCall + :dedent: + +``LinearizedIdxGenerator`` is the simplest generator to learn first. +It behaves like a virtual buffer whose value at each position is the corresponding linear index. +The algorithm tests use the same pattern for ``reduce``, ``transform``, and ``transformReduce``. +That is useful when the extra input is really a formula rather than stored data. +For example, you may want "value plus index" or "weight derived from the linear position" without allocating another buffer just to hold those numbers. +The helper functor is again small enough to show directly: + + .. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-generatorFunctor + :end-before: END-TUTORIAL-generatorFunctor + :dedent: + +How This Differs From STL and CUB-Style Expectations +---------------------------------------------------- + +- STL algorithms operate on host iterators, while *alpaka* algorithms operate on *alpaka* memory objects. +- ``iota`` fills buffers, not iterator ranges. +- ``transform`` and ``transformReduce`` can consume generators as well as stored buffers. 
+- The result placement is explicit. +- Temporary storage for scan is explicit if you choose that overload. +- The queue and executor are explicit, so the execution backend is part of the call. + +That extra ceremony is useful because it keeps memory placement and execution placement visible. +Once you understand that model, the calls stop feeling verbose and start feeling predictable. + +Try Next +-------- + +If you want to turn this chapter into practice, these are good small follow-up exercises: + +- change the transform example from squaring to ``2 * value + 1`` +- change the reduction from sum to maximum +- use scan to build prefix offsets for a boolean "keep/discard" array +- change the transform-reduce example into a Euclidean norm by summing ``value * value`` + +Complete Source File +-------------------- + +.. raw:: html + +
+ 14_algorithms.cpp + +.. literalinclude:: ../../snippets/example/14_algorithms.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/atomics.rst b/docs/source/tutorial/atomics.rst new file mode 100644 index 000000000..256cb7e62 --- /dev/null +++ b/docs/source/tutorial/atomics.rst @@ -0,0 +1,119 @@ +Atomics +======= + +Atomics are the tool you reach for when several workers may update the same memory location. +Typical examples are histograms, counters, sparse assembly, work lists, and some reductions. +In the broader tutorial story, atomics connect naturally to two recurring cases: + +- image-style histograms, where many pixels may land in the same bin, +- and Monte Carlo-style sampling, where many random trials may contribute to the same counter or bucket. + +When to Use Atomics +------------------- + +Use an atomic operation only when two or more workers can hit the same location at the same time. +If every output element is written exactly once, atomics are usually unnecessary and slower than a plain store. + +A Small Histogram Example +------------------------- + +Histograms are a classic teaching example because many input elements may contribute to the same bin. +That means a direct ``bins[bin] += 1`` would create a data race. + + .. literalinclude:: ../../snippets/example/22_atomics.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-atomicKernel + :end-before: END-TUTORIAL-atomicKernel + :dedent: + +The important detail is that the loop still uses ``makeIdxMap``. +The iteration stays data-centric; only the conflicting update needs special treatment. + +Launching the Atomic Kernel +--------------------------- + + .. literalinclude:: ../../snippets/example/22_atomics.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-atomicLaunch + :end-before: END-TUTORIAL-atomicLaunch + :dedent: + +The same idea shows up in many real kernels: + +- accumulate a global count, +- build a histogram, +- count errors or events, +- update a minimum or maximum, +- or implement a simple unordered reduction. 
+ +If you connect this back to the random-number chapter, you get a useful combined mental model: +random workers produce samples, a binning rule turns each sample into a bucket, and atomics keep the shared bucket counts correct. + +Practical Advice +---------------- + +- Keep the atomic section as small as possible. +- Prefer per-element direct writes over atomics whenever the algorithm allows it. +- If atomics become a bottleneck, the next optimization step is usually hierarchical accumulation, such as one partial result per frame or per warp. +- Be especially careful when atomics interact with other shared state; correctness comes before performance. + +Common Mistakes +--------------- + +- using atomics for data that is actually written exactly once +- choosing device scope when block-local scope would be enough +- mixing atomics and non-atomic accesses to the same location without a clear protocol +- trying to use atomics as a substitute for a real synchronization or publication pattern + +Atomic Scope +------------ + +The atomic helpers in *alpaka* have an optional scope parameter. +In practice, the shape looks like this: + +- ``onAcc::atomicAdd(acc, ptr, value)`` +- ``onAcc::atomicAdd(acc, ptr, value, onAcc::scope::block)`` +- ``onAcc::atomicAdd(acc, ptr, value, onAcc::scope::device)`` + +If you do not pass a scope, the default is ``onAcc::scope::device``. + +This is useful because not every algorithm needs the same visibility: + +- ``scope::block`` means the atomic operation only needs to be coherent for threads in the same block. +- ``scope::device`` means the atomic operation must work across the whole device. + +Many algorithms only need block-local coordination. +If you are accumulating into shared memory or another block-private structure, block scope expresses the real requirement more precisely. +If all blocks may update the same global counter, histogram bin, or reduction output, you need device scope. 
+ +As a rule of thumb: + +- use ``scope::block`` for block-local cooperation, +- use ``scope::device`` for updates visible across blocks, +- and only widen the scope when the algorithm really needs it. + +*alpaka* provides operations such as ``atomicAdd``, ``atomicMin``, ``atomicMax``, ``atomicExch``, and ``atomicCas``. +For a first encounter, ``atomicAdd`` with the default device scope is the easiest one to reason about. + +Where To Go Next +---------------- + +- read :doc:`memFence` when atomics interact with publication or ordering protocols +- read :doc:`tuning` when atomics become the main performance bottleneck +- read :doc:`miniProject` for a small image histogram pipeline using atomics in context + +Complete Source File +-------------------- + +.. raw:: html + +
+ 22_atomics.cpp + +.. literalinclude:: ../../snippets/example/22_atomics.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/backendDifferences.rst b/docs/source/tutorial/backendDifferences.rst new file mode 100644 index 000000000..011b87a1e --- /dev/null +++ b/docs/source/tutorial/backendDifferences.rst @@ -0,0 +1,49 @@ +Backend Differences That Matter +=============================== + +alpaka tries to keep the kernel source portable, but not every backend feels identical. +Users coming from CUDA, HIP/ROCm, or SYCL usually need a short list of what actually changes in practice. + +Warp and Subgroup Size +---------------------- + +Do not assume a warp size of ``32``. +Query it from the backend when warp-local code matters. +The dedicated warp chapter shows the typical pattern with ``onAcc::warp::getSize(acc)``. + +On host backends, the warp size can become ``1``. +That is not a bug. +It means subgroup-specific code still compiles, but the subgroup behavior is naturally trivial. + +Execution Shape +--------------- + +The same ``FrameSpec`` is valid across backends, but it may not be equally good everywhere. +A shape that is natural for CUDA or HIP may still run on CPU backends, just with different performance characteristics. + +The portable beginner rule is: + +- choose a shape that matches the data layout first +- tune only after measuring + +Synchronization Semantics +------------------------- + +These concepts remain important across all backends: + +- ``syncBlockThreads`` is a block-level rendezvous +- ``memFence`` is only a memory-ordering primitive +- atomics solve conflicting updates, not global synchronization + +The semantics stay the same, but the performance cost can differ by backend. 
+ +Default Advice for Migration Users +---------------------------------- + +- keep the first implementation backend-neutral +- avoid backend-specific assumptions such as fixed warp width +- prefer ``makeIdxMap`` over manual index formulas +- treat subgroup and shared-memory code as optimization tools, not as the default starting point + +If you need backend-specific functionality that alpaka does not wrap directly, the next step is usually a small interop layer around the vendor API, not a complete rewrite of the kernel structure. +The dedicated :doc:`vendorInterop` chapter shows the pattern. diff --git a/docs/source/tutorial/chunked.rst b/docs/source/tutorial/chunked.rst new file mode 100644 index 000000000..329fb6f26 --- /dev/null +++ b/docs/source/tutorial/chunked.rst @@ -0,0 +1,70 @@ +Chunked and Tiled Kernels +========================= + +After writing a simple element-wise kernel, the next natural alpaka pattern is a chunked kernel. +Instead of thinking in raw thread IDs, you start thinking in frames, tiles, and local chunks of the problem. + +This is the style used in alpaka's chunked-data tutorial example and in larger examples such as tiled stencil codes. + +Why Chunked Kernels Matter +-------------------------- + +Chunked kernels are useful when: + +- one block should process more than one element per thread, +- data should be loaded once into shared memory and reused, +- or you want to express tiled traversal without dropping into manual CUDA-like indexing. + +The Kernel Structure +-------------------- + + .. literalinclude:: ../../snippets/example/28_chunkedFrames.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-chunkedKernel + :end-before: END-TUTORIAL-chunkedKernel + :dedent: + +There are a few moving parts in this pattern: + +- ``acc[frame::extent]`` is the current frame shape. +- ``acc[frame::count]`` tells you how many frames exist. +- ``linearBlocksInGrid`` lets blocks iterate over frames. 
+- ``linearThreadsInBlock`` lets threads iterate over elements inside one frame. +- ``onAcc::traverse::tiled`` gives a tiled traversal order for the second pass. + +This is the alpaka way to write many kernels where users might otherwise be tempted to compute every thread and block index by hand. + +Launching a Chunked Kernel +-------------------------- + + .. literalinclude:: ../../snippets/example/28_chunkedFrames.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-chunkedLaunch + :end-before: END-TUTORIAL-chunkedLaunch + :dedent: + +The example uses ``CVec`` for the frame extent because compile-time-known frame sizes work especially well with shared-memory tiles. + +Practical Advice +---------------- + +- Start with a frame size that evenly divides the problem. +- Use chunked kernels when there is real data reuse or tiled structure. +- Prefer frame-based traversal over manual thread arithmetic when teaching, prototyping, or writing portable kernels. +- Add explicit synchronization if the same block reuses shared memory across multiple passes or multiple frames. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 28_chunkedFrames.cpp + +.. literalinclude:: ../../snippets/example/28_chunkedFrames.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/device.rst b/docs/source/tutorial/device.rst index 823bfedb4..bbccbb2a6 100644 --- a/docs/source/tutorial/device.rst +++ b/docs/source/tutorial/device.rst @@ -55,3 +55,21 @@ The device with the api ``host`` and the device kind ``cpu`` which represents yo :start-after: BEGIN-TUTORIAL-devHostDev :end-before: END-TUTORIAL-devHostDev :dedent: + +If you want to see how to enumerate all enabled backends and executors instead of choosing one device manually, continue with :doc:`execution`. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 05_device.cpp + +.. literalinclude:: ../../snippets/example/05_device.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/events.rst b/docs/source/tutorial/events.rst new file mode 100644 index 000000000..3013bff9e --- /dev/null +++ b/docs/source/tutorial/events.rst @@ -0,0 +1,62 @@ +Events and Synchronization +========================== + +As soon as you use more than one queue, or mix host tasks with device work, you need a clear model for synchronization. +In *alpaka*, queues describe execution order, and events describe dependencies between queues. + +Basic Rules +----------- + +- Operations inside one queue execute in FIFO order. +- Different queues may run independently. +- ``onHost::wait(queue)`` waits until all work in that queue is complete. +- ``onHost::wait(event)`` waits until the event has been processed. +- ``queue1.waitFor(event)`` inserts a dependency so work in ``queue1`` starts only after the event is reached. + +Creating an Event +----------------- + + .. literalinclude:: ../../snippets/example/08_events.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-eventCreation + :end-before: END-TUTORIAL-eventCreation + :dedent: + +This records a point in ``queue0`` after the earlier tasks in that queue. + +Waiting From Another Queue +-------------------------- + + .. literalinclude:: ../../snippets/example/08_events.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-eventWait + :end-before: END-TUTORIAL-eventWait + :dedent: + +This is the standard way to connect two queues without forcing the host to block between them. + +When to Use Which Primitive +--------------------------- + +- Use ``onHost::wait(queue)`` when the host must read or modify results after queued work. +- Use an event plus ``waitFor`` when one queue depends on another queue. +- Use block-level synchronization such as ``onAcc::syncBlockThreads`` only inside kernels, never as a host-side substitute. + +For beginners, the most important habit is to be explicit about synchronization. +Most bugs in parallel programs are not arithmetic mistakes but ordering mistakes. 
+ +Complete Source File +-------------------- + +.. raw:: html + +
+ 08_events.cpp + +.. literalinclude:: ../../snippets/example/08_events.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/execution.rst b/docs/source/tutorial/execution.rst new file mode 100644 index 000000000..43149e82d --- /dev/null +++ b/docs/source/tutorial/execution.rst @@ -0,0 +1,64 @@ +Enumerating Devices and Executors +================================= + +One of the first things new alpaka users notice is that execution configuration is explicit. +You do not just "run on the GPU" or "run on the CPU". You choose a device specification, and you can also iterate over all enabled backend combinations. +That may feel more verbose at first, but it becomes useful very quickly in real work. +For example, if you write a small vector-add test, an image blur, or a heat-equation step, you can run exactly the same example once on the host backend and once on every available GPU backend without rewriting the code around the kernel. + +Device Specifications +--------------------- + +A ``DeviceSpec`` combines an API and a device kind, for example host CPU, CUDA NVIDIA GPU, HIP AMD GPU, or oneAPI Intel GPU. + + .. literalinclude:: ../../snippets/example/02_execution.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-enumerateDeviceSpec + :end-before: END-TUTORIAL-enumerateDeviceSpec + :dedent: + +From that selector you can get: + +- the number of visible devices for that backend, +- device properties such as the reported warp size, +- and a concrete ``onHost::Device`` handle for allocation and queue creation. + +Running Over All Enabled Backends +--------------------------------- + +Many alpaka examples are written so they run once for every enabled backend that is actually available on the current machine. + + .. 
literalinclude:: ../../snippets/example/02_execution.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-enumerateBackends + :end-before: END-TUTORIAL-enumerateBackends + :dedent: + +This pattern is especially useful when: + +- you want one example or test to exercise every enabled backend, +- you want to compare behavior across executors, +- or you want to keep tutorial code backend-neutral. + +The important part is that a backend entry bundles both the ``deviceSpec`` and the ``exec`` object. +That is how many alpaka examples stay generic without branching into separate CUDA, HIP, SYCL, and host code paths by hand. + +For a human learner, the easiest way to think about this is: +"I have one calculation, and I want to ask alpaka where that calculation can run on this machine." +That is a better starting point than hard-coding CUDA or HIP first and only later trying to recover portability. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 02_execution.cpp + +.. literalinclude:: ../../snippets/example/02_execution.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/foundations.rst b/docs/source/tutorial/foundations.rst new file mode 100644 index 000000000..02ac4fa52 --- /dev/null +++ b/docs/source/tutorial/foundations.rst @@ -0,0 +1,20 @@ +Foundations +=========== + +This section covers the basic building blocks you need before writing real kernels: +vector types, device selection, queues, events, memory allocation and copies, views, and execution setup. + +Read these pages in order if you are new to alpaka. + +.. toctree:: + :maxdepth: 1 + + vector.rst + device.rst + queue.rst + events.rst + memory.rst + memoryOperations.rst + views.rst + execution.rst + mentalModel.rst diff --git a/docs/source/tutorial/hierarchy.rst b/docs/source/tutorial/hierarchy.rst new file mode 100644 index 000000000..e5df4410a --- /dev/null +++ b/docs/source/tutorial/hierarchy.rst @@ -0,0 +1,92 @@ +Blocks, Threads, and Warps +========================== + +After the first kernel, the next important step is to understand the execution hierarchy. +alpaka exposes that hierarchy directly, but it still encourages you to talk about work in terms of data ranges instead of hand-written thread arithmetic. + +The three levels to keep in mind are: + +- blocks in the grid +- threads inside one block +- warps inside one block + +Blocks are a good fit for tiles of work. +Threads are a good fit for elements inside a tile. +Warps are always one-dimensional, so they are often the natural tool for the innermost direction of a row, stripe, or linear chunk inside a block. + +A Small 2D Tile Example +----------------------- + +The following kernel uses a tiny image-style example. +Each block owns one 2D row stripe of the image, each thread classifies one pixel of that stripe, and one warp walks the same stripe to count how many pixels in that row pass a threshold. + + .. 
literalinclude:: ../../snippets/example/13_hierarchy.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-hierarchyKernel + :end-before: END-TUTORIAL-hierarchyKernel + :dedent: + +The structure is the important part: + +- ``onAcc::worker::blocksInGrid`` chooses tile starts in the full 2D image +- ``onAcc::worker::threadsInBlock`` iterates the pixels inside one tile +- ``onAcc::worker::linearWarpsInBlock`` and ``linearThreadsInWarp`` reuse the same tile, but now in a one-dimensional way + +That last step is the key reason this chapter exists. +Warps are not “small 2D blocks”. +They are one-dimensional subgroups. +In a 2D problem, that usually means you map them to the fastest varying direction, which is often the x direction of a row. + +Launching a Hierarchical Kernel +------------------------------- + + .. literalinclude:: ../../snippets/example/13_hierarchy.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-hierarchyLaunch + :end-before: END-TUTORIAL-hierarchyLaunch + :dedent: + +This launch shape deliberately makes that mapping easy to see: + +- the block shape is ``{1, warpSize}`` +- so each block owns one row stripe of the 2D image +- and the warp naturally maps to the one-dimensional x direction of that stripe + +How to Think About the Hierarchy +-------------------------------- + +For beginner kernels, this mental model usually works well: + +1. pick the block shape from the tile shape you want in the data +2. use threads to cover the elements inside that tile +3. only use warps when there is a naturally one-dimensional inner direction + +That keeps the hierarchy tied to the problem structure instead of tied to CUDA-style index formulas. + +Practical Advice +---------------- + +- Start with ``threadsInGrid`` when the kernel is just “process every element once”. +- Move to ``blocksInGrid`` plus ``threadsInBlock`` when the work is tile-based. +- Treat warps as one-dimensional helpers inside a block, not as a replacement for multidimensional block logic. 
+- If the algorithm does not need warp-local cooperation, do not force warps into the first implementation. +- When you do use warps in a 2D problem, map them to one row or one linear stripe and keep the outer tile structure block-based. + +The later :doc:`warp` chapter goes deeper into warp-local communication such as shuffle and voting operations. +This chapter is only about understanding where that subgroup level fits into the overall hierarchy. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 13_hierarchy.cpp + +.. literalinclude:: ../../snippets/example/13_hierarchy.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/intrinsics.rst b/docs/source/tutorial/intrinsics.rst new file mode 100644 index 000000000..86a064b90 --- /dev/null +++ b/docs/source/tutorial/intrinsics.rst @@ -0,0 +1,81 @@ +Bit Intrinsics +============== + +Most alpaka kernels do not need bit intrinsics. +When they do show up, it is usually in masks, compact data structures, binary encodings, voting logic, or low-level performance-sensitive code. + +alpaka exposes portable helpers such as ``popcount``, ``ffs``, and ``clz`` so you do not need to call backend-specific CUDA, HIP, or SYCL intrinsics directly. +The easiest way to make them feel practical is to imagine a tiny occupancy map: +each bit marks whether one slot, cell, or feature is active. +Then the questions become very concrete: + +- how many active flags are there, +- where is the first active one, +- and how much empty space is left at the front of the word. + +A Small Bit-Manipulation Kernel +------------------------------- + + .. literalinclude:: ../../snippets/example/32_intrinsics.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-intrinsicKernel + :end-before: END-TUTORIAL-intrinsicKernel + :dedent: + +The three operations in this example are: + +- ``popcount(value)``: number of set bits, +- ``ffs(value)``: position of the first set bit, using ``1`` as the first position and ``0`` for the zero value, +- ``clz(value)``: number of leading zero bits. + +This is exactly the kind of logic that appears in compact masks for active particles, occupied bins, tile occupancy, or small scheduling tables. + +Launching the Kernel +-------------------- + + .. literalinclude:: ../../snippets/example/32_intrinsics.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-intrinsicLaunch + :end-before: END-TUTORIAL-intrinsicLaunch + :dedent: + +When to Use Them +---------------- + +Use bit intrinsics when the algorithm is naturally about bit patterns. 
+Typical examples are: + +- counting active flags in a bit mask, +- finding the next occupied slot, +- building compact lookup structures, +- and implementing integer-heavy utility kernels. + +For ordinary numerical kernels, these helpers are not a starting point. +They are specialized tools, and they are easiest to understand after the rest of the tutorial already feels natural. + +Try Next +-------- + +If you want one small exercise after this page, treat each integer as a row of eight or sixteen binary flags and answer: + +- how many flags are set, +- whether the row is empty, +- and where the first active flag starts. + +That is a small but realistic stepping stone toward histograms, sparse occupancy maps, and compact bit-mask workflows. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 32_intrinsics.cpp + +.. literalinclude:: ../../snippets/example/32_intrinsics.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/intro.rst b/docs/source/tutorial/intro.rst index 7b0e93f8b..c7c4da6d5 100644 --- a/docs/source/tutorial/intro.rst +++ b/docs/source/tutorial/intro.rst @@ -1,6 +1,32 @@ Motivation ========== -The *alpaka* tutorial will go step-by-step through the most important objects and show, with small examples, how to use them. -During the tutorial we will explain some of the design decisions and link to similar concepts in other languages, e.g. CUDA/HIP or SYCL, to aid understanding. -The tutorial will use ``using namespace alpaka;`` to reduce the amount of code. +The *alpaka* tutorial is meant to be worked through, not skimmed like a reference manual. +Each section introduces one or two new ideas, uses a small example, and then builds on the pages before it. +Where it helps, we point out the rough equivalent in CUDA/HIP, SYCL, or other parallel frameworks, but the examples stay written in alpaka style. + +Two small problem families appear again and again on purpose: + +- image-style workloads such as crops, stencils, blur-like kernels, and histograms, +- and Monte Carlo style workloads such as random sampling, reduction, and pi estimation. + +Those recurring examples make it easier to connect the pages. +You are not just learning isolated interfaces. +You are learning how the same kinds of programs grow from memory and views into kernels, synchronization, atomics, randomness, and tuning. + +To keep the code readable, the tutorial uses ``using namespace alpaka;`` in the examples. + +Recommended Reading Order +------------------------- + +The tutorial is intended to be read in roughly this order: + +1. :doc:`foundations` +2. :doc:`kernels` +3. :doc:`numerics` +4. :doc:`migration` + +If you want one page that clarifies the central concepts before the first kernel, read :doc:`mentalModel` near the end of the foundations section. 
+ +If you are new to parallel programming, treat the early chapters as the core path and the later ones as tools you add when the algorithm actually needs them. +You do not need warp functions, shared-memory tiles, or custom atomics to write your first correct alpaka kernel. diff --git a/docs/source/tutorial/kernel.rst b/docs/source/tutorial/kernel.rst new file mode 100644 index 000000000..25541d7ee --- /dev/null +++ b/docs/source/tutorial/kernel.rst @@ -0,0 +1,166 @@ +Start Your First Kernel +======================= + +After selecting a device, creating a queue, and allocating memory, the next step is to launch work on the device. +In *alpaka*, the simplest useful kernel is usually just a small function object plus a host-side launch with a ``FrameSpec``. +If you know CUDA, a frame is roughly a block-shaped chunk of work. +If you know Kokkos, it plays a similar role to choosing the shape of a policy. + +What matters early is that a ``FrameSpec`` does not describe the whole problem size. +It describes the launch-side parallel shape that alpaka makes available to the kernel at one time: +how many frames exist and how large one frame is. +The actual problem can be much larger. +The kernel then uses ``makeIdxMap`` to walk over the complete data range. + +What a Beginner Kernel Looks Like +--------------------------------- + +Most first kernels in alpaka end up looking almost the same: + +- The kernel is a function object with ``operator()``. +- The first argument is the accelerator handle ``acc``. +- Output buffers use ``IMdSpan`` and input buffers use ``IDataSource``. +- Work distribution is expressed with ``onAcc::makeIdxMap(...)``. +- The kernel body only talks about data indices, not about raw block and thread ids. + + .. 
literalinclude:: ../../snippets/example/12_kernelIntro.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-kernelStructure + :end-before: END-TUTORIAL-kernelStructure + :dedent: + +This is the most important beginner rule in *alpaka*: write the kernel in terms of the data that needs to be processed. +``makeIdxMap`` distributes that work over the available workers for the chosen executor. +That keeps the code portable across CPUs and GPUs and is usually a much better starting point than manual thread arithmetic. + +Launching the Kernel +-------------------- + +On the host side, the pattern is straightforward: + +1. Allocate buffers on the compute device. +2. Copy input data to the device. +3. Choose a frame specification. +4. Enqueue the kernel. +5. Copy the result back and wait for completion before reading it. + + .. literalinclude:: ../../snippets/example/12_kernelIntro.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-kernelLaunch + :end-before: END-TUTORIAL-kernelLaunch + :dedent: + +The queue can be non-blocking, so ``alpaka::onHost::wait(queue)`` is the point where the host knows the device work is finished. +Without that synchronization, reading the result on the host can race with the running kernel. + +What ``FrameSpec`` Means +------------------------ + +It helps to separate three ideas clearly: + +- the problem size, such as ``257`` vector elements or a ``1024 x 1024`` image, +- the frame extent, which is the shape of one frame, +- and the frame count, which is how many such frames are available in the launch. + +Together, frame count and frame extent form the ``FrameSpec``. +That is the maximum parallel structure exposed to the kernel. +It is not a promise that the total problem size is exactly equal to ``frameCount * frameExtent``. 
+ +This is the important beginner picture: + +- the host chooses a reasonable parallel launch shape, +- the kernel describes the full valid data range with ``IdxRange{...}``, +- ``makeIdxMap`` maps the available workers onto that range, +- and if the problem is larger than the immediate launch shape, the workers simply iterate until the whole range is covered. + +That is why a kernel can process a vector of length ``257`` even if the frame extent is something like ``128`` or ``256``. +The frame specification limits the available parallelism per launch shape. +It does not limit the logical size of the problem. + +Choosing the Correct Frame Specification +---------------------------------------- + +For a first implementation, frame selection should be boring. +The host chooses how much work is grouped into one frame, and the kernel then iterates over the valid data indices assigned to it. + + .. literalinclude:: ../../snippets/example/12_kernelIntro.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-kernelFrameSpec + :end-before: END-TUTORIAL-kernelFrameSpec + :dedent: + +Rules of thumb: + +- Use one-dimensional frames for one-dimensional data such as vectors or linear buffers. +- Use the same dimensionality as the problem when the data is naturally 2D or 3D. +- ``onHost::getFrameSpec(device, extents)`` is the easiest way to get a reasonable first frame specification. +- Start with simple sizes. For 1D kernels, something around ``128`` to ``256`` elements per frame is usually a reasonable first try. +- When you have multiple dimensions, prefer more work in the fastest varying dimension, which is usually ``x``. +- Start with ``FrameSpec`` unless you have a concrete reason to control block and thread layout manually. + +If you have seen CUDA-style beginner code, this is one of the major differences in style. +You do not start by hand-writing a global-index formula and hoping the launch exactly matches the problem. 
+Instead, you choose a sensible frame shape and let ``makeIdxMap`` carry that parallelism across the full problem range. + +In practice, choose the frame from the data layout first and only tune it later if profiling gives you a reason. + +Once you are comfortable with this basic launch style, the next important alpaka step is :doc:`chunked`, where frames are treated as reusable tiles of work. + +How ``makeIdxMap`` Helps +------------------------ + +``makeIdxMap`` is the beginner-friendly way to iterate over the part of the problem assigned to the running workers. +Conceptually, it gives you the portable version of the "grid-stride loop" idea that CUDA users often learn early: +all workers cooperate to cover the whole range, and the loop only yields valid indices. + +The object that describes that iteration space in the tutorial examples is ``IdxRange``. +``IdxRange{out.getExtents()}`` means "the full valid index range of this output object". +For a vector, that is all indices from the first element to the last element. +For a matrix or image, it is the full multidimensional box of valid coordinates. + +This is exactly why ``IdxRange`` and ``FrameSpec`` are different objects. +``FrameSpec`` describes available parallel workers and their grouping. +``IdxRange`` describes the logical work that must be completed. +In most beginner kernels, keeping those two ideas separate makes the code much easier to understand. + +For one-dimensional data, the common pattern is: + +- Use ``IdxRange{out.getExtents()}`` to describe the full iteration space. +- Iterate over the indices yielded by ``makeIdxMap``. +- Read the inputs and write the output at that index. + +That is enough for a surprising number of useful kernels: vector addition, scaling, fused multiply-add, bias addition, simple activations, and similar element-wise operations. + +Typical Beginner Mistakes +------------------------- + +- Forgetting to copy the inputs to the device before enqueueing the kernel. 
+- Forgetting to copy the result back to the host after the kernel. +- Forgetting to wait before reading host-side results from a non-blocking queue. +- Choosing a one-dimensional frame for naturally multidimensional code and then reimplementing manual index arithmetic in the kernel. +- Writing the kernel in terms of raw thread ids even though the algorithm is just "process every element once". + +Where To Go Next +---------------- + +The next natural pages depend on the kind of problem you have: + +- read :doc:`multidim` for images, matrices, and stencils, +- read :doc:`sharedMemory` once data reuse inside a tile starts to matter, +- read :doc:`miniProject` for one compact image-style pipeline that combines several of these ideas. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 12_kernelIntro.cpp + +.. literalinclude:: ../../snippets/example/12_kernelIntro.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/kernels.rst b/docs/source/tutorial/kernels.rst new file mode 100644 index 000000000..63f674bed --- /dev/null +++ b/docs/source/tutorial/kernels.rst @@ -0,0 +1,19 @@ +Kernels and Hierarchy +===================== + +This section moves from the first kernel to the practical execution hierarchy: +tiles, blocks, threads, shared memory, fences, and atomics. + +The pages are ordered from the simplest launch model toward the more cooperative kernel patterns. + +.. toctree:: + :maxdepth: 1 + + kernel.rst + hierarchy.rst + multidim.rst + sharedMemory.rst + memFence.rst + chunked.rst + atomics.rst + miniProject.rst diff --git a/docs/source/tutorial/math.rst b/docs/source/tutorial/math.rst new file mode 100644 index 000000000..1c01e96b0 --- /dev/null +++ b/docs/source/tutorial/math.rst @@ -0,0 +1,113 @@ +Math Functions in Kernels +========================= + +Inside kernels, prefer ``alpaka::math`` over calling backend-specific math APIs directly. +That keeps the code portable across host, CUDA, HIP, and SYCL backends. + +For teaching, math functions become much easier to understand when they are attached to tiny numerical stories instead of listed as names. +This chapter uses two of those stories: + +- a trigonometric identity check, which is a compact stand-in for many signal-processing style kernels, +- and a distance-like computation, which is a compact stand-in for geometry, physics, and graphics code. + +Element-wise Math Kernels +------------------------- + +For many kernels, the structure is still the same as vector addition: +iterate over the data with ``makeIdxMap`` and call math functions on each element. + + .. literalinclude:: ../../snippets/example/24_math.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-mathKernel + :end-before: END-TUTORIAL-mathKernel + :dedent: + +This example uses ``math::sincos`` and ``math::fma``. 
+That combination is common in numerical kernels because it keeps the code compact and can map efficiently to backend-specific instructions.
+You can read this as "compute a mathematically meaningful quantity per input element".
+That is the same overall shape as applying a nonlinear activation, evaluating a wave model at many sample points, or transforming an angle image into derived features.
+
+Distance-like Computations
+--------------------------
+
+Reciprocal square root is another common operation in physics, graphics, and geometry kernels.
+
+    .. literalinclude:: ../../snippets/example/24_math.cpp
+        :language: cpp
+        :start-after: BEGIN-TUTORIAL-rsqrtKernel
+        :end-before: END-TUTORIAL-rsqrtKernel
+        :dedent:
+
+A full list of the available function families follows later in this chapter.
+
+The concrete picture here is "distance or inverse length from many points".
+Even though the example is small, it matches a very common class of kernels: compute one derived floating-point quantity per element and write it back.
+
+Available Function Families
+---------------------------
+
+Unary real and complex helpers:
+
+- ``abs``, ``arg``, ``conj``
+- ``sin``, ``cos``, ``tan``
+- ``asin``, ``acos``, ``atan``
+- ``sinh``, ``cosh``, ``tanh``
+- ``asinh``, ``acosh``, ``atanh``
+- ``sqrt``, ``rsqrt``, ``cbrt``
+- ``exp``, ``log``, ``log2``, ``log10``
+- ``erf``
+- ``ceil``, ``floor``, ``round``, ``lround``, ``llround``, ``trunc``
+- ``isnan``, ``isinf``, ``isfinite``
+
+Binary helpers:
+
+- ``atan2``
+- ``copysign``
+- ``min``, ``max``
+- ``pow``
+- ``fmod``, ``remainder``
+
+Ternary helpers:
+
+- ``fma``
+
+Mixed output helpers:
+
+- ``sincos``
+
+The list above reflects the functions exposed in ``include/alpaka/math.hpp``.
+The alpaka math unit tests also exercise the major unary, binary, and ternary operations against standard-library behavior where a direct comparison exists.
+
+Practical Advice
+----------------
+
+- Use ``alpaka::math`` in device code instead of backend-specific CUDA, HIP, or SYCL names.
+- Compare floating-point results with a tolerance when testing. +- Write the clear mathematical version first, then optimize only if profiling shows a problem. +- Prefer one kernel per logical transform when teaching or debugging; fused kernels are useful later, but they are harder to reason about. + +Try Next +-------- + +Good follow-up exercises for this chapter are: + +- replace the trigonometric identity with ``exp`` and ``log`` on a positive input array +- build a simple Gaussian-like curve with ``exp(-x * x)`` +- compute a 2D point length with ``sqrt(x*x + y*y)`` +- add a small image-processing example such as gamma correction with ``pow`` + +Complete Source File +-------------------- + +.. raw:: html + +
+ 24_math.cpp + +.. literalinclude:: ../../snippets/example/24_math.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/memFence.rst b/docs/source/tutorial/memFence.rst new file mode 100644 index 000000000..034907d70 --- /dev/null +++ b/docs/source/tutorial/memFence.rst @@ -0,0 +1,104 @@ +Memory Fences +============= + +``onAcc::memFence`` is a visibility and ordering primitive inside kernels. +It is not a barrier. +It does not wait for other threads to reach the same point. +Instead, it tells the backend how writes before the fence must become visible relative to reads and writes after the fence. + +That distinction matters: + +- use ``syncBlockThreads`` when threads in one block must rendezvous, +- use ``memFence`` when you need ordering or publication guarantees, +- and use atomics when multiple threads update the same location. + +The two most common scopes are: + +- ``onAcc::scope::block`` for communication inside one block, +- ``onAcc::scope::device`` for communication across blocks on the same device. + +Block-Scope Ordering +-------------------- + +The first example follows the shared-memory ordering pattern from alpaka's unit tests. +One thread publishes two values into block-local shared memory. +The fence guarantees that the write to ``shared[0]`` becomes visible before the later write to ``shared[1]`` is observed as published. + + .. literalinclude:: ../../snippets/example/34_memFence.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-memFenceBlockKernel + :end-before: END-TUTORIAL-memFenceBlockKernel + :dedent: + +Launching that kernel looks ordinary. +The important part is the fence inside the kernel, not the host-side launch code. + + .. literalinclude:: ../../snippets/example/34_memFence.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-memFenceBlockLaunch + :end-before: END-TUTORIAL-memFenceBlockLaunch + :dedent: + +Device-Scope Publication +------------------------ + +The second example shows the classic producer/consumer publication pattern in global memory. 
+The producer writes the payload, issues a release fence, and only then atomically sets a ready flag. +The consumer spins on the atomic ready flag, issues an acquire fence, and then reads the payload. + + .. literalinclude:: ../../snippets/example/34_memFence.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-memFenceDeviceKernel + :end-before: END-TUTORIAL-memFenceDeviceKernel + :dedent: + + .. literalinclude:: ../../snippets/example/34_memFence.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-memFenceDeviceLaunch + :end-before: END-TUTORIAL-memFenceDeviceLaunch + :dedent: + +This is the pattern to remember: + +- producer: write data, ``memFence(..., scope::device, order::release)``, then atomically publish the flag +- consumer: atomically observe the flag, ``memFence(..., scope::device, order::acquire)``, then read the data + +Practical Advice +---------------- + +- Do not use ``memFence`` as a substitute for ``syncBlockThreads``. +- A fence orders memory operations; it does not make conflicting non-atomic writes safe. +- Keep the publication protocol simple: payload first, fence second, atomic flag update last. +- Prefer ``scope::block`` over ``scope::device`` when block-local visibility is enough. +- Use the weakest memory order that expresses the algorithm clearly. ``release`` / ``acquire`` is often the right pair for producer/consumer publication. + +Common Mistakes +--------------- + +- using ``memFence`` when the real need is a block barrier +- assuming a fence alone makes racy non-atomic writes correct +- publishing the ready flag before the payload is fully written +- widening the scope to ``device`` when the protocol is only block-local + +Where To Go Next +---------------- + +- read :doc:`atomics` for conflicting updates +- read :doc:`sharedMemory` for block-local cooperation patterns +- read :doc:`backendDifferences` if you want to understand how the same semantics feel across different backends + +Complete Source File +-------------------- + +.. 
raw:: html + +
+ 34_memFence.cpp + +.. literalinclude:: ../../snippets/example/34_memFence.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/memory.rst b/docs/source/tutorial/memory.rst index c65a56384..e9fce760b 100644 --- a/docs/source/tutorial/memory.rst +++ b/docs/source/tutorial/memory.rst @@ -3,8 +3,14 @@ Allocate Memory Now that we know how to :ref:`get a device ` and create :ref:`a queue `, we can move on to memory allocation. To allocate memory, you need a *device* and sometimes a *queue*. +See :ref:`memory-operations` for copy, fill, and memset details once the buffers exist. alpaka's memory allocation methods return a ``alpaka::onHost::SharedBuffer`` handle that tracks the lifetime of the memory and frees memory when the last instance goes out of scope, similar to ``std::shared_ptr<>`` in the STL. +This chapter is easiest to picture with two recurring examples from the rest of the tutorial: + +- an image-processing pipeline, where you may keep one host image, one device image, and perhaps one temporary output image, +- or a Monte Carlo workflow, where you keep input parameters, random samples, and partial results in separate buffers. + - Copying a ``alpaka::onHost::SharedBuffer`` handle is a shallow copy of the buffer handle and does not duplicate the data. - A deep copy of the memory must be explicitly triggered using ``alpaka::onHost::memcpy()``. - A buffer is **not** initialized with default values. @@ -58,40 +64,26 @@ Sometimes you want to allocate memory that is only used as a temporary buffer an Since memory allocations are costly, you generally avoid allocating memory, for example, in a loop. Depending on the device or queue API, ``alpaka::onHost::allocDeferred()`` automatically uses an internal caching allocator to keep allocation as cost-effective as possible. +That kind of temporary buffer shows up naturally later for things such as scan scratch storage, intermediate image tiles, or one stage of a multi-step numerical pipeline. + .. 
literalinclude:: ../../snippets/example/10_memory.cpp :language: cpp :start-after: BEGIN-TUTORIAL-allocBufferDeferred :end-before: END-TUTORIAL-allocBufferDeferred :dedent: -Memory Operations -================= - -One of the most commonly used memory operations is the copy operation, which copies data from one buffer to another. -All memory operations support any dimension ``>=1``. +Complete Source File +-------------------- -- ``alpaka::onHost::memcpy()`` always works with the entire buffer unless you specify the extent. The extent defines the number of elements, **not** the size in bytes. - - .. literalinclude:: ../../snippets/example/10_memory.cpp - :language: cpp - :start-after: BEGIN-TUTORIAL-memcpy - :end-before: END-TUTORIAL-memcpy - :dedent: +.. raw:: html -- You can also set all values of a buffer to a specific value using ``alpaka::onHost::fill()``. +
+ 10_memory.cpp - .. literalinclude:: ../../snippets/example/10_memory.cpp - :language: cpp - :start-after: BEGIN-TUTORIAL-fill - :end-before: END-TUTORIAL-fill - :dedent: +.. literalinclude:: ../../snippets/example/10_memory.cpp + :language: cpp + :linenos: -- With ``alpaka::onHost::memset()``, all bytes of a buffer can be set to a specific byte value. - This is typically used to set all bytes to zero. - **Attention:** The optional extent still defines the number of elements and **not** the size in bytes. +.. raw:: html - .. literalinclude:: ../../snippets/example/10_memory.cpp - :language: cpp - :start-after: BEGIN-TUTORIAL-memset - :end-before: END-TUTORIAL-memset - :dedent: +
diff --git a/docs/source/tutorial/memoryOperations.rst b/docs/source/tutorial/memoryOperations.rst new file mode 100644 index 000000000..91728184f --- /dev/null +++ b/docs/source/tutorial/memoryOperations.rst @@ -0,0 +1,53 @@ +.. _memory-operations: + +Memory Operations +================= + +After allocating buffers, the next step is moving or initializing data inside them. +One of the most commonly used memory operations is the copy operation, which copies data from one buffer to another. +All memory operations support any dimension ``>=1``. + +In practice, these operations are the "plumbing" around nearly every example in this tutorial: +copy an image to the device, clear a histogram buffer, move results back to the host, or prepare a Monte Carlo input/output pair before launching a kernel. + +- ``alpaka::onHost::memcpy()`` always works with the entire buffer unless you specify the extent. The extent defines the number of elements, **not** the size in bytes. + + .. literalinclude:: ../../snippets/example/10_memory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-memcpy + :end-before: END-TUTORIAL-memcpy + :dedent: + +- You can also set all values of a buffer to a specific value using ``alpaka::onHost::fill()``. + + .. literalinclude:: ../../snippets/example/10_memory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-fill + :end-before: END-TUTORIAL-fill + :dedent: + +- With ``alpaka::onHost::memset()``, all bytes of a buffer can be set to a specific byte value. + This is typically used to set all bytes to zero. + **Attention:** The optional extent still defines the number of elements and **not** the size in bytes. + + .. literalinclude:: ../../snippets/example/10_memory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-memset + :end-before: END-TUTORIAL-memset + :dedent: + +Complete Source File +-------------------- + +.. raw:: html + +
+ 10_memory.cpp + +.. literalinclude:: ../../snippets/example/10_memory.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/mentalModel.rst b/docs/source/tutorial/mentalModel.rst new file mode 100644 index 000000000..87aa8365c --- /dev/null +++ b/docs/source/tutorial/mentalModel.rst @@ -0,0 +1,72 @@ +Core Mental Model +================= + +Many beginner questions in *alpaka* become easier once three ideas are kept separate: + +- ``IdxRange`` describes the logical work that must be completed, +- ``FrameSpec`` describes the available parallel structure for one launch, +- ``makeIdxMap`` maps that parallel structure onto the logical work. + +Those three ideas show up again and again in the tutorial. +If they stay clear in your head, most kernel code stops feeling mysterious. + +Logical Work: ``IdxRange`` +-------------------------- + +``IdxRange`` describes the valid data domain. + +- for a vector, that is the full one-dimensional element range, +- for an image, that is the two-dimensional box of valid pixel coordinates, +- for a volume, that is the full three-dimensional coordinate box. + +This is the answer to the question: +"What work actually needs to be done?" + +Available Parallelism: ``FrameSpec`` +------------------------------------ + +``FrameSpec`` describes the launch-side structure. +It tells alpaka how many frames are available and how large one frame is. + +This is the answer to the question: +"How much parallel structure do I want to make available in this launch?" + +That is why the frame shape often follows the problem: + +- a 1D frame for a vector transform, +- a 2D frame for an image or stencil, +- a 3D frame only when the data is truly volumetric. + +Mapping Both Together: ``makeIdxMap`` +------------------------------------- + +``makeIdxMap`` is the bridge between the two. +It takes the currently available workers and yields valid indices from the logical range. + +This is the answer to the question: +"Which valid data items should this worker process?" + +For beginners, that is usually the right level of abstraction. 
+You think in terms of output elements, pixels, samples, or cells, not in terms of manually computed global thread ids. + +One Short Example +----------------- + +If you have a ``1024 x 1024`` grayscale image: + +- ``IdxRange`` is the full ``1024 x 1024`` image domain, +- ``FrameSpec`` might choose a smaller 2D tile shape such as ``16 x 16``, +- and ``makeIdxMap`` lets the running workers cover the full image one valid pixel index at a time. + +So the important distinction is: + +- ``IdxRange`` is about the whole problem, +- ``FrameSpec`` is about the launch shape, +- ``makeIdxMap`` is how the kernel walks the problem with that launch shape. + +Where To Go Next +---------------- + +- :doc:`kernel` introduces the first real kernel with these concepts. +- :doc:`multidim` shows how the same model extends naturally to images and stencils. +- :doc:`chunked` shows how frames become reusable tiles of work. diff --git a/docs/source/tutorial/migration.rst b/docs/source/tutorial/migration.rst new file mode 100644 index 000000000..5e78ab519 --- /dev/null +++ b/docs/source/tutorial/migration.rst @@ -0,0 +1,14 @@ +Migration and Porting +===================== + +Many alpaka users arrive from CUDA, HIP/ROCm, or SYCL. +This section explains how to translate that mental model into alpaka style, what changes across backends, and what to tune first. + +.. toctree:: + :maxdepth: 1 + + migrationMap.rst + portingKernel.rst + backendDifferences.rst + vendorInterop.rst + tuning.rst diff --git a/docs/source/tutorial/migrationMap.rst b/docs/source/tutorial/migrationMap.rst new file mode 100644 index 000000000..215502b71 --- /dev/null +++ b/docs/source/tutorial/migrationMap.rst @@ -0,0 +1,58 @@ +From CUDA, HIP, or SYCL to alpaka +================================= + +Most migration questions are really mapping questions: "what is the alpaka equivalent of the concept I already know?" 
+ +The short version is: + +- CUDA/HIP grid or SYCL global range -> the full data range you pass to ``makeIdxMap`` +- block / work-group -> one frame or tile +- thread / work-item -> one worker inside that frame +- warp / wavefront / subgroup -> ``onAcc::warp`` +- shared memory / local memory -> ``declareSharedVar``, ``declareSharedMdArray``, ``getDynSharedMem`` +- stream / queue -> ``onHost::Queue`` +- event -> ``onHost::Event`` + +Mental Model Shift +------------------ + +The biggest change is not the API names. +It is the style of writing kernels. + +In CUDA or HIP tutorials, the first pattern is often: + +- read ``blockIdx`` and ``threadIdx`` +- compute a global index by hand +- guard against out-of-range threads + +In alpaka, the preferred first pattern is: + +- describe the data range +- let ``makeIdxMap`` distribute that work +- keep the kernel written in terms of data indices + +That is why the beginner chapters try hard to avoid raw index arithmetic. +alpaka is designed so that the same kernel structure still makes sense on CPU, CUDA, HIP, and SYCL backends. + +Useful Equivalents +------------------ + +- block shape selection: ``onHost::FrameSpec`` or ``onHost::getFrameSpec`` +- strict block/thread control: ``onHost::ThreadSpec`` +- block-local synchronization: ``onAcc::syncBlockThreads`` +- memory ordering without synchronization: ``onAcc::memFence`` +- block-local and device-wide atomics: ``onAcc::atomic*`` with ``onAcc::scope::block`` or ``onAcc::scope::device`` +- subgroup communication: ``onAcc::warp::shfl*``, ``all``, ``any``, ``ballot`` + +What Usually Ports Cleanly +-------------------------- + +These things usually translate directly: + +- element-wise kernels +- reductions and scans +- tiled shared-memory kernels +- histogram and counter kernels with atomics +- stencil kernels with halo cells + +The next chapter shows what that port looks like in practice for a very small kernel. 
diff --git a/docs/source/tutorial/miniProject.rst b/docs/source/tutorial/miniProject.rst new file mode 100644 index 000000000..5f5267524 --- /dev/null +++ b/docs/source/tutorial/miniProject.rst @@ -0,0 +1,103 @@ +Mini Project: Threshold and Histogram +===================================== + +After reading the earlier chapters, it helps to see how the pieces fit together in one compact program. +This mini project uses a tiny grayscale image and performs two steps: + +1. threshold the image into black and white, +2. count dark and bright pixels with a histogram. + +That combines several tutorial ideas in one place: + +- multidimensional buffers for the image, +- ``makeIdxMap`` for data-centric iteration, +- a simple frame specification, +- an atomic histogram update, +- and ordinary queue-based copies around the kernels. + +Step 1: Threshold the Image +--------------------------- + +The first kernel is a plain 2D image transform. +Each output pixel is written exactly once, so no atomics are needed. + + .. literalinclude:: ../../snippets/example/40_imagePipeline.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-imageThresholdKernel + :end-before: END-TUTORIAL-imageThresholdKernel + :dedent: + +This is the same beginner style introduced earlier: +describe the valid image domain with ``IdxRange`` and let ``makeIdxMap`` yield the pixel indices. + +Step 2: Build a Histogram +------------------------- + +The second kernel reads the thresholded image and counts dark and bright pixels. +Now several pixels may contribute to the same bin, so atomics are required. + + .. literalinclude:: ../../snippets/example/40_imagePipeline.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-imageHistogramKernel + :end-before: END-TUTORIAL-imageHistogramKernel + :dedent: + +This is a useful transition point in the tutorial: +the first kernel is a plain per-pixel transform, the second kernel is a reduction-like accumulation pattern.
+ +Launching the Pipeline +---------------------- + +The host-side flow is still small and regular: +allocate buffers, copy the image to the device, clear the outputs, enqueue both kernels, and copy the results back. + + .. literalinclude:: ../../snippets/example/40_imagePipeline.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-imagePipelineLaunch + :end-before: END-TUTORIAL-imagePipelineLaunch + :dedent: + +After that, the host can read the two histogram counts as the summary of the whole image. + + .. literalinclude:: ../../snippets/example/40_imagePipeline.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-imagePipelineResult + :end-before: END-TUTORIAL-imagePipelineResult + :dedent: + +Why This Is A Good Tutorial Example +----------------------------------- + +This mini project is small, but it already looks like a real parallel program: + +- one buffer holds the input image, +- one kernel produces an output image, +- another kernel summarizes that output, +- and the host coordinates the full pipeline through the queue. + +That pattern scales well. +If you replaced thresholding with blur, edge detection, or one stencil step, the overall structure would stay very similar. + +What To Read With It +-------------------- + +- :doc:`views` for crops, subregions, and interior-only processing +- :doc:`multidim` for the 2D kernel style +- :doc:`atomics` for the histogram update +- :doc:`tuning` for what to optimize first if the image becomes large + +Complete Source File +-------------------- + +.. raw:: html + +
+ 40_imagePipeline.cpp + +.. literalinclude:: ../../snippets/example/40_imagePipeline.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/multidim.rst b/docs/source/tutorial/multidim.rst new file mode 100644 index 000000000..c6facc8a2 --- /dev/null +++ b/docs/source/tutorial/multidim.rst @@ -0,0 +1,109 @@ +Working With Multidimensional Kernels +===================================== + +Many important beginner examples in parallel computing are naturally multidimensional: +images, matrices, heat diffusion, cellular automata, and finite-difference stencils. +For those problems, it is usually clearer to keep the kernel multidimensional instead of flattening everything into one linear index. + +Choose the Kernel Shape From the Data +------------------------------------- + +If the data is naturally a matrix or image, use two-dimensional extents and two-dimensional frames. +This avoids hand-written index decoding and makes boundary conditions easier to read. + + .. literalinclude:: ../../snippets/example/18_multidimKernel.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-multidimFrameSpec + :end-before: END-TUTORIAL-multidimFrameSpec + :dedent: + +The important idea is that the frame shape should follow the logical shape of the work: + +- 1D frames for flat vectors and simple reductions. +- 2D frames for images, matrices, and most stencil codes. +- 3D frames only when the algorithm is truly volumetric. + +Keep in mind that the rightmost index, usually ``x``, is the fastest varying dimension in *alpaka* buffers. +That is why a frame like ``Vec{2, 4}`` is a sensible beginner choice: it keeps more work in ``x`` than in ``y``. + +A Small 2D Stencil Example +-------------------------- + +The following kernel performs one five-point average step on a small 2D grid. +This is a common teaching example because it introduces three important ideas at once: + +- iterating over multidimensional buffers, +- handling boundaries explicitly, +- and reading neighboring cells. + + .. 
literalinclude:: ../../snippets/example/18_multidimKernel.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-multidimKernelStructure + :end-before: END-TUTORIAL-multidimKernelStructure + :dedent: + +The structure is still the same as in the one-dimensional tutorial: + +- ask the output buffer for its extents, +- build ``IdxRange{extents}`` to describe the full valid multidimensional index box, +- iterate with ``makeIdxMap``, +- guard the boundary cells, +- then update neighbor locations by adding or subtracting direction vectors from the current ``Vec`` index. + +This is the natural alpaka style for stencil code. +The project examples, such as the heat-equation stencil, operate on the multidimensional index directly and move to +neighbors with vector offsets instead of splitting ``x`` and ``y`` into separate scalars and rebuilding indices. + +Launching the 2D Kernel +----------------------- + +The host-side launch is unchanged except that both the problem extents and the frame extents are vectors now. + + .. literalinclude:: ../../snippets/example/18_multidimKernel.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-multidimKernelLaunch + :end-before: END-TUTORIAL-multidimKernelLaunch + :dedent: + +This is one of the main design strengths of *alpaka*: the launch flow remains stable while the data shape changes. +Only the extents and the kernel body become multidimensional. + +What Users Usually Need To Know Early +------------------------------------- + +The following habits are worth learning from the start: + +- Keep boundary handling explicit. A branch for the border is normal in stencil-like kernels. +- Iterate over the full valid problem range, not over guessed thread ids. +- Use multidimensional buffers when the algorithm has multidimensional neighbors. +- Keep reads and writes easy to see. Beginners make fewer mistakes when each output element is written once. 
+- Start with a clear kernel and a small test case before trying to optimize shared memory use or manual thread mapping. + +When to Use Manual Thread and Block Indices +------------------------------------------- + +There are cases where explicit thread or block indices are useful, for example: + +- implementing a very specific GPU mapping, +- using an algorithm that must reason about exact block-local cooperation, +- or porting low-level CUDA/HIP code step by step. + +That is not the best starting point for most kernels. +For beginner code, prefer ``FrameSpec`` plus ``makeIdxMap`` first. +Once the algorithm is correct and tested, you can move to more specialized mappings if profiling shows that you need them. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 18_multidimKernel.cpp + +.. literalinclude:: ../../snippets/example/18_multidimKernel.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/numerics.rst b/docs/source/tutorial/numerics.rst new file mode 100644 index 000000000..1fde3a65f --- /dev/null +++ b/docs/source/tutorial/numerics.rst @@ -0,0 +1,17 @@ +Algorithms and Numerics +======================= + +This section collects the higher-level algorithm helpers and numerical building blocks: +onHost algorithms, random-number generation, math functions, bit intrinsics, and warp-level tools. + +The pages are easiest to read as a progression: +first learn how to transform and summarize data, then add randomness and numerical functions, and only after that move to specialized tools such as bit intrinsics and warp-level operations. + +.. toctree:: + :maxdepth: 1 + + algorithms.rst + random.rst + math.rst + intrinsics.rst + warp.rst diff --git a/docs/source/tutorial/portingKernel.rst b/docs/source/tutorial/portingKernel.rst new file mode 100644 index 000000000..bce0352dd --- /dev/null +++ b/docs/source/tutorial/portingKernel.rst @@ -0,0 +1,71 @@ +Porting a Small Kernel +====================== + +For users coming from CUDA or HIP, SAXPY is a good first example because the original kernel is usually written with manual global-index arithmetic. +In alpaka, the ported kernel is simpler if you stop thinking about thread ids and instead ask for "all valid output elements". + +The Kernel +---------- + + .. literalinclude:: ../../snippets/example/36_portingKernel.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-portingKernel + :end-before: END-TUTORIAL-portingKernel + :dedent: + +What changed compared to the usual CUDA-style beginner kernel: + +- there is no manual ``blockIdx * blockDim + threadIdx`` arithmetic +- there is no explicit bounds guard around a hand-computed index +- the kernel body talks directly about the data index ``i`` + +This is the habit worth learning early. +It is not only shorter. +It also keeps the algorithm readable when the same kernel later runs on host executors or on a different GPU backend. 
+ +The Launch +---------- + + .. literalinclude:: ../../snippets/example/36_portingKernel.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-portingLaunch + :end-before: END-TUTORIAL-portingLaunch + :dedent: + +The launch still has the same ingredients migration users expect: + +- allocate device buffers +- copy inputs +- choose a frame shape +- enqueue the kernel +- copy the result back + +What alpaka removes is the need to hard-code the whole execution formula inside the kernel. + +Porting Rule of Thumb +--------------------- + +When porting a small CUDA, HIP, or SYCL kernel into alpaka: + +1. keep the mathematical operation first +2. replace manual global-index arithmetic with ``makeIdxMap`` +3. keep block-local concepts only if the algorithm really uses them +4. add shared memory, warp logic, or atomics only after the plain data-parallel version is correct + +That order makes migration much less error-prone. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 36_portingKernel.cpp + +.. literalinclude:: ../../snippets/example/36_portingKernel.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/queue.rst b/docs/source/tutorial/queue.rst index d32c24904..91f7a5746 100644 --- a/docs/source/tutorial/queue.rst +++ b/docs/source/tutorial/queue.rst @@ -37,3 +37,19 @@ If you do not pass ``queueKind`` as an argument, you will get a *non-blocking* q We will learn more about queue functions in later chapters. Before that, we need to deal with memory allocation, kernel writing, and events. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 06_queue.cpp + +.. literalinclude:: ../../snippets/example/06_queue.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/random.rst b/docs/source/tutorial/random.rst new file mode 100644 index 000000000..4bf134ae5 --- /dev/null +++ b/docs/source/tutorial/random.rst @@ -0,0 +1,218 @@ +Random Numbers +============== + +Parallel codes often need random numbers for Monte Carlo methods, randomized initialization, sampling, or synthetic test data. +alpaka provides random engines and distributions that can be used directly inside kernels. +In the current alpaka code base, the practical starting point is: + +- ``rand::engine::Philox4x32x10`` as the engine, +- ``rand::distribution::UniformReal`` for bounded floating-point samples, +- ``rand::distribution::NormalReal`` for Gaussian noise. + +The important beginner rule is simple: each worker should get its own deterministic engine state. +The easiest way to do that is to derive the seed from the loop index. + +This chapter connects to two of the recurring tutorial examples: + +- Monte Carlo estimation of pi, where each worker draws points and contributes to a global count, +- and image or signal processing examples, where random values are used as synthetic input or as noise added around a clean signal. + +Uniform Random Numbers in a Kernel +---------------------------------- + + .. literalinclude:: ../../snippets/example/30_random.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-randomKernel + :end-before: END-TUTORIAL-randomKernel + :dedent: + +This example uses: + +- ``rand::engine::Philox4x32x10`` as the random engine, +- ``rand::distribution::UniformReal`` as the distribution, +- and ``rand::interval::co`` for the half-open interval ``[0, 1)``. + +Launching the Kernel +-------------------- + + .. 
literalinclude:: ../../snippets/example/30_random.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-randomLaunch + :end-before: END-TUTORIAL-randomLaunch + :dedent: + +This style follows the alpaka random example and the unit tests: +the kernel stays data-parallel, and the engine state is derived from a stable seed plus a stable worker index. + +Which Distributions Are Available +--------------------------------- + +For most beginner use cases, these are the two distributions to know first: + +- ``UniformReal`` for samples in a bounded floating-point interval +- ``NormalReal`` for Gaussian-distributed samples with a chosen mean and standard deviation + +Uniform distributions are the natural tool for probabilities, random offsets, randomized initialization, and rejection sampling. +Normal distributions are the natural tool for noise models, perturbations around a mean value, and many Monte Carlo methods. + +A classic beginner example is Monte Carlo estimation of pi: +draw points in the square ``[0, 1) x [0, 1)``, count how many land inside the unit quarter circle, and estimate ``pi`` from that ratio. +That is a good example of why ``rand::interval::co`` is such a natural default. +The half-open interval matches array-style reasoning and avoids awkward endpoint corner cases. +It also connects naturally to later chapters: +the random chapter gives you the samples, the reduction chapter gives you the accumulation, and the tuning chapter gives you the questions to ask once the first correct version works. + +Tiny Monte Carlo Pi +------------------- + +The following example turns that idea into a minimal alpaka workflow: +each worker draws one point, writes ``1`` if the point falls inside the quarter circle, and then a reduction adds up all hits. + + .. 
literalinclude:: ../../snippets/example/31_monteCarloPi.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-piKernel + :end-before: END-TUTORIAL-piKernel + :dedent: + +The launch and accumulation step stay compact because the reduction happens on the same queue right after the kernel. + + .. literalinclude:: ../../snippets/example/31_monteCarloPi.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-piLaunch + :end-before: END-TUTORIAL-piLaunch + :dedent: + +After copying back the single reduction result, the estimate itself is just the usual Monte Carlo formula. + + .. literalinclude:: ../../snippets/example/31_monteCarloPi.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-piEstimate + :end-before: END-TUTORIAL-piEstimate + :dedent: + +This is a good anchor example because it combines three ideas from the tutorial in one small program: + +- random numbers generate the sample points, +- a plain kernel classifies each point, +- and a reduction turns many local decisions into one global estimate. + +Intervals +--------- + +``UniformReal`` supports four interval tags: + +- ``rand::interval::co`` gives ``[a, b)`` +- ``rand::interval::oc`` gives ``(a, b]`` +- ``rand::interval::cc`` gives ``[a, b]`` +- ``rand::interval::oo`` gives ``(a, b)`` + +The following kernel shows all four forms side by side. + + .. 
literalinclude:: ../../snippets/example/30_random.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-randomIntervalsKernel + :end-before: END-TUTORIAL-randomIntervalsKernel + :dedent: + +The interval choice matters more than it may seem at first: + +- ``[a, b)`` is the safest default for probabilities, normalized coordinates, and bucket selection because the upper bound is excluded +- ``(a, b]`` is useful when zero would be problematic but the upper endpoint may remain valid +- ``[a, b]`` is useful when both endpoints are meaningful model values and exact endpoint hits are acceptable +- ``(a, b)`` is useful when neither endpoint is safe, for example before applying ``log(x)`` or in transforms that must avoid both ``0`` and ``1`` + +As a practical rule, if you are unsure, start with ``rand::interval::co``. +It is the most familiar half-open interval and avoids the common “sample equals upper bound” surprise. + +Practical Uniform Examples +-------------------------- + +Here are simple ways to think about the interval choices: + +- ``UniformReal{0.0f, 1.0f, rand::interval::co}`` for probabilities and histogram/bin mapping +- ``UniformReal{-0.5f, 0.5f, rand::interval::oo}`` for symmetric jitter where neither edge should be hit +- ``UniformReal{0.0f, maxTimeStep, rand::interval::cc}`` when both exact endpoints are acceptable outcomes + +Normal Distribution +------------------- + +``NormalReal`` generates Gaussian noise with a chosen mean and standard deviation. +Unlike the uniform distribution, it keeps internal state, so each worker should create and use its own distribution object. + + .. literalinclude:: ../../snippets/example/30_random.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-randomNormalKernel + :end-before: END-TUTORIAL-randomNormalKernel + :dedent: + +Launching the kernel is the same as before; only the kernel logic changes. + + .. 
literalinclude:: ../../snippets/example/30_random.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-randomNormalLaunch + :end-before: END-TUTORIAL-randomNormalLaunch + :dedent: + +This is useful for small, realistic teaching examples such as: + +- adding sensor noise around a known signal, +- perturbing particles around a mean position, +- or initializing values around a nominal operating point instead of drawing from a flat interval. + +Seen together with the image-style examples elsewhere in the tutorial, this is the natural way to think about Gaussian noise: +start from a clean signal or image and add a small random perturbation around the nominal value. + +Practical Advice +---------------- + +- Seed each worker deterministically from its index or from a reproducible application-level seed sequence. +- Pick the distribution that matches the quantity you really need; do not generate integers and reinterpret them by hand unless you have a clear reason. +- Use ``rand::interval::co`` by default for bounded uniform samples unless the algorithm has a specific endpoint requirement. +- Use ``rand::interval::oo`` when a later formula would break on ``0`` or ``1``. +- Keep ``NormalReal`` local to the worker because it has internal state. +- Keep the engine local to the worker unless you have a stronger state-management scheme. +- If you need a histogram or similar aggregation of random samples, combine this chapter with the atomics chapter rather than trying to share one engine across threads. 
+ +Common Mistakes +--------------- + +- sharing one random engine across several workers +- seeding all workers with the same value and then expecting independent samples +- choosing the wrong interval for a later formula such as ``log(x)`` or bucket mapping +- treating random initialization and random sampling as if they required different kernel structure + +Where To Go Next +---------------- + +- read :doc:`algorithms` with the Monte Carlo pi example in mind if you want to summarize random samples +- read :doc:`atomics` if random samples are written into shared bins or histograms +- read :doc:`tuning` once the first correct random kernel is working and you want to scale it up + +Complete Source Files +--------------------- + +.. raw:: html + +
+ 30_random.cpp + +.. literalinclude:: ../../snippets/example/30_random.cpp + :language: cpp + :linenos: + +.. raw:: html + +
+ +.. raw:: html + +
+ 31_monteCarloPi.cpp + +.. literalinclude:: ../../snippets/example/31_monteCarloPi.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/sharedMemory.rst b/docs/source/tutorial/sharedMemory.rst new file mode 100644 index 000000000..265f272f6 --- /dev/null +++ b/docs/source/tutorial/sharedMemory.rst @@ -0,0 +1,189 @@ +Shared Memory +============= + +Shared memory is memory local to a thread block or frame. +It is useful when several threads in the same block need to reuse the same data or communicate through a fast local tile. +In alpaka there are three common forms: + +- a single shared value with ``declareSharedVar``, +- a fixed-size shared array or tile with ``declareSharedMdArray``, +- and dynamic shared memory with ``getDynSharedMem`` when the size is only known at launch time. + +When Shared Memory Helps +------------------------ + +Typical use cases are: + +- tiled stencil kernels, +- block-local reductions and scans, +- transposes, +- and small reusable working sets loaded once and consumed many times. + +A Single Shared Value +--------------------- + +Not every shared-memory kernel needs a tile. Sometimes one shared scalar is enough. +The next example computes one block-local sum in a shared variable. + + .. literalinclude:: ../../snippets/example/16_sharedMemory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-sharedScalarKernel + :end-before: END-TUTORIAL-sharedScalarKernel + :dedent: + +This pattern is useful for block-local counters, flags, or partial reductions. +The important detail is that the scalar still belongs to the whole block, not to one thread. +One thread initializes it, the block synchronizes, all participating threads update it, and then the block synchronizes again before any thread consumes the final value. +You can think of this as the smallest useful shared-memory example behind a histogram bin count or a block-local vote such as "did any pixel in this tile exceed the threshold?" 
+ +A Small Tiled Example +--------------------- + +The following kernel loads one frame into shared memory, synchronizes the block, and then writes the frame in reverse order. + + .. literalinclude:: ../../snippets/example/16_sharedMemory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-sharedKernel + :end-before: END-TUTORIAL-sharedKernel + :dedent: + +The important steps are: + +1. declare block-local shared memory, +2. cooperatively fill it, +3. synchronize the block, +4. read from the shared tile. + +The "reverse order" work is only there to keep the example small. +The same structure is what you would use in more realistic kernels: + +- load a small image tile before applying a blur or stencil, +- stage a matrix tile before a transpose or matrix multiply step, +- or cache a short chunk of data before several neighboring threads reuse it. + +Launching a Shared-Memory Kernel +-------------------------------- + + .. literalinclude:: ../../snippets/example/16_sharedMemory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-sharedLaunch + :end-before: END-TUTORIAL-sharedLaunch + :dedent: + +This example uses ``CVec`` for the frame extent because compile-time-known extents are the simplest way to express a fixed shared-memory tile. + +Dynamic Shared Memory +--------------------- + +Dynamic shared memory is useful when the amount of temporary storage depends on the launch configuration or the kernel arguments. +In alpaka you allocate it indirectly: the runtime reserves a byte buffer for each block, and the kernel accesses it through ``onAcc::getDynSharedMem(acc)``. + +There are two supported ways to tell alpaka how many bytes to reserve. + +Dynamic Size Through a Kernel Member +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The most direct option is to give the kernel object a public ``uint32_t dynSharedMemBytes`` member. +This works well when the required size is already known when the kernel object is created. + + .. 
literalinclude:: ../../snippets/example/16_sharedMemory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-dynSharedMemberKernel + :end-before: END-TUTORIAL-dynSharedMemberKernel + :dedent: + +When you launch that kernel, set the byte count in the kernel object itself. + + .. code-block:: cpp + + auto frameSpec = onHost::FrameSpec{1u, CVec<uint32_t, 8u>{}}; + queue.enqueue( + frameSpec, + KernelBundle{ + DynamicReverseKernel{static_cast<uint32_t>(hostInput.size() * sizeof(int))}, + outputBuffer, + inputBuffer}); + +This form is simple and readable, but it is intentionally limited: the size can only depend on data you put into the kernel object. + +Dynamic Size Through ``BlockDynSharedMemBytes`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +When the size should depend on the executor, the thread specification, or the kernel arguments, alpaka uses a trait specialization. +That is what the unit tests exercise as the second dynamic-shared-memory path. + + .. literalinclude:: ../../snippets/example/16_sharedMemory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-dynSharedTraitSpec + :end-before: END-TUTORIAL-dynSharedTraitSpec + :dedent: + +The kernel itself still uses ``getDynSharedMem`` in the normal way. + + .. literalinclude:: ../../snippets/example/16_sharedMemory.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-dynSharedTraitKernel + :end-before: END-TUTORIAL-dynSharedTraitKernel + :dedent: + +This form is the more flexible one because the trait call can inspect: + +- the executor, +- the thread specification, +- and the kernel arguments passed through ``KernelBundle``. + +Use the trait form when the shared-memory size should follow the launch shape or the runtime arguments, and use the member form when a fixed byte count on the kernel object is already enough. + +If you provide neither a ``dynSharedMemBytes`` member nor a ``BlockDynSharedMemBytes`` specialization, alpaka reserves no dynamic shared memory for that kernel. 
+On host executors this is intentionally guarded so that accidental ``getDynSharedMem`` usage fails cleanly instead of silently returning an invalid buffer. + +Practical Advice +---------------- + +- Shared memory is local to one block. Different blocks cannot see each other's shared data. +- Shared memory is not initialized automatically. +- Every thread that reads shared data written by other threads usually needs a block synchronization first. +- Reusing the same shared-memory id returns the same storage again; a different id gives you different storage. +- ``declareSharedVar`` is the natural choice for one shared scalar or one small fixed object. +- ``declareSharedMdArray`` is the natural choice for tiles and multidimensional workspaces. +- ``getDynSharedMem`` is the natural choice when the temporary size depends on the launch or the input. +- Start with small tiles and a simple mapping before trying to micro-optimize the memory layout. + +Shared memory is one of the main tools for moving from a correct kernel to a faster kernel, but only after the simpler global-memory version is already correct and tested. +In practice, a good beginner workflow is: + +1. write the plain global-memory version first, +2. measure it, +3. identify a reused working set such as an image tile or a short reduction chunk, +4. then move only that reused data into shared memory. 
+ +Common Mistakes +--------------- + +- treating shared memory as if different blocks could see the same storage +- reading shared values before a required block synchronization +- introducing shared memory before checking that the data is actually reused +- using dynamic shared memory when a small fixed tile would already be simpler and clearer + +Where To Go Next +---------------- + +- read :doc:`multidim` if your shared-memory use is tied to a 2D or 3D stencil +- read :doc:`chunked` if you want to think about frames as reusable tiles +- read :doc:`miniProject` for a small image pipeline that can later be optimized with shared-memory tiles + +Complete Source File +-------------------- + +.. raw:: html + +
+ 16_sharedMemory.cpp + +.. literalinclude:: ../../snippets/example/16_sharedMemory.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/tuning.rst b/docs/source/tutorial/tuning.rst new file mode 100644 index 000000000..5c7dfb869 --- /dev/null +++ b/docs/source/tutorial/tuning.rst @@ -0,0 +1,75 @@ +Performance Portability and Tuning +================================== + +Users switching from CUDA or HIP often ask the same question very early: +"what should I tune first without destroying portability?" + +The safest tuning order in alpaka is: + +1. choose a sensible frame shape +2. improve data locality with tiles or chunked work +3. use shared memory when data is reused +4. reduce atomic pressure with hierarchical accumulation +5. only then reach for warp-local optimization + +What to Tune First +------------------ + +The first knob is almost always the frame or tile shape. +That is why the tutorial introduces ``getFrameSpec``, chunked kernels, and hierarchical kernels early. + +Good first questions are: + +- is the data naturally 1D, 2D, or 3D? +- should one block own a tile, a row stripe, or a chunk? +- is there a naturally one-dimensional inner direction for warp-local work? + +Small practical examples help here: + +- for SAXPY or a point-wise transform, a simple 1D frame is usually the natural starting point, +- for an image blur or stencil, a 2D tile is usually easier to reason about than flattening everything immediately, +- for a histogram, the first performance question is often not warp code at all but whether each block can accumulate locally before touching global atomics. 
+ +What Usually Comes Later +------------------------ + +These are useful optimization tools, but they are usually not the first step: + +- shared memory +- dynamic shared memory +- warp shuffle operations +- block-local atomics +- more specialized executor control with ``ThreadSpec`` + +Common Migration Mistakes +------------------------- + +- hard-coding a CUDA-style block size before measuring +- assuming the best GPU layout is also the best CPU layout +- introducing shared memory before checking whether there is any data reuse +- using device-wide atomics when block-local accumulation would work +- writing subgroup-specific logic before the plain data-parallel version is validated + +One useful beginner habit is to keep one concrete workload in mind while tuning. +For example: + +- an image blur asks about tile shape and shared-memory reuse, +- a Monte Carlo pi kernel asks about random-number generation and reduction strategy, +- a histogram asks about contention and atomic scope, +- and a stencil asks about neighborhood reuse and boundary handling. + +That makes the tuning choices feel less like rules to memorize and more like answers to a concrete data movement problem. + +How to Use the Existing Tutorial Material +----------------------------------------- + +For tuning in alpaka, these chapters are the main reference points: + +- :doc:`kernel` for frame selection +- :doc:`hierarchy` for blocks, threads, and warps +- :doc:`sharedMemory` for local caching +- :doc:`chunked` for tile-based work decomposition +- :doc:`atomics` for conflicting updates +- :doc:`warp` for subgroup-level optimization + +That is also the recommended order for introducing performance-oriented features into a ported code base. 
diff --git a/docs/source/tutorial/vector.rst b/docs/source/tutorial/vector.rst index 48392de26..dc9629394 100644 --- a/docs/source/tutorial/vector.rst +++ b/docs/source/tutorial/vector.rst @@ -126,3 +126,19 @@ In this example it can only be validated with ``static_assert()`` because the op :start-after: BEGIN-TUTORIAL-CVecOp :end-before: END-TUTORIAL-CVecOp :dedent: + +Complete Source File +-------------------- + +.. raw:: html + +
+ 00_vector.cpp + +.. literalinclude:: ../../snippets/example/00_vector.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/vendorInterop.rst b/docs/source/tutorial/vendorInterop.rst new file mode 100644 index 000000000..4cf255c84 --- /dev/null +++ b/docs/source/tutorial/vendorInterop.rst @@ -0,0 +1,124 @@ +Vendor and Third-Party Interop +============================== + +Sooner or later, many alpaka users want to keep a vendor library in one backend-specific path instead of rewriting everything as a plain alpaka kernel. +That is a normal use case. +You might want to call ``thrust::transform`` on CUDA, ``rocPRIM`` on HIP, a oneAPI library on SYCL, or even a CPU-side library function on the host backend. + +alpaka provides a function-symbol interface for exactly that job. +The idea is simple: + +- define one logical operation, +- specialize implementations for the backends that have a special vendor path, +- keep one generic alpaka fallback for the rest. + +The caller still sees one function call. +The queue or device specification decides which implementation is dispatched. + +The example in this chapter is easiest to picture as a tiny image-processing operation. +Each input value can be read as a pixel intensity, and the operation computes ``scale * value + shift``. +That is the same shape as a brightness-and-contrast adjustment on one grayscale image row. + +Defining a Dispatchable Function +-------------------------------- + + .. literalinclude:: ../../snippets/example/38_vendorInterop.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-vendorSymbol + :end-before: END-TUTORIAL-vendorSymbol + :dedent: + +``ALPAKA_FN_SYMBOL`` defines the public function symbol. +The fallback choice tells alpaka that it may call the generic alpaka implementation when no vendor-specific overload can be dispatched. + +Registering a Generic alpaka Fallback +------------------------------------- + + .. 
literalinclude:: ../../snippets/example/38_vendorInterop.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-vendorFallback + :end-before: END-TUTORIAL-vendorFallback + :dedent: + +This overload is the portable baseline. +It works on every backend that can run the normal alpaka algorithm path, so it is a good default even when you later add CUDA-, HIP-, or SYCL-specific overloads. +The affine operation itself is spelled out as a tiny named functor so the tutorial still shows the callable logic directly even though backend-compatible code cannot use the original local lambda form here: + + .. literalinclude:: ../../snippets/example/38_vendorInterop.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-vendorFunctor + :end-before: END-TUTORIAL-vendorFunctor + :dedent: + +Registering a Backend-Specific Overload +--------------------------------------- + + .. literalinclude:: ../../snippets/example/38_vendorInterop.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-vendorHost + :end-before: END-TUTORIAL-vendorHost + :dedent: + +This example uses ``std::transform`` as a small stand-in for a third-party backend function. +The pattern is the same when the backend-specific code comes from a GPU vendor library. +On CUDA, for example, this is where you would pass ``queue.getNativeHandle()`` to a library that expects a CUDA stream and then call the vendor routine there. + +The important part is the ``Spec`` type: + +- it states which backend the overload belongs to, +- it keeps the backend choice out of the call site, +- and it lets the same public function symbol dispatch differently for different queues and devices. + +Calling the Function +-------------------- + + .. literalinclude:: ../../snippets/example/38_vendorInterop.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-vendorCall + :end-before: END-TUTORIAL-vendorCall + :dedent: + +The call itself stays simple. +You pass the queue and the ordinary data arguments. 
+alpaka looks at the queue's backend information and forwards the call to the best matching overload. + +How This Generalizes +-------------------- + +The same structure works for more than transform-like functions. +You can use it for: + +- vendor reductions and scans, +- BLAS or FFT library calls, +- image-processing kernels, +- custom memory operations, +- or any other third-party function that should only run on one subset of backends. + +In practice the recipe is: + +1. choose one clean public function signature, +2. keep the arguments backend-neutral, +3. specialize backend-specific overloads with ``fnDispatch``, +4. keep one alpaka fallback when possible, +5. use the queue's native handle inside the backend-specific overload if the vendor API expects a native stream or queue. + +That last point is what usually matters for CUDA, HIP, and SYCL integrations. +The caller remains pure alpaka code, while the backend-specific overload is free to bridge from the alpaka queue into the vendor runtime. +Seen from the outside, the code still reads like "apply this operation to my data." +That is exactly the separation you want: portable call site, backend-specific implementation detail. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 38_vendorInterop.cpp + +.. literalinclude:: ../../snippets/example/38_vendorInterop.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/views.rst b/docs/source/tutorial/views.rst new file mode 100644 index 000000000..a50b1ab08 --- /dev/null +++ b/docs/source/tutorial/views.rst @@ -0,0 +1,68 @@ +Views and Subviews +================== + +Buffers own memory. +Views do not. +A view is the right tool when you want to describe existing data without copying or reallocating it. + +This matters in two common beginner situations: + +- you already have a host container such as ``std::vector`` and want to use it with *alpaka*, +- or you want to work on only part of a buffer, for example a slice, halo region, or tile. + +A good mental picture is image processing. +The full image may live in one owning buffer, but many operations only touch a crop, one color plane, or the interior pixels without the boundary. +Those smaller regions are natural views. + +Creating a View +--------------- + +You can create a non-owning view from a host container and then derive a subview from it. + + .. literalinclude:: ../../snippets/example/11_views.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-viewCreation + :end-before: END-TUTORIAL-viewCreation + :dedent: + +This is useful when the data already exists and you want to keep using the original storage. +It also makes function interfaces simpler because kernels and helper functions can accept views without caring who owns the memory. +For example, a stencil update often wants the interior cells only, while a boundary kernel wants a narrow halo view around the edge. + +Copying Through a View +---------------------- + +Views work with the usual memory operations. +That means you can allocate device memory based on a view and copy only the relevant slice back. + + .. 
literalinclude:: ../../snippets/example/11_views.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-viewCopy + :end-before: END-TUTORIAL-viewCopy + :dedent: + +Typical use cases: + +- copying a subrange of a 1D vector, +- copying only the active interior of a 2D grid, +- passing a tile into a helper function, +- and reusing kernel code with both owning buffers and non-owning views. + +For beginners, the main rule is simple: own data with buffers, describe data with views. +If you imagine "crop this image" or "ignore the outer ghost cells", you are already thinking in the right direction. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 11_views.cpp + +.. literalinclude:: ../../snippets/example/11_views.cpp + :language: cpp + :linenos: + +.. raw:: html + +
diff --git a/docs/source/tutorial/warp.rst b/docs/source/tutorial/warp.rst new file mode 100644 index 000000000..7211af586 --- /dev/null +++ b/docs/source/tutorial/warp.rst @@ -0,0 +1,76 @@ +Warp and Subgroup Functions +=========================== + +Some algorithms need communication inside a warp or subgroup. +This is a lower-level tool than the earlier tutorial chapters, but it is still important for reductions, scans, voting, and specialized GPU kernels. + +If you know CUDA, these functions are analogous to warp intrinsics. +If you know SYCL, they are conceptually similar to subgroup communication. + +When to Reach for Warp Functions +-------------------------------- + +Use warp functions when: + +- you want fast communication among threads that execute in lock-step, +- you are implementing a reduction or prefix-style pattern inside a warp, +- or you need ballot-style voting or lane-to-lane value exchange. + +Do not start here for ordinary element-wise kernels. +For most beginner kernels, ``makeIdxMap`` over the data remains the right first solution. + +A Warp Reduction With ``shflDown`` +---------------------------------- + +The following example reduces one value per lane to one value per warp. +It still uses ``makeIdxMap`` to assign block-local work, but the reduction inside the warp is handled with shuffle operations. + + .. literalinclude:: ../../snippets/example/26_warp.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-warpKernel + :end-before: END-TUTORIAL-warpKernel + :dedent: + +Launching the Kernel +-------------------- + + .. literalinclude:: ../../snippets/example/26_warp.cpp + :language: cpp + :start-after: BEGIN-TUTORIAL-warpLaunch + :end-before: END-TUTORIAL-warpLaunch + :dedent: + +Important rules: + +- All participating threads must call the same warp intrinsic in a compatible control-flow region. +- Use the actual warp size reported by the backend instead of hard-coding ``32``. 
+- Prefer warp functions for cooperation inside a subgroup, not for general global indexing. +- On host backends, the warp size can be ``1``. The code still compiles and runs, but the subgroup behavior is naturally trivial there. + +Beyond ``shflDown`` +------------------- + +Other useful warp functions include: + +- ``onAcc::warp::shfl`` to broadcast from a chosen lane, +- ``onAcc::warp::shflUp`` and ``onAcc::warp::shflXor`` for other exchange patterns, +- ``onAcc::warp::all`` and ``onAcc::warp::any`` for voting, +- ``onAcc::warp::ballot`` for predicate masks. + +These are powerful tools, but they are best introduced after you are comfortable with ordinary data-parallel kernels. + +Complete Source File +-------------------- + +.. raw:: html + +
+ 26_warp.cpp + +.. literalinclude:: ../../snippets/example/26_warp.cpp + :language: cpp + :linenos: + +.. raw:: html + +