From 9553890e486c343d853702825e8bcdd51b409060 Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Thu, 18 Dec 2025 14:32:35 +0100 Subject: [PATCH 01/11] Add lane masks bit-shift example Signed-off-by: Jan Stephan --- .../Porting-CUDA-code-to-HIP/CMakeLists.txt | 1 + .../Porting-CUDA-code-to-HIP/Makefile | 1 + .../lane_masks_bit_shift/.gitignore | 1 + .../lane_masks_bit_shift/CMakeLists.txt | 67 +++++ .../lane_masks_bit_shift/Makefile | 59 ++++ .../lane_masks_bit_shift/README.md | 210 ++++++++++++++ .../lane_masks_bit_shift_vs2017.sln | 24 ++ .../lane_masks_bit_shift_vs2017.vcxproj | 131 +++++++++ ...ane_masks_bit_shift_vs2017.vcxproj.filters | 6 + .../lane_masks_bit_shift_vs2019.sln | 24 ++ .../lane_masks_bit_shift_vs2019.vcxproj | 131 +++++++++ ...ane_masks_bit_shift_vs2019.vcxproj.filters | 6 + .../lane_masks_bit_shift_vs2022.sln | 24 ++ .../lane_masks_bit_shift_vs2022.vcxproj | 131 +++++++++ ...ane_masks_bit_shift_vs2022.vcxproj.filters | 6 + .../lane_masks_bit_shift/main.hip | 258 ++++++++++++++++++ README.md | 1 + ROCm-Examples-Portable-VS2017.sln | 5 + ROCm-Examples-Portable-VS2019.sln | 5 + ROCm-Examples-Portable-VS2022.sln | 5 + ROCm-Examples-VS2017.sln | 6 + ROCm-Examples-VS2019.sln | 6 + ROCm-Examples-VS2022.sln | 7 + 23 files changed, 1115 insertions(+) create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/.gitignore create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/Makefile create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/README.md create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.sln create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj.filters create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.sln create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj.filters create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.sln create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj.filters create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt index 10f1a3c46..3369b5cb8 100644 --- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt @@ -40,5 +40,6 @@ else() add_subdirectory(load_module_ex_cuda) endif() +add_subdirectory(lane_masks_bit_shift) add_subdirectory(per_thread_default_stream) add_subdirectory(pointer_memory_type) diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/Makefile b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/Makefile index 50c5338ef..773a15d9b 100644 --- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/Makefile +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/Makefile @@ -26,6 +26,7 @@ EXAMPLES := \ host_code_feature_identification \ identifying_compilation_target_platform \ identifying_host_device_compilation_pass \ + lane_masks_bit_shift \ load_module \ load_module_ex \ per_thread_default_stream \ diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/.gitignore b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/.gitignore new file mode 100644 index 000000000..b71f55e74 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/.gitignore @@ -0,0 +1 @@ +hip_lane_masks_bit_shift diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt new file mode 100644 index 000000000..2da61faf4 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt @@ -0,0 +1,67 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +set(example_name hip_lane_masks_bit_shift) + +cmake_minimum_required(VERSION 3.21 FATAL_ERROR) +project(${example_name} LANGUAGES CXX) + +include("${CMAKE_CURRENT_LIST_DIR}/../../../../Common/HipPlatform.cmake") +select_gpu_language() +enable_language(${ROCM_EXAMPLES_GPU_LANGUAGE}) +select_hip_platform() + +if(ROCM_EXAMPLES_GPU_LANGUAGE STREQUAL "CUDA") + # The example calls hipDeviceGetAttribute which internally uses the CUDA driver API. We must explicitly link the + # driver library. + find_package(CUDAToolkit REQUIRED) +endif() + +if(CMAKE_SYSTEM_NAME MATCHES "Windows") + set(ROCM_ROOT + "$ENV{HIP_PATH}" + CACHE PATH + "Root directory of the ROCm installation" + ) +else() + set(ROCM_ROOT + "/opt/rocm" + CACHE PATH + "Root directory of the ROCm installation" + ) +endif() + +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + +add_executable(${example_name} main.hip) +# Make example runnable using ctest +add_test(NAME ${example_name} COMMAND ${example_name}) + +set_source_files_properties(main.hip PROPERTIES LANGUAGE ${ROCM_EXAMPLES_GPU_LANGUAGE}) + +set_target_properties(${example_name} PROPERTIES $<$:CUDA_EXTENSIONS OFF> + $<$:HIP_EXTENSIONS OFF>) +target_compile_features(${example_name} PRIVATE cuda_std_17 hip_std_17) +target_include_directories(${example_name} PRIVATE $<$:${ROCM_ROOT}/include>) +target_link_libraries(${example_name} PRIVATE $<$:CUDA::cuda_driver>) + +install(TARGETS ${example_name}) diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/Makefile b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/Makefile new file mode 100644 index 000000000..d5ce838e0 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/Makefile @@ -0,0 +1,59 @@ +# MIT License +# +# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +EXAMPLE := hip_lane_masks_bit_shift +GPU_RUNTIME ?= HIP + +# HIP variables +ROCM_INSTALL_DIR ?= /opt/rocm +HIP_INCLUDE_DIR := $(ROCM_INSTALL_DIR)/include + +HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc + +# Common variables and flags +CXX_STD := c++17 +ICXXFLAGS := -std=$(CXX_STD) +ICPPFLAGS := +ILDFLAGS := +ILDLIBS := + +ifeq ($(GPU_RUNTIME), CUDA) + ICXXFLAGS += -x cu + ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR) +else ifeq ($(GPU_RUNTIME), HIP) + CXXFLAGS ?= -Wall -Wextra +else + $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP) +endif + +ICXXFLAGS += $(CXXFLAGS) +ICPPFLAGS += $(CPPFLAGS) +ILDFLAGS += $(LDFLAGS) +ILDLIBS += $(LDLIBS) + +$(EXAMPLE): main.hip + $(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS) + +clean: + $(RM) $(EXAMPLE) + +.PHONY: clean diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/README.md b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/README.md new file mode 100644 index 000000000..5a1f369b7 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/README.md @@ -0,0 +1,210 @@ +# HIP-Doc Lane Masks Bit-Shift Example + +## Description + +This example demonstrates how to write portable HIP code that correctly handles +lane masks and bit-shift operations across different GPU architectures with +varying warp sizes. It highlights a common portability issue when porting CUDA +code to HIP and provides best-practice solutions. + +For more information on this topic, please refer to the +[HIP Porting Guide - Lane masks bit-shift section](https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_porting_guide.html#lane-masks-bit-shift). + +### Background + +A thread in a warp is also called a lane, and a lane mask is a bitmask where +each bit corresponds to a thread in a warp. Lane masks are commonly used in +warp-level operations for tasks like warp reduction, ballot operations, and +parallel algorithms. + +The critical portability issue arises from the fact that: + +- **AMD RDNA architectures** and **NVIDIA GPUs** have a warp size of 32 +- **AMD GCN/CDNA architectures** have a warp size of 64 + +When using bit-shift operations to create lane masks, using 32-bit integer +literals (like `1 << laneId`) will overflow on AMD GPUs with a warp size of 64 +and `laneId >= 32`, leading to incorrect results. + +### Application Flow + +The example demonstrates three approaches to handling lane masks: + +1. **Problematic Implementation** (32-bit, non-portable) + - Uses 32-bit integer literals for bit-shift operations + - Only safe for warp sizes ≤ 32 + - Demonstrates the problem that needs to be avoided + +2. **Improved Implementation** (64-bit, explicit) + - Uses 64-bit integer literals (`1ull`) for bit-shift operations + - Works correctly for all warp sizes + - Explicit but not architecture-specific + +3. **Best Practice Implementation** (portable) + - Uses a portable `lane_mask_t` typedef that adapts to the architecture + - Provides architecture-specific optimizations + - Recommended approach for production code + +For each approach, the example: + +- Queries the device's warp size +- Executes kernels performing warp reduction with bit-shift operations +- Stores and displays the results +- Verifies correctness by comparing results + +## Key APIs and Concepts + +- `warpSize` - Built-in constant that represents the warp size in device code +- `hipDeviceGetAttribute` - Queries the device's warp size on the host side +- **Lane masks** - Bitmasks representing active threads in a warp +- **Bit-shift operations** - Creating masks and selecting threads using bit + shifting +- **Architecture-specific typedefs** - Using conditional compilation for + portability +- `hipcub::WarpReduce` - Warp-level reduction primitive from the hipCUB library + +### Problem Demonstration + +```cpp +// PROBLEMATIC - Only works on warp size ≤ 32 +unsigned int laneId = threadIdx.x % warpSize; +std::uint32_t mask = (1 << laneId) - 1; // Overflow when laneId >= 32! +``` + +### Solution 1: Explicit 64-bit + +```cpp +// IMPROVED - Works on all current GPUs +unsigned int laneId = threadIdx.x % warpSize; +std::uint64_t mask = (1ull << laneId) - 1; // Uses 64-bit literal +``` + +### Solution 2: Portable Typedef (Recommended) + +```cpp +// BEST PRACTICE - Architecture-specific optimization +#if defined(__GFX8__) || defined(__GFX9__) +typedef std::uint64_t lane_mask_t; +#else +typedef std::uint32_t lane_mask_t; +#endif + +unsigned int laneId = threadIdx.x % warpSize; +lane_mask_t mask = (lane_mask_t{1} << laneId) - 1; // Portable! +``` + +## Building + +### Linux + +#### CMake + +```bash +mkdir build && cd build +cmake .. # or cmake -D GPU_RUNTIME=CUDA .. for CUDA +make +``` + +#### Make + +```bash +make # or make GPU_RUNTIME=CUDA for CUDA +``` + +### Windows + +#### Visual Studio + +Open one of the Visual Studio solution files and build the project: + +- `lane_masks_bit_shift_vs2017.sln` for Visual Studio 2017 +- `lane_masks_bit_shift_vs2019.sln` for Visual Studio 2019 +- `lane_masks_bit_shift_vs2022.sln` for Visual Studio 2022 + +#### CMake + +```shell +cmake -G Ninja -S . -B build +cmake --build build +``` + +## Running + +After building, run the executable: + +```shell +./hip_lane_masks_bit_shift # Linux +hip_lane_masks_bit_shift.exe # Windows +``` + +### Expected Output + +On a GPU with warp size 64: + +```plaintext +=== Device Information === +Device name: AMD Instinct GPU +Warp size: 64 + +=== Test 1: Skipped === +Problematic 32-bit kernel skipped (warpSize=64 would cause overflow) + +=== Test 2: Improved 64-bit Implementation === +Results: 9223372036854775808 9223372036854775808 + +=== Test 3: Best Practice Portable Implementation === +Using lane_mask_t (64-bit on this architecture) +Results: 9223372036854775808 9223372036854775808 + +=== Verification === +64-bit and portable results match: YES + +SUCCESS: All kernels produced correct results! +``` + +On a GPU with warp size 32: + +```plaintext +=== Device Information === +Device name: AMD Radeon GPU +Warp size: 32 + +=== Test 1: Problematic 32-bit Implementation === +Running on warpSize=32 (safe for this architecture) +Results: 2147483648 2147483648 2147483648 2147483648 + +=== Test 2: Improved 64-bit Implementation === +Results: 2147483648 2147483648 2147483648 2147483648 + +=== Test 3: Best Practice Portable Implementation === +Using lane_mask_t (32-bit on this architecture) +Results: 2147483648 2147483648 2147483648 2147483648 + +=== Verification === +64-bit and portable results match: YES + +SUCCESS: All kernels produced correct results! +``` + +## Demonstrated API Calls + +### HIP Runtime + +#### Device Symbols + +- `threadIdx` - Thread index within a block +- `warpSize` - Built-in constant for warp size in device code + +#### Host Symbols + +- `hipDeviceGetAttribute` - Query device attributes +- `hipGetDeviceProperties` - Get device properties +- `hipMalloc` - Allocate device memory +- `hipMemcpy` - Copy memory between host and device +- `hipFree` - Free device memory +- `hipDeviceSynchronize` - Synchronize device execution +- `hipGetLastError` - Get last error from runtime + +### hipCUB + +- `hipcub::WarpReduce` - Warp-level reduction operations diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.sln b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.sln new file mode 100644 index 000000000..227aa99af --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 15 +VisualStudioVersion = 15.0.28307.1684 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2017", "lane_masks_bit_shift_vs2017.vcxproj", "{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.ActiveCfg = Debug|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.Build.0 = Debug|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.ActiveCfg = Release|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {F1E2D3C4-B5A6-4978-9012-3456789ABCDE} + EndGlobalSection +EndGlobal diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj new file mode 100644 index 000000000..a58eaee4b --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj @@ -0,0 +1,131 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 15.0 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE} + Win32Proj + lane_masks_bit_shift_vs2017 + $(LatestTargetPlatformVersion) + + + + + + + Application + true + HIP clang 7.1 + Unicode + + + Application + false + HIP clang 7.1 + Unicode + + + $(PlatformToolset.Replace(`HIP clang `, ``)) + + + $(PlatformToolset.Replace(`HIP nvcc `, ``)) + + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + + true + + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + cuda.lib;%(AdditionalDependencies) + + + + + Level2 + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + true + + + + + Level2 + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + UseLinkTimeCodeGeneration + cuda.lib;%(AdditionalDependencies) + + + + + + + + diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj.filters b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj.filters new file mode 100644 index 000000000..441040352 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj.filters @@ -0,0 +1,6 @@ + + + + + + diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.sln b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.sln new file mode 100644 index 000000000..35d743608 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 16 +VisualStudioVersion = 16.0.30717.126 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2019", "lane_masks_bit_shift_vs2019.vcxproj", "{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.ActiveCfg = Debug|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.Build.0 = Debug|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.ActiveCfg = Release|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {C2D3E4F5-A6B7-4890-C123-456789ABCDEF} + EndGlobalSection +EndGlobal diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj new file mode 100644 index 000000000..1a7e11a52 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj @@ -0,0 +1,131 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 16.0 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF} + Win32Proj + lane_masks_bit_shift_vs2019 + $(LatestTargetPlatformVersion) + + + + + + + Application + true + HIP clang 7.1 + Unicode + + + Application + false + HIP clang 7.1 + Unicode + + + $(PlatformToolset.Replace(`HIP clang `, ``)) + + + $(PlatformToolset.Replace(`HIP nvcc `, ``)) + + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + + true + + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + cuda.lib;%(AdditionalDependencies) + + + + + Level2 + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + true + + + + + Level2 + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + UseLinkTimeCodeGeneration + cuda.lib;%(AdditionalDependencies) + + + + + + + + diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj.filters b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj.filters new file mode 100644 index 000000000..e52b0bb62 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj.filters @@ -0,0 +1,6 @@ + + + + + + diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.sln b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.sln new file mode 100644 index 000000000..00896c8d5 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.sln @@ -0,0 +1,24 @@ +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio Version 17 +VisualStudioVersion = 17.0.31903.59 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2022", "lane_masks_bit_shift_vs2022.vcxproj", "{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|x64 = Debug|x64 + Release|x64 = Release|x64 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.ActiveCfg = Debug|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.Build.0 = Debug|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.ActiveCfg = Release|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.Build.0 = Release|x64 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection + GlobalSection(ExtensibilityGlobals) = postSolution + SolutionGuid = {D3E4F5A6-B7C8-4901-D234-56789ABCDEF0} + EndGlobalSection +EndGlobal diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj new file mode 100644 index 000000000..964d345f7 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj @@ -0,0 +1,131 @@ + + + + + Debug + x64 + + + Release + x64 + + + + 17.0 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0} + Win32Proj + lane_masks_bit_shift_vs2022 + $(LatestTargetPlatformVersion) + + + + + + + Application + true + HIP clang 7.1 + Unicode + + + Application + false + HIP clang 7.1 + Unicode + + + $(PlatformToolset.Replace(`HIP clang `, ``)) + + + $(PlatformToolset.Replace(`HIP nvcc `, ``)) + + + + + + + + + + + + + + + + + true + hip_$(ProjectName) + + + false + hip_$(ProjectName) + + + + true + + + + + Level2 + __clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + + + + + Level2 + __CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + cuda.lib;%(AdditionalDependencies) + + + + + Level2 + true + __clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + true + true + + + + + Level2 + true + __CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + %(AdditionalIncludeDirectories) + stdcpp17 + true + + + Console + UseLinkTimeCodeGeneration + cuda.lib;%(AdditionalDependencies) + + + + + + + + diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj.filters b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj.filters new file mode 100644 index 000000000..e52b0bb62 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj.filters @@ -0,0 +1,6 @@ + + + + + + diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip new file mode 100644 index 000000000..95ec02ee7 --- /dev/null +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip @@ -0,0 +1,258 @@ +// MIT License +// +// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +#include + +#include + +#include +#include +#include + +#define HIP_CHECK(expression) \ +{ \ + const hipError_t err = expression; \ + if (err != hipSuccess) \ + { \ + std::cout << "HIP Error: " << hipGetErrorString(err) \ + << " at line " << __LINE__ << std::endl; \ + std::exit(EXIT_FAILURE); \ + } \ +} + +// Problematic kernel: Uses 32-bit integer for lane masks, which fails on warp sizes > 32 +__global__ void initial_problematic(std::uint32_t* results) +{ + using WarpReduce = hipcub::WarpReduce; + __shared__ typename WarpReduce::TempStorage tempStorage; + + const unsigned int tid = threadIdx.x; + unsigned int val = tid % warpSize; + + // [sphinx-initial-start] + // Get the thread's position in the warp + unsigned int laneId = threadIdx.x % warpSize; + + // Use lane ID for bit-shift - PROBLEM: 1 is a 32-bit literal + // On AMD GPUs with warpSize=64, this can overflow when laneId >= 32 + std::uint32_t mask = (1 << laneId) - 1; + + // Shift 32 bit integer with val variable - PROBLEM: can overflow + std::uint32_t shift_result = (val < warpSize) ? (1 << val) : 0; + std::uint32_t warp_sum = WarpReduce(tempStorage).Sum(shift_result); + // [sphinx-initial-end] + + // Store result for the first thread in each warp + if (laneId == 0) + results[tid / warpSize] = warp_sum; +} + +// Improved kernel: Uses 64-bit integers explicitly +__global__ void improved_64bit(std::uint64_t* results) +{ + using WarpReduce = hipcub::WarpReduce; + __shared__ typename WarpReduce::TempStorage tempStorage; + + const unsigned int tid = threadIdx.x; + unsigned int val = tid % warpSize; + + // [sphinx-improved-start] + // Get the thread's position in the warp + unsigned int laneId = threadIdx.x % warpSize; + + // Use lane ID for bit-shift - FIXED: 1ull is a 64-bit literal + std::uint64_t mask = (1ull << laneId) - 1; + + // Shift 64 bit integer with val variable - FIXED: works for warpSize up to 64 + std::uint64_t shift_result = (val < warpSize) ? (1ull << val) : 0; + std::uint64_t warp_sum = WarpReduce(tempStorage).Sum(shift_result); + // [sphinx-improved-end] + + // Store result for the first thread in each warp + if (laneId == 0) + results[tid / warpSize] = warp_sum; +} + +// [sphinx-typedef-start] +// Portable lane mask type: adjusts based on GPU architecture +#ifdef __GFX9__ +using lane_mask_t = std::uint64_t; +#else +using lane_mask_t = std::uint32_t; +#endif +// [sphinx-typedef-end] + +// Best practice kernel: Uses portable lane_mask_t type +__global__ void best_portable(lane_mask_t* results) +{ + using WarpReduce = hipcub::WarpReduce; + __shared__ typename WarpReduce::TempStorage tempStorage; + + const unsigned int tid = threadIdx.x; + unsigned int val = tid % warpSize; + + // [sphinx-best-start] + // Get the thread's position in the warp + unsigned int laneId = threadIdx.x % warpSize; + + // Use lane ID for bit-shift - PORTABLE: lane_mask_t adjusts to warp size + lane_mask_t mask = (lane_mask_t{1} << laneId) - 1; + + // Shift 32 or 64 bit integer with val variable - PORTABLE + lane_mask_t shift_result = (val < warpSize) ? (lane_mask_t{1} << val) : 0; + lane_mask_t warp_sum = WarpReduce(tempStorage).Sum(shift_result); + // [sphinx-best-end] + + // Store result for the first thread in each warp + if (laneId == 0) + results[tid / warpSize] = warp_sum; +} + +int main() +{ + // Query device properties + int warpSizeHost; + HIP_CHECK(hipDeviceGetAttribute(&warpSizeHost, hipDeviceAttributeWarpSize, 0)); + + hipDeviceProp_t deviceProp; + HIP_CHECK(hipGetDeviceProperties(&deviceProp, 0)); + + std::cout << "=== Device Information ===" << std::endl; + std::cout << "Device name: " << deviceProp.name << std::endl; + std::cout << "Warp size: " << warpSizeHost << std::endl; + std::cout << std::endl; + + const int num_threads = 128; + const int num_warps = num_threads / warpSizeHost; + + // Test 1: Problematic kernel (only safe for warpSize <= 32) + if (warpSizeHost <= 32) + { + std::cout << "=== Test 1: Problematic 32-bit Implementation ===" << std::endl; + std::cout << "Running on warpSize=" << warpSizeHost + << " (safe for this architecture)" << std::endl; + + std::uint32_t* d_results_32; + HIP_CHECK(hipMalloc(&d_results_32, num_warps * sizeof(std::uint32_t))); + + initial_problematic<<<1, num_threads>>>(d_results_32); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + std::uint32_t h_results_32[num_warps]; + HIP_CHECK(hipMemcpy(h_results_32, d_results_32, + num_warps * sizeof(std::uint32_t), + hipMemcpyDeviceToHost)); + + std::cout << "Results: "; + for (int i = 0; i < num_warps; ++i) + { + std::cout << h_results_32[i] << " "; + } + std::cout << std::endl; + + HIP_CHECK(hipFree(d_results_32)); + } + else + { + std::cout << "=== Test 1: Skipped ===" << std::endl; + std::cout << "Problematic 32-bit kernel skipped (warpSize=" << warpSizeHost + << " would cause overflow)" << std::endl; + } + std::cout << std::endl; + + // Test 2: Improved 64-bit implementation + std::cout << "=== Test 2: Improved 64-bit Implementation ===" << std::endl; + std::uint64_t* d_results_64; + HIP_CHECK(hipMalloc(&d_results_64, num_warps * sizeof(std::uint64_t))); + + improved_64bit<<<1, num_threads>>>(d_results_64); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + std::uint64_t h_results_64[num_warps]; + HIP_CHECK(hipMemcpy(h_results_64, d_results_64, + num_warps * sizeof(std::uint64_t), + hipMemcpyDeviceToHost)); + + std::cout << "Results: "; + for (int i = 0; i < num_warps; ++i) + { + std::cout << h_results_64[i] << " "; + } + std::cout << std::endl << std::endl; + + HIP_CHECK(hipFree(d_results_64)); + + // Test 3: Best practice portable implementation + std::cout << "=== Test 3: Best Practice Portable Implementation ===" << std::endl; + std::cout << "Using lane_mask_t ("; + std::cout << (sizeof(lane_mask_t) == 8 ? "64-bit" : "32-bit"); + std::cout << " on this architecture)" << std::endl; + + lane_mask_t* d_results_portable; + HIP_CHECK(hipMalloc(&d_results_portable, num_warps * sizeof(lane_mask_t))); + + best_portable<<<1, num_threads>>>(d_results_portable); + HIP_CHECK(hipGetLastError()); + HIP_CHECK(hipDeviceSynchronize()); + + lane_mask_t h_results_portable[num_warps]; + HIP_CHECK(hipMemcpy(h_results_portable, d_results_portable, + num_warps * sizeof(lane_mask_t), + hipMemcpyDeviceToHost)); + + std::cout << "Results: "; + for (int i = 0; i < num_warps; ++i) + { + std::cout << h_results_portable[i] << " "; + } + std::cout << std::endl << std::endl; + + HIP_CHECK(hipFree(d_results_portable)); + + // Verify results match between 64-bit and portable versions + bool results_match = true; + for (int i = 0; i < num_warps; ++i) + { + if (h_results_64[i] != static_cast(h_results_portable[i])) + { + results_match = false; + break; + } + } + + std::cout << "=== Verification ===" << std::endl; + std::cout << "64-bit and portable results match: " + << (results_match ? "YES" : "NO") << std::endl; + + if (results_match) + { + std::cout << "\nSUCCESS: All kernels produced correct results!" << std::endl; + return EXIT_SUCCESS; + } + else + { + std::cout << "\nFAILURE: Results do not match!" << std::endl; + return EXIT_FAILURE; + } +} diff --git a/README.md b/README.md index 74f52badb..e376ac37a 100644 --- a/README.md +++ b/README.md @@ -183,6 +183,7 @@ The following options are available when building with CMake. - [warp_size_reduction](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/HIP-C++-Language-Extensions/warp_size_reduction): Shows how to perform a reduction while relying on the warp size as an early-folded constant. - [Porting-CUDA-code-to-HIP](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP) contains the examples from the [Porting NVIDIA CUDA code to HIP guide](https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_porting_guide.html) page. - [address_retrieval](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/address_retrieval): Shows how to obtain the address of a HIP runtime function. + - [lane_masks_bit_shift](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift): Demonstrates portable handling of lane masks and bit-shift operations across different warp sizes. - [device_code_feature_identification](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/device_code_feature_identification): Shows how to query the device's compute features in device code. - [host_code_feature_identification](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/host_code_feature_identification): Shows how to query the device's compute features in host code. - [identifying_compilation_target_platform](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/identifying_compilation_target_platform): Shows how to distinguish between AMD and NVIDIA target platforms in code. diff --git a/ROCm-Examples-Portable-VS2017.sln b/ROCm-Examples-Portable-VS2017.sln index c2b59852e..5cea96a81 100644 --- a/ROCm-Examples-Portable-VS2017.sln +++ b/ROCm-Examples-Portable-VS2017.sln @@ -158,6 +158,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ordinary_memory_allocation_ EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "HIP-C++-Language-Extensions", "HIP-C++-Language-Extensions", "{70A51D02-8ADF-4CFD-BE3E-6647E09AF45B}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP-Doc", "Porting-CUDA-code-to-HIP-Doc", "{A6B7C8D9-E0F1-4234-5678-9ABCDEF01234}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2017", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2017.vcxproj", "{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}" +EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP", "Porting-CUDA-code-to-HIP", "{BB989D47-FA5A-4B3E-AB7E-C18C4F62036B}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "calling_global_functions_vs2017", "HIP-Doc\Programming-Guide\HIP-C++-Language-Extensions\calling_global_functions\calling_global_functions_vs2017.vcxproj", "{67EF330D-B8D8-4A07-971E-80E7A0AB32D5}" @@ -655,6 +659,7 @@ Global {4FCDF29A-65AB-406E-A28A-40EA3D73FB18} = {70A51D02-8ADF-4CFD-BE3E-6647E09AF45B} {FAD05E9E-E29B-47BD-B136-4B39B583FCF6} = {70A51D02-8ADF-4CFD-BE3E-6647E09AF45B} {328E4455-5F63-4050-A03A-5B572858C6F7} = {70A51D02-8ADF-4CFD-BE3E-6647E09AF45B} + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE} = {BB989D47-FA5A-4B3E-AB7E-C18C4F62036B} {16A6CA98-768D-45D7-932E-04B42AD8A19F} = {BB989D47-FA5A-4B3E-AB7E-C18C4F62036B} {11967426-7BAC-4129-AD17-82C5DCE5AC9D} = {BB989D47-FA5A-4B3E-AB7E-C18C4F62036B} {9EF24773-61DD-448C-8CF3-1A7BE4F6D779} = {BB989D47-FA5A-4B3E-AB7E-C18C4F62036B} diff --git a/ROCm-Examples-Portable-VS2019.sln b/ROCm-Examples-Portable-VS2019.sln index 9bb13ca85..1ca7e6026 100644 --- a/ROCm-Examples-Portable-VS2019.sln +++ b/ROCm-Examples-Portable-VS2019.sln @@ -180,6 +180,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warp_size_reduction_vs2019" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "timer_vs2019", "HIP-Doc\Programming-Guide\HIP-C++-Language-Extensions\timer\timer_vs2019.vcxproj", "{33D89BE7-5A31-4325-86C1-FC7A9809C5F0}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP-Doc", "Porting-CUDA-code-to-HIP-Doc", "{F5A6B7C8-D9E0-4123-F456-789ABCDEF012}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2019", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2019.vcxproj", "{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}" +EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP", "Porting-CUDA-code-to-HIP", "{9AE38387-AAF7-491C-93D5-003422E121DF}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "identifying_compilation_target_platform_vs2019", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\identifying_compilation_target_platform\identifying_compilation_target_platform_vs2019.vcxproj", "{F0D0D5CC-04D3-45B2-BCCF-79D38CD31238}" @@ -658,6 +662,7 @@ Global {86492341-E8DA-4A05-8BE5-3B34029AD412} = {EBE02186-C3E3-426C-B230-CC88230E5683} {8C4D1B5F-CE89-4E53-979A-83BC50A218FE} = {EBE02186-C3E3-426C-B230-CC88230E5683} {33D89BE7-5A31-4325-86C1-FC7A9809C5F0} = {EBE02186-C3E3-426C-B230-CC88230E5683} + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF} = {9AE38387-AAF7-491C-93D5-003422E121DF} {9AE38387-AAF7-491C-93D5-003422E121DF} = {7CC0F577-AD85-4051-B589-5C837C2E155D} {F0D0D5CC-04D3-45B2-BCCF-79D38CD31238} = {9AE38387-AAF7-491C-93D5-003422E121DF} {C352D9CB-430F-4BE6-8A87-D73B9804E20C} = {9AE38387-AAF7-491C-93D5-003422E121DF} diff --git a/ROCm-Examples-Portable-VS2022.sln b/ROCm-Examples-Portable-VS2022.sln index bc771e78c..e819e5efd 100644 --- a/ROCm-Examples-Portable-VS2022.sln +++ b/ROCm-Examples-Portable-VS2022.sln @@ -180,6 +180,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warp_size_reduction_vs2022" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "timer_vs2022", "HIP-Doc\Programming-Guide\HIP-C++-Language-Extensions\timer\timer_vs2022.vcxproj", "{C1957D25-4652-01D7-463A-086B05C9B9C5}" EndProject +Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP-Doc", "Porting-CUDA-code-to-HIP-Doc", "{E4F5A6B7-C8D9-4012-E345-6789ABCDEF01}" +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2022", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2022.vcxproj", "{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}" +EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP", "Porting-CUDA-code-to-HIP", "{D913551B-498A-4177-A7D6-F2A22A90FC2C}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "identifying_compilation_target_platform_vs2022", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\identifying_compilation_target_platform\identifying_compilation_target_platform_vs2022.vcxproj", "{608D4B4B-6629-74E9-034A-D069EF2A68A6}" @@ -658,6 +662,7 @@ Global {8C91BCE4-7972-5257-EA2F-F3606BC54AAC} = {8C875693-29CC-43B9-8B41-107D26C155B4} {46AFE6A7-6314-6CCE-9FDE-D8D6F8E2E884} = {8C875693-29CC-43B9-8B41-107D26C155B4} {C1957D25-4652-01D7-463A-086B05C9B9C5} = {8C875693-29CC-43B9-8B41-107D26C155B4} + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0} = {D913551B-498A-4177-A7D6-F2A22A90FC2C} {D913551B-498A-4177-A7D6-F2A22A90FC2C} = {4596F2CF-5E1E-4EF7-8339-4934D6BF1F00} {608D4B4B-6629-74E9-034A-D069EF2A68A6} = {D913551B-498A-4177-A7D6-F2A22A90FC2C} {53F8C1D9-E167-F401-BC95-79A9AC3ECCC7} = {D913551B-498A-4177-A7D6-F2A22A90FC2C} diff --git a/ROCm-Examples-VS2017.sln b/ROCm-Examples-VS2017.sln index eeeb4cddf..5bdfd1647 100644 --- a/ROCm-Examples-VS2017.sln +++ b/ROCm-Examples-VS2017.sln @@ -376,6 +376,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "per_thread_default_stream_v EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pointer_memory_type_vs2017", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\pointer_memory_type\pointer_memory_type_vs2017.vcxproj", "{35E3ACD9-51BD-49DF-99A4-DB745ACB491B}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2017", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2017.vcxproj", "{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}" +EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "async_kernel_execution_vs2017", "HIP-Doc\Programming-Guide\Using-HIP-Runtime-API\Asynchronous-Concurrent-Execution\async_kernel_execution\async_kernel_execution_vs2017.vcxproj", "{97295C91-A455-4B78-89C9-BB6EA1791BBE}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "event_based_synchronization_vs2017", "HIP-Doc\Programming-Guide\Using-HIP-Runtime-API\Asynchronous-Concurrent-Execution\event_based_synchronization\event_based_synchronization_vs2017.vcxproj", "{54C05E77-0E5D-449E-B048-8166D2EE97E7}" @@ -1056,6 +1058,10 @@ Global {35E3ACD9-51BD-49DF-99A4-DB745ACB491B}.Debug|x64.Build.0 = Debug|x64 {35E3ACD9-51BD-49DF-99A4-DB745ACB491B}.Release|x64.ActiveCfg = Release|x64 {35E3ACD9-51BD-49DF-99A4-DB745ACB491B}.Release|x64.Build.0 = Release|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.ActiveCfg = Debug|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.Build.0 = Debug|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.ActiveCfg = Release|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.Build.0 = Release|x64 {97295C91-A455-4B78-89C9-BB6EA1791BBE}.Debug|x64.ActiveCfg = Debug|x64 {97295C91-A455-4B78-89C9-BB6EA1791BBE}.Debug|x64.Build.0 = Debug|x64 {97295C91-A455-4B78-89C9-BB6EA1791BBE}.Release|x64.ActiveCfg = Release|x64 diff --git a/ROCm-Examples-VS2019.sln b/ROCm-Examples-VS2019.sln index 867d9b207..eac95ed2d 100644 --- a/ROCm-Examples-VS2019.sln +++ b/ROCm-Examples-VS2019.sln @@ -430,6 +430,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "per_thread_default_stream_v EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pointer_memory_type_vs2019", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\pointer_memory_type\pointer_memory_type_vs2019.vcxproj", "{38AD6CC6-867E-44BA-B117-F02686076F9F}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2019", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2019.vcxproj", "{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}" +EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Programming-for-HIP-Runtime-Compiler", "Programming-for-HIP-Runtime-Compiler", "{D0056E51-E2CD-40B1-B542-02F90E2F0767}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "compilation_apis_vs2019", "HIP-Doc\Programming-Guide\Programming-for-HIP-Runtime-Compiler\compilation_apis\compilation_apis_vs2019.vcxproj", "{404200F8-EE3E-452B-A17F-B8CE8C142054}" @@ -1164,6 +1166,10 @@ Global {38AD6CC6-867E-44BA-B117-F02686076F9F}.Debug|x64.Build.0 = Debug|x64 {38AD6CC6-867E-44BA-B117-F02686076F9F}.Release|x64.ActiveCfg = Release|x64 {38AD6CC6-867E-44BA-B117-F02686076F9F}.Release|x64.Build.0 = Release|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.ActiveCfg = Debug|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.Build.0 = Debug|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.ActiveCfg = Release|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.Build.0 = Release|x64 {404200F8-EE3E-452B-A17F-B8CE8C142054}.Debug|x64.ActiveCfg = Debug|x64 {404200F8-EE3E-452B-A17F-B8CE8C142054}.Debug|x64.Build.0 = Debug|x64 {404200F8-EE3E-452B-A17F-B8CE8C142054}.Release|x64.ActiveCfg = Release|x64 diff --git a/ROCm-Examples-VS2022.sln b/ROCm-Examples-VS2022.sln index bb6b34148..7a43c9672 100644 --- a/ROCm-Examples-VS2022.sln +++ b/ROCm-Examples-VS2022.sln @@ -430,6 +430,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "per_thread_default_stream_v EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pointer_memory_type_vs2022", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\pointer_memory_type\pointer_memory_type_vs2022.vcxproj", "{7B0E56C2-C6D6-E245-7B21-A64767C87738}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2022", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2022.vcxproj", "{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}" +EndProject Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Programming-for-HIP-Runtime-Compiler", "Programming-for-HIP-Runtime-Compiler", "{AA794309-1F62-4964-A6BE-7E09BE0C1875}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "compilation_apis_vs2022", "HIP-Doc\Programming-Guide\Programming-for-HIP-Runtime-Compiler\compilation_apis\compilation_apis_vs2022.vcxproj", "{17AD5745-81C3-6574-B3A7-BD5E407836CC}" @@ -1164,6 +1166,10 @@ Global {7B0E56C2-C6D6-E245-7B21-A64767C87738}.Debug|x64.Build.0 = Debug|x64 {7B0E56C2-C6D6-E245-7B21-A64767C87738}.Release|x64.ActiveCfg = Release|x64 {7B0E56C2-C6D6-E245-7B21-A64767C87738}.Release|x64.Build.0 = Release|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.ActiveCfg = Debug|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.Build.0 = Debug|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.ActiveCfg = Release|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.Build.0 = Release|x64 {17AD5745-81C3-6574-B3A7-BD5E407836CC}.Debug|x64.ActiveCfg = Debug|x64 {17AD5745-81C3-6574-B3A7-BD5E407836CC}.Debug|x64.Build.0 = Debug|x64 {17AD5745-81C3-6574-B3A7-BD5E407836CC}.Release|x64.ActiveCfg = Release|x64 @@ -1455,6 +1461,7 @@ Global {919E29D3-0B7C-AFFF-8E2C-3CD179744131} = {C74FDF2D-7421-4680-9940-32FC63E96B96} {2F28FDEC-F73B-C8E7-2166-6F00ED3AF12C} = {C74FDF2D-7421-4680-9940-32FC63E96B96} {7B0E56C2-C6D6-E245-7B21-A64767C87738} = {C74FDF2D-7421-4680-9940-32FC63E96B96} + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0} = {C74FDF2D-7421-4680-9940-32FC63E96B96} {AA794309-1F62-4964-A6BE-7E09BE0C1875} = {0D14E46B-FEBB-4A92-95E3-C46C9FD2A8F4} {17AD5745-81C3-6574-B3A7-BD5E407836CC} = {AA794309-1F62-4964-A6BE-7E09BE0C1875} {9CA95495-B513-E12B-C332-BBEE9CF93DAE} = {AA794309-1F62-4964-A6BE-7E09BE0C1875} From 3b27770b6354640e95e7eab196bf61225ce5cbb7 Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Thu, 18 Dec 2025 15:18:53 +0100 Subject: [PATCH 02/11] Install hipcub Signed-off-by: Jan Stephan --- .github/workflows/build_hip_documentation.yml | 2 +- .github/workflows/build_hip_documentation_cuda.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_hip_documentation.yml b/.github/workflows/build_hip_documentation.yml index 549d3b478..4bf2cccdd 100644 --- a/.github/workflows/build_hip_documentation.yml +++ b/.github/workflows/build_hip_documentation.yml @@ -45,7 +45,7 @@ jobs: wget https://repo.radeon.com/amdgpu-install/${{ env.ROCM_VERSION }}/ubuntu/jammy/amdgpu-install_${{ env.AMDGPU_INSTALLER_VERSION }}_all.deb apt-get -y install ./amdgpu-install_${{ env.AMDGPU_INSTALLER_VERSION }}_all.deb && apt-get update -qq && - apt-get -y install rocm-dev rocm-llvm-dev hipfft-dev + apt-get -y install rocm-dev rocm-llvm-dev hipfft-dev hipcub-dev echo "/opt/rocm/bin" >> $GITHUB_PATH echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV diff --git a/.github/workflows/build_hip_documentation_cuda.yml b/.github/workflows/build_hip_documentation_cuda.yml index d4096d108..c39b1382f 100644 --- a/.github/workflows/build_hip_documentation_cuda.yml +++ b/.github/workflows/build_hip_documentation_cuda.yml @@ -46,7 +46,7 @@ jobs: wget https://repo.radeon.com/amdgpu-install/${{ env.ROCM_VERSION }}/ubuntu/jammy/amdgpu-install_${{ env.AMDGPU_INSTALLER_VERSION }}_all.deb apt-get -y install ./amdgpu-install_${{ env.AMDGPU_INSTALLER_VERSION }}_all.deb && apt-get update -qq && - apt-get install -y hip-dev hipify-clang hipfft-dev + apt-get install -y hip-dev hipify-clang hipfft-dev hipcub-dev - name: Configure and Build # The CMAKE_POLICY_VERSION_MINIMUM environment variable can be removed once the CMake updates from ROCm 7.0 are available From 9a896b848e761d0ed1e9c0b01a8469fef1f5170e Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Thu, 18 Dec 2025 15:46:29 +0100 Subject: [PATCH 03/11] Install CCCL Signed-off-by: Jan Stephan --- .github/workflows/build_hip_documentation_cuda.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_hip_documentation_cuda.yml b/.github/workflows/build_hip_documentation_cuda.yml index c39b1382f..2e09f14eb 100644 --- a/.github/workflows/build_hip_documentation_cuda.yml +++ b/.github/workflows/build_hip_documentation_cuda.yml @@ -38,6 +38,7 @@ jobs: apt-get update -qq && apt-get install -y build-essential g++ glslang-tools \ python3 python3-pip locales wget git libtiff-dev + cuda-cccl-12.6 python3 -m pip install --upgrade pip python3 -m pip install cmake From 25df1fcf4766710f08b31d75c99dae8ab46d9746 Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Thu, 18 Dec 2025 16:55:16 +0100 Subject: [PATCH 04/11] CMake: Find hipCUB Signed-off-by: Jan Stephan --- .../build_hip_documentation_cuda.yml | 1 - .../Porting-CUDA-code-to-HIP/CMakeLists.txt | 24 ++++++++++++++++++- .../lane_masks_bit_shift/CMakeLists.txt | 8 ++++--- 3 files changed, 28 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_hip_documentation_cuda.yml b/.github/workflows/build_hip_documentation_cuda.yml index 2e09f14eb..c39b1382f 100644 --- a/.github/workflows/build_hip_documentation_cuda.yml +++ b/.github/workflows/build_hip_documentation_cuda.yml @@ -38,7 +38,6 @@ jobs: apt-get update -qq && apt-get install -y build-essential g++ glslang-tools \ python3 python3-pip locales wget git libtiff-dev - cuda-cccl-12.6 python3 -m pip install --upgrade pip python3 -m pip install cmake diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt index 3369b5cb8..71df95225 100644 --- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt @@ -26,6 +26,22 @@ project(HIP-Doc-Programming-Guide-Porting-CUDA-code-to-HIP LANGUAGES CXX) file(RELATIVE_PATH folder_bin ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/${folder_bin}) +if(CMAKE_SYSTEM_NAME MATCHES "Windows") + set(ROCM_ROOT + "$ENV{HIP_PATH}" + CACHE PATH + "Root directory of the ROCm installation" + ) +else() + set(ROCM_ROOT + "/opt/rocm" + CACHE PATH + "Root directory of the ROCm installation" + ) +endif() + +list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") + add_subdirectory(address_retrieval) add_subdirectory(device_code_feature_identification) add_subdirectory(host_code_feature_identification) @@ -40,6 +56,12 @@ else() add_subdirectory(load_module_ex_cuda) endif() -add_subdirectory(lane_masks_bit_shift) +find_package(hipcub QUIET) +if(NOT hipcub_FOUND) + message(STATUS "hipCUB not found. Skipping lane masks bit-shift example.") +else() + add_subdirectory(lane_masks_bit_shift) +endif() + add_subdirectory(per_thread_default_stream) add_subdirectory(pointer_memory_type) diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt index 2da61faf4..bb1b05a44 100644 --- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt @@ -31,8 +31,8 @@ enable_language(${ROCM_EXAMPLES_GPU_LANGUAGE}) select_hip_platform() if(ROCM_EXAMPLES_GPU_LANGUAGE STREQUAL "CUDA") - # The example calls hipDeviceGetAttribute which internally uses the CUDA driver API. We must explicitly link the - # driver library. + # The example calls hipDeviceGetAttribute which internally uses the CUDA + # driver API. We must explicitly link the driver library. find_package(CUDAToolkit REQUIRED) endif() @@ -52,6 +52,8 @@ endif() list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") +find_package(hipcub REQUIRED) + add_executable(${example_name} main.hip) # Make example runnable using ctest add_test(NAME ${example_name} COMMAND ${example_name}) @@ -62,6 +64,6 @@ set_target_properties(${example_name} PROPERTIES $<$:CUDA $<$:HIP_EXTENSIONS OFF>) target_compile_features(${example_name} PRIVATE cuda_std_17 hip_std_17) target_include_directories(${example_name} PRIVATE $<$:${ROCM_ROOT}/include>) -target_link_libraries(${example_name} PRIVATE $<$:CUDA::cuda_driver>) +target_link_libraries(${example_name} PRIVATE $<$:CUDA::cuda_driver> hip::hipcub) install(TARGETS ${example_name}) From af979b6343d5628c6c534a97aef7e509c5c48246 Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Thu, 18 Dec 2025 17:03:28 +0100 Subject: [PATCH 05/11] Install CCCL Signed-off-by: Jan Stephan --- .github/workflows/build_portable_sln.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml index ee9774231..9bbeadc5e 100644 --- a/.github/workflows/build_portable_sln.yml +++ b/.github/workflows/build_portable_sln.yml @@ -71,7 +71,7 @@ jobs: Write-Host "Downloading installer https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe" Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe -OutFile CUDAToolkit.exe Write-Host "Installing CUDA Toolkit" - Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait + Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cccl_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait if (Test-Path -Path "${{ env.CUDA_PATH }}") { Write-Host "CUDA_PATH directory exists: ${{ env.CUDA_PATH }}" Write-Host "CUDA Toolkit installed successfully" From cfa93194af0ff714783c520ca36a00a81d8a5082 Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Thu, 18 Dec 2025 17:12:51 +0100 Subject: [PATCH 06/11] Guard against double inclusion Signed-off-by: Jan Stephan --- .../lane_masks_bit_shift/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt index bb1b05a44..822e6922f 100644 --- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt @@ -52,7 +52,9 @@ endif() list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}") -find_package(hipcub REQUIRED) +if(NOT TARGET hip::hipcub) + find_package(hipcub REQUIRED) +endif() add_executable(${example_name} main.hip) # Make example runnable using ctest From cba092ec5053353cc8e7fd717f7c03fb89a22daa Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Thu, 18 Dec 2025 17:19:54 +0100 Subject: [PATCH 07/11] Remove non-existing subpackage Signed-off-by: Jan Stephan --- .github/workflows/build_portable_sln.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml index 9bbeadc5e..ee9774231 100644 --- a/.github/workflows/build_portable_sln.yml +++ b/.github/workflows/build_portable_sln.yml @@ -71,7 +71,7 @@ jobs: Write-Host "Downloading installer https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe" Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe -OutFile CUDAToolkit.exe Write-Host "Installing CUDA Toolkit" - Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cccl_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait + Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait if (Test-Path -Path "${{ env.CUDA_PATH }}") { Write-Host "CUDA_PATH directory exists: ${{ env.CUDA_PATH }}" Write-Host "CUDA Toolkit installed successfully" From 32d1d1fad19de84f8dd2b2f74f770541109f54a0 Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Mon, 2 Feb 2026 18:32:51 +0100 Subject: [PATCH 08/11] Update to CUDA 12.9.1 Signed-off-by: Jan Stephan --- .github/workflows/build_portable_sln.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml index ee9774231..f8c5cf3d8 100644 --- a/.github/workflows/build_portable_sln.yml +++ b/.github/workflows/build_portable_sln.yml @@ -30,9 +30,9 @@ env: PLATFORM_TOOLSET_VERSION: 6.4 HIP_PATH: C:\Program Files\AMD\ROCm\6.4\ HIPSDK_INSTALLER_VERSION: 25.Q3 - CUDA_VERSION: 12.8.1 - CUDA_INSTALLER_VERSION: 572.61 - CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8 + CUDA_VERSION: 12.9.1 + CUDA_INSTALLER_VERSION: 576.57 + CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9 GLFW_DIR: C:\glfw-3.4.bin.WIN64\ VULKAN_SDK: C:\VulkanSDK\1.4.313.2\ @@ -71,7 +71,7 @@ jobs: Write-Host "Downloading installer https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe" Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe -OutFile CUDAToolkit.exe Write-Host "Installing CUDA Toolkit" - Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait + Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.9 cudart_12.9 nvcc_12.9 nvrtc_dev_12.9 cublas_dev_12.9 cufft_dev_12.9 curand_dev_12.9 cusolver_dev_12.9 cusparse_dev_12.9 visual_studio_integration_12.9','-n' -NoNewWindow -Wait if (Test-Path -Path "${{ env.CUDA_PATH }}") { Write-Host "CUDA_PATH directory exists: ${{ env.CUDA_PATH }}" Write-Host "CUDA Toolkit installed successfully" From c967b7c78b45faf6eda9241c8503fb1096870d4a Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Tue, 3 Feb 2026 12:51:13 +0100 Subject: [PATCH 09/11] Update to HIP SDK 7.1.1 Signed-off-by: Jan Stephan --- .github/workflows/build_applications_vs.yml | 6 +++--- .github/workflows/build_hip_basic_vs.yml | 6 +++--- .github/workflows/build_hip_documentation_vs.yml | 6 +++--- .github/workflows/build_libraries_vs.yml | 6 +++--- .github/workflows/build_portable_sln.yml | 6 +++--- .github/workflows/build_programming_guide_vs.yml | 6 +++--- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build_applications_vs.yml b/.github/workflows/build_applications_vs.yml index 442933b7f..7ed7fe1ff 100644 --- a/.github/workflows/build_applications_vs.yml +++ b/.github/workflows/build_applications_vs.yml @@ -19,9 +19,9 @@ concurrency: cancel-in-progress: true env: - PLATFORM_TOOLSET_VERSION: 6.4 - HIP_PATH: C:\Program Files\AMD\ROCm\6.4\ - HIPSDK_INSTALLER_VERSION: 25.Q3 + PLATFORM_TOOLSET_VERSION: 7.1 + HIP_PATH: C:\Program Files\AMD\ROCm\7.1\ + HIPSDK_INSTALLER_VERSION: 26.Q1 GLFW_DIR: C:\glfw-3.4.bin.WIN64\ jobs: diff --git a/.github/workflows/build_hip_basic_vs.yml b/.github/workflows/build_hip_basic_vs.yml index cdb007f43..b96b026af 100644 --- a/.github/workflows/build_hip_basic_vs.yml +++ b/.github/workflows/build_hip_basic_vs.yml @@ -19,9 +19,9 @@ concurrency: cancel-in-progress: true env: - PLATFORM_TOOLSET_VERSION: 6.4 - HIP_PATH: C:\Program Files\AMD\ROCm\6.4\ - HIPSDK_INSTALLER_VERSION: 25.Q3 + PLATFORM_TOOLSET_VERSION: 7.1 + HIP_PATH: C:\Program Files\AMD\ROCm\7.1\ + HIPSDK_INSTALLER_VERSION: 26.Q1 VULKAN_SDK: C:\VulkanSDK\1.4.313.2\ GLFW_DIR: C:\glfw-3.4.bin.WIN64\ diff --git a/.github/workflows/build_hip_documentation_vs.yml b/.github/workflows/build_hip_documentation_vs.yml index da8447b6a..516c5a14b 100644 --- a/.github/workflows/build_hip_documentation_vs.yml +++ b/.github/workflows/build_hip_documentation_vs.yml @@ -19,9 +19,9 @@ concurrency: cancel-in-progress: true env: - PLATFORM_TOOLSET_VERSION: 6.4 - HIP_PATH: C:\Program Files\AMD\ROCm\6.4\ - HIPSDK_INSTALLER_VERSION: 25.Q3 + PLATFORM_TOOLSET_VERSION: 7.1 + HIP_PATH: C:\Program Files\AMD\ROCm\7.1\ + HIPSDK_INSTALLER_VERSION: 26.Q1 VULKAN_SDK: C:\VulkanSDK\1.4.313.2\ GLFW_DIR: C:\glfw-3.4.bin.WIN64\ diff --git a/.github/workflows/build_libraries_vs.yml b/.github/workflows/build_libraries_vs.yml index 232d57456..c5cbfa8a2 100644 --- a/.github/workflows/build_libraries_vs.yml +++ b/.github/workflows/build_libraries_vs.yml @@ -19,9 +19,9 @@ concurrency: cancel-in-progress: true env: - PLATFORM_TOOLSET_VERSION: 6.4 - HIP_PATH: C:\Program Files\AMD\ROCm\6.4\ - HIPSDK_INSTALLER_VERSION: 25.Q3 + PLATFORM_TOOLSET_VERSION: 7.1 + HIP_PATH: C:\Program Files\AMD\ROCm\7.1\ + HIPSDK_INSTALLER_VERSION: 26.Q1 jobs: build: diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml index f8c5cf3d8..d1c29e7ef 100644 --- a/.github/workflows/build_portable_sln.yml +++ b/.github/workflows/build_portable_sln.yml @@ -27,9 +27,9 @@ concurrency: cancel-in-progress: true env: - PLATFORM_TOOLSET_VERSION: 6.4 - HIP_PATH: C:\Program Files\AMD\ROCm\6.4\ - HIPSDK_INSTALLER_VERSION: 25.Q3 + PLATFORM_TOOLSET_VERSION: 7.1 + HIP_PATH: C:\Program Files\AMD\ROCm\7.1\ + HIPSDK_INSTALLER_VERSION: 26.Q1 CUDA_VERSION: 12.9.1 CUDA_INSTALLER_VERSION: 576.57 CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9 diff --git a/.github/workflows/build_programming_guide_vs.yml b/.github/workflows/build_programming_guide_vs.yml index 79a7dfb19..890c313a6 100644 --- a/.github/workflows/build_programming_guide_vs.yml +++ b/.github/workflows/build_programming_guide_vs.yml @@ -19,9 +19,9 @@ concurrency: cancel-in-progress: true env: - PLATFORM_TOOLSET_VERSION: 6.4 - HIP_PATH: C:\Program Files\AMD\ROCm\6.4\ - HIPSDK_INSTALLER_VERSION: 25.Q3 + PLATFORM_TOOLSET_VERSION: 7.1 + HIP_PATH: C:\Program Files\AMD\ROCm\7.1\ + HIPSDK_INSTALLER_VERSION: 26.Q1 VULKAN_SDK: C:\VulkanSDK\1.4.313.2\ GLFW_DIR: C:\glfw-3.4.bin.WIN64\ From d9dbd627d24a4c282ea732025045c5a6237fbfee Mon Sep 17 00:00:00 2001 From: Jan Stephan Date: Tue, 3 Feb 2026 12:56:15 +0100 Subject: [PATCH 10/11] Fix installer URL Signed-off-by: Jan Stephan --- .github/workflows/build_applications_vs.yml | 4 ++-- .github/workflows/build_hip_basic_vs.yml | 4 ++-- .github/workflows/build_hip_documentation_vs.yml | 4 ++-- .github/workflows/build_libraries_vs.yml | 4 ++-- .github/workflows/build_portable_sln.yml | 4 ++-- .github/workflows/build_programming_guide_vs.yml | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/build_applications_vs.yml b/.github/workflows/build_applications_vs.yml index 7ed7fe1ff..ba8a52345 100644 --- a/.github/workflows/build_applications_vs.yml +++ b/.github/workflows/build_applications_vs.yml @@ -52,8 +52,8 @@ jobs: - name: Install HIP SDK run: | $ProgressPreference = 'SilentlyContinue' - Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe" - Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe + Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" + Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe Write-Host "Installing HIP SDK" Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait Write-Host "Installer log:" diff --git a/.github/workflows/build_hip_basic_vs.yml b/.github/workflows/build_hip_basic_vs.yml index b96b026af..2edc90e5e 100644 --- a/.github/workflows/build_hip_basic_vs.yml +++ b/.github/workflows/build_hip_basic_vs.yml @@ -57,8 +57,8 @@ jobs: - name: Install HIP SDK run: | $ProgressPreference = 'SilentlyContinue' - Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe" - Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe + Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" + Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe Write-Host "Installing HIP SDK" Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait Write-Host "Installer log:" diff --git a/.github/workflows/build_hip_documentation_vs.yml b/.github/workflows/build_hip_documentation_vs.yml index 516c5a14b..755dd5925 100644 --- a/.github/workflows/build_hip_documentation_vs.yml +++ b/.github/workflows/build_hip_documentation_vs.yml @@ -57,8 +57,8 @@ jobs: - name: Install HIP SDK run: | $ProgressPreference = 'SilentlyContinue' - Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe" - Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe + Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" + Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe Write-Host "Installing HIP SDK" Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait Write-Host "Installer log:" diff --git a/.github/workflows/build_libraries_vs.yml b/.github/workflows/build_libraries_vs.yml index c5cbfa8a2..477f457c2 100644 --- a/.github/workflows/build_libraries_vs.yml +++ b/.github/workflows/build_libraries_vs.yml @@ -46,8 +46,8 @@ jobs: - name: Install HIP SDK run: | $ProgressPreference = 'SilentlyContinue' - Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe" - Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe + Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" + Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe Write-Host "Installing HIP SDK" Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait Write-Host "Installer log:" diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml index d1c29e7ef..be781148a 100644 --- a/.github/workflows/build_portable_sln.yml +++ b/.github/workflows/build_portable_sln.yml @@ -83,8 +83,8 @@ jobs: - name: Install HIP SDK run: | $ProgressPreference = 'SilentlyContinue' - Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe" - Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe + Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" + Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe Write-Host "Installing HIP SDK" Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait Write-Host "Installer log:" diff --git a/.github/workflows/build_programming_guide_vs.yml b/.github/workflows/build_programming_guide_vs.yml index 890c313a6..334f51234 100644 --- a/.github/workflows/build_programming_guide_vs.yml +++ b/.github/workflows/build_programming_guide_vs.yml @@ -47,8 +47,8 @@ jobs: - name: Install HIP SDK run: | $ProgressPreference = 'SilentlyContinue' - Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe" - Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe + Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe" + Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe Write-Host "Installing HIP SDK" Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait Write-Host "Installer log:" From dd9045e01c056453f5c4134607bae79758d8292c Mon Sep 17 00:00:00 2001 From: Adel Johar Date: Tue, 3 Mar 2026 11:03:12 +0100 Subject: [PATCH 11/11] Rebase fixes --- .github/workflows/build_applications_cuda.yml | 2 +- .github/workflows/build_hip_basic_cuda.yml | 2 +- .../build_hip_documentation_cuda.yml | 2 +- .github/workflows/build_libraries_cuda.yml | 2 +- .../build_programming_guide_cuda.yml | 2 +- .../lane_masks_bit_shift/main.hip | 64 +++++++++---------- ROCm-Examples-Portable-VS2017.sln | 4 ++ ROCm-Examples-Portable-VS2019.sln | 4 ++ ROCm-Examples-Portable-VS2022.sln | 4 ++ 9 files changed, 49 insertions(+), 37 deletions(-) diff --git a/.github/workflows/build_applications_cuda.yml b/.github/workflows/build_applications_cuda.yml index 800132abb..85d98c6b3 100644 --- a/.github/workflows/build_applications_cuda.yml +++ b/.github/workflows/build_applications_cuda.yml @@ -26,7 +26,7 @@ jobs: name: "Build Applications CUDA" runs-on: ubuntu-latest container: - image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 defaults: run: shell: bash diff --git a/.github/workflows/build_hip_basic_cuda.yml b/.github/workflows/build_hip_basic_cuda.yml index 652ed6c6d..99ad84358 100644 --- a/.github/workflows/build_hip_basic_cuda.yml +++ b/.github/workflows/build_hip_basic_cuda.yml @@ -26,7 +26,7 @@ jobs: name: "Build HIP Basic CUDA" runs-on: ubuntu-latest container: - image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 defaults: run: shell: bash diff --git a/.github/workflows/build_hip_documentation_cuda.yml b/.github/workflows/build_hip_documentation_cuda.yml index c39b1382f..06879e4dc 100644 --- a/.github/workflows/build_hip_documentation_cuda.yml +++ b/.github/workflows/build_hip_documentation_cuda.yml @@ -26,7 +26,7 @@ jobs: name: "Build HIP Documentation CUDA" runs-on: ubuntu-latest container: - image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 defaults: run: shell: bash diff --git a/.github/workflows/build_libraries_cuda.yml b/.github/workflows/build_libraries_cuda.yml index ca012960e..6d43c3851 100644 --- a/.github/workflows/build_libraries_cuda.yml +++ b/.github/workflows/build_libraries_cuda.yml @@ -31,7 +31,7 @@ jobs: run: shell: bash container: - image: ghcr.io/rocm/rocm-examples-hip-libraries-cuda-ubuntu + image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 options: --user root steps: diff --git a/.github/workflows/build_programming_guide_cuda.yml b/.github/workflows/build_programming_guide_cuda.yml index 43cbd3e36..94e9ab60f 100644 --- a/.github/workflows/build_programming_guide_cuda.yml +++ b/.github/workflows/build_programming_guide_cuda.yml @@ -26,7 +26,7 @@ jobs: name: "Build ROCm Programming Guide CUDA" runs-on: ubuntu-latest container: - image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04 + image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 defaults: run: shell: bash diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip index 95ec02ee7..93d816bc4 100644 --- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip +++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip @@ -20,13 +20,13 @@ // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. -#include - #include +#include #include #include #include +#include #define HIP_CHECK(expression) \ { \ @@ -44,7 +44,7 @@ __global__ void initial_problematic(std::uint32_t* results) { using WarpReduce = hipcub::WarpReduce; __shared__ typename WarpReduce::TempStorage tempStorage; - + const unsigned int tid = threadIdx.x; unsigned int val = tid % warpSize; @@ -132,10 +132,10 @@ int main() // Query device properties int warpSizeHost; HIP_CHECK(hipDeviceGetAttribute(&warpSizeHost, hipDeviceAttributeWarpSize, 0)); - + hipDeviceProp_t deviceProp; HIP_CHECK(hipGetDeviceProperties(&deviceProp, 0)); - + std::cout << "=== Device Information ===" << std::endl; std::cout << "Device name: " << deviceProp.name << std::endl; std::cout << "Warp size: " << warpSizeHost << std::endl; @@ -148,34 +148,34 @@ int main() if (warpSizeHost <= 32) { std::cout << "=== Test 1: Problematic 32-bit Implementation ===" << std::endl; - std::cout << "Running on warpSize=" << warpSizeHost + std::cout << "Running on warpSize=" << warpSizeHost << " (safe for this architecture)" << std::endl; - + std::uint32_t* d_results_32; HIP_CHECK(hipMalloc(&d_results_32, num_warps * sizeof(std::uint32_t))); - + initial_problematic<<<1, num_threads>>>(d_results_32); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); - - std::uint32_t h_results_32[num_warps]; - HIP_CHECK(hipMemcpy(h_results_32, d_results_32, - num_warps * sizeof(std::uint32_t), + + std::vector h_results_32(num_warps); + HIP_CHECK(hipMemcpy(h_results_32.data(), d_results_32, + num_warps * sizeof(std::uint32_t), hipMemcpyDeviceToHost)); - + std::cout << "Results: "; for (int i = 0; i < num_warps; ++i) { std::cout << h_results_32[i] << " "; } std::cout << std::endl; - + HIP_CHECK(hipFree(d_results_32)); } else { std::cout << "=== Test 1: Skipped ===" << std::endl; - std::cout << "Problematic 32-bit kernel skipped (warpSize=" << warpSizeHost + std::cout << "Problematic 32-bit kernel skipped (warpSize=" << warpSizeHost << " would cause overflow)" << std::endl; } std::cout << std::endl; @@ -184,23 +184,23 @@ int main() std::cout << "=== Test 2: Improved 64-bit Implementation ===" << std::endl; std::uint64_t* d_results_64; HIP_CHECK(hipMalloc(&d_results_64, num_warps * sizeof(std::uint64_t))); - + improved_64bit<<<1, num_threads>>>(d_results_64); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); - - std::uint64_t h_results_64[num_warps]; - HIP_CHECK(hipMemcpy(h_results_64, d_results_64, - num_warps * sizeof(std::uint64_t), + + std::vector h_results_64(num_warps); + HIP_CHECK(hipMemcpy(h_results_64.data(), d_results_64, + num_warps * sizeof(std::uint64_t), hipMemcpyDeviceToHost)); - + std::cout << "Results: "; for (int i = 0; i < num_warps; ++i) { std::cout << h_results_64[i] << " "; } std::cout << std::endl << std::endl; - + HIP_CHECK(hipFree(d_results_64)); // Test 3: Best practice portable implementation @@ -208,26 +208,26 @@ int main() std::cout << "Using lane_mask_t ("; std::cout << (sizeof(lane_mask_t) == 8 ? "64-bit" : "32-bit"); std::cout << " on this architecture)" << std::endl; - + lane_mask_t* d_results_portable; HIP_CHECK(hipMalloc(&d_results_portable, num_warps * sizeof(lane_mask_t))); - + best_portable<<<1, num_threads>>>(d_results_portable); HIP_CHECK(hipGetLastError()); HIP_CHECK(hipDeviceSynchronize()); - - lane_mask_t h_results_portable[num_warps]; - HIP_CHECK(hipMemcpy(h_results_portable, d_results_portable, - num_warps * sizeof(lane_mask_t), + + std::vector h_results_portable(num_warps); + HIP_CHECK(hipMemcpy(h_results_portable.data(), d_results_portable, + num_warps * sizeof(lane_mask_t), hipMemcpyDeviceToHost)); - + std::cout << "Results: "; for (int i = 0; i < num_warps; ++i) { std::cout << h_results_portable[i] << " "; } std::cout << std::endl << std::endl; - + HIP_CHECK(hipFree(d_results_portable)); // Verify results match between 64-bit and portable versions @@ -240,9 +240,9 @@ int main() break; } } - + std::cout << "=== Verification ===" << std::endl; - std::cout << "64-bit and portable results match: " + std::cout << "64-bit and portable results match: " << (results_match ? "YES" : "NO") << std::endl; if (results_match) diff --git a/ROCm-Examples-Portable-VS2017.sln b/ROCm-Examples-Portable-VS2017.sln index 5cea96a81..e798967ef 100644 --- a/ROCm-Examples-Portable-VS2017.sln +++ b/ROCm-Examples-Portable-VS2017.sln @@ -572,6 +572,10 @@ Global {2F455A27-44B0-4962-AA36-79F7053D320E}.Debug|x64.Build.0 = Debug|x64 {2F455A27-44B0-4962-AA36-79F7053D320E}.Release|x64.ActiveCfg = Release|x64 {2F455A27-44B0-4962-AA36-79F7053D320E}.Release|x64.Build.0 = Release|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.ActiveCfg = Debug|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.Build.0 = Debug|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.ActiveCfg = Release|x64 + {A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/ROCm-Examples-Portable-VS2019.sln b/ROCm-Examples-Portable-VS2019.sln index 1ca7e6026..dd2a24531 100644 --- a/ROCm-Examples-Portable-VS2019.sln +++ b/ROCm-Examples-Portable-VS2019.sln @@ -572,6 +572,10 @@ Global {D9432180-B518-41B6-8528-9B376AC64E38}.Debug|x64.Build.0 = Debug|x64 {D9432180-B518-41B6-8528-9B376AC64E38}.Release|x64.ActiveCfg = Release|x64 {D9432180-B518-41B6-8528-9B376AC64E38}.Release|x64.Build.0 = Release|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.ActiveCfg = Debug|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.Build.0 = Debug|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.ActiveCfg = Release|x64 + {B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/ROCm-Examples-Portable-VS2022.sln b/ROCm-Examples-Portable-VS2022.sln index e819e5efd..e0009ccaa 100644 --- a/ROCm-Examples-Portable-VS2022.sln +++ b/ROCm-Examples-Portable-VS2022.sln @@ -572,6 +572,10 @@ Global {B7E76F08-1DA6-4520-86D7-1986CF809179}.Debug|x64.Build.0 = Debug|x64 {B7E76F08-1DA6-4520-86D7-1986CF809179}.Release|x64.ActiveCfg = Release|x64 {B7E76F08-1DA6-4520-86D7-1986CF809179}.Release|x64.Build.0 = Release|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.ActiveCfg = Debug|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.Build.0 = Debug|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.ActiveCfg = Release|x64 + {C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.Build.0 = Release|x64 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE