From 9553890e486c343d853702825e8bcdd51b409060 Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Thu, 18 Dec 2025 14:32:35 +0100
Subject: [PATCH 01/11] Add lane masks bit-shift example

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .../Porting-CUDA-code-to-HIP/CMakeLists.txt   |   1 +
 .../Porting-CUDA-code-to-HIP/Makefile         |   1 +
 .../lane_masks_bit_shift/.gitignore           |   1 +
 .../lane_masks_bit_shift/CMakeLists.txt       |  67 +++++
 .../lane_masks_bit_shift/Makefile             |  59 ++++
 .../lane_masks_bit_shift/README.md            | 210 ++++++++++++++
 .../lane_masks_bit_shift_vs2017.sln           |  24 ++
 .../lane_masks_bit_shift_vs2017.vcxproj       | 131 +++++++++
 ...ane_masks_bit_shift_vs2017.vcxproj.filters |   6 +
 .../lane_masks_bit_shift_vs2019.sln           |  24 ++
 .../lane_masks_bit_shift_vs2019.vcxproj       | 131 +++++++++
 ...ane_masks_bit_shift_vs2019.vcxproj.filters |   6 +
 .../lane_masks_bit_shift_vs2022.sln           |  24 ++
 .../lane_masks_bit_shift_vs2022.vcxproj       | 131 +++++++++
 ...ane_masks_bit_shift_vs2022.vcxproj.filters |   6 +
 .../lane_masks_bit_shift/main.hip             | 258 ++++++++++++++++++
 README.md                                     |   1 +
 ROCm-Examples-Portable-VS2017.sln             |   5 +
 ROCm-Examples-Portable-VS2019.sln             |   5 +
 ROCm-Examples-Portable-VS2022.sln             |   5 +
 ROCm-Examples-VS2017.sln                      |   6 +
 ROCm-Examples-VS2019.sln                      |   6 +
 ROCm-Examples-VS2022.sln                      |   7 +
 23 files changed, 1115 insertions(+)
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/.gitignore
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/Makefile
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/README.md
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.sln
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj.filters
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.sln
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj.filters
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.sln
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj.filters
 create mode 100644 HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip

diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt
index 10f1a3c46..3369b5cb8 100644
--- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt
@@ -40,5 +40,6 @@ else()
     add_subdirectory(load_module_ex_cuda)
 endif()
 
+add_subdirectory(lane_masks_bit_shift)
 add_subdirectory(per_thread_default_stream)
 add_subdirectory(pointer_memory_type)
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/Makefile b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/Makefile
index 50c5338ef..773a15d9b 100644
--- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/Makefile
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/Makefile
@@ -26,6 +26,7 @@ EXAMPLES := \
   host_code_feature_identification \
   identifying_compilation_target_platform \
   identifying_host_device_compilation_pass \
+  lane_masks_bit_shift \
   load_module \
   load_module_ex \
   per_thread_default_stream \
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/.gitignore b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/.gitignore
new file mode 100644
index 000000000..b71f55e74
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/.gitignore
@@ -0,0 +1 @@
+hip_lane_masks_bit_shift
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
new file mode 100644
index 000000000..2da61faf4
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
@@ -0,0 +1,67 @@
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+set(example_name hip_lane_masks_bit_shift)
+
+cmake_minimum_required(VERSION 3.21 FATAL_ERROR)
+project(${example_name} LANGUAGES CXX)
+
+include("${CMAKE_CURRENT_LIST_DIR}/../../../../Common/HipPlatform.cmake")
+select_gpu_language()
+enable_language(${ROCM_EXAMPLES_GPU_LANGUAGE})
+select_hip_platform()
+
+if(ROCM_EXAMPLES_GPU_LANGUAGE STREQUAL "CUDA")
+    # The example calls hipDeviceGetAttribute which internally uses the CUDA driver API. We must explicitly link the
+    # driver library.
+    find_package(CUDAToolkit REQUIRED)
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+    set(ROCM_ROOT
+        "$ENV{HIP_PATH}"
+        CACHE PATH
+        "Root directory of the ROCm installation"
+    )
+else()
+    set(ROCM_ROOT
+        "/opt/rocm"
+        CACHE PATH
+        "Root directory of the ROCm installation"
+    )
+endif()
+
+list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+
+add_executable(${example_name} main.hip)
+# Make example runnable using ctest
+add_test(NAME ${example_name} COMMAND ${example_name})
+
+set_source_files_properties(main.hip PROPERTIES LANGUAGE ${ROCM_EXAMPLES_GPU_LANGUAGE})
+
+set_target_properties(${example_name} PROPERTIES $<$<COMPILE_LANGUAGE:CUDA>:CUDA_EXTENSIONS OFF>
+                                                 $<$<COMPILE_LANGUAGE:HIP>:HIP_EXTENSIONS OFF>)
+target_compile_features(${example_name} PRIVATE cuda_std_17 hip_std_17)
+target_include_directories(${example_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${ROCM_ROOT}/include>)
+target_link_libraries(${example_name} PRIVATE $<$<LINK_LANGUAGE:CUDA>:CUDA::cuda_driver>)
+
+install(TARGETS ${example_name})
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/Makefile b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/Makefile
new file mode 100644
index 000000000..d5ce838e0
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/Makefile
@@ -0,0 +1,59 @@
+# MIT License
+#
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+EXAMPLE := hip_lane_masks_bit_shift
+GPU_RUNTIME ?= HIP
+
+# HIP variables
+ROCM_INSTALL_DIR ?= /opt/rocm
+HIP_INCLUDE_DIR  := $(ROCM_INSTALL_DIR)/include
+
+HIPCXX ?= $(ROCM_INSTALL_DIR)/bin/hipcc
+
+# Common variables and flags
+CXX_STD   := c++17
+ICXXFLAGS := -std=$(CXX_STD)
+ICPPFLAGS :=
+ILDFLAGS  :=
+ILDLIBS   :=
+
+ifeq ($(GPU_RUNTIME), CUDA)
+  ICXXFLAGS += -x cu
+  ICPPFLAGS += -isystem $(HIP_INCLUDE_DIR)
+else ifeq ($(GPU_RUNTIME), HIP)
+  CXXFLAGS ?= -Wall -Wextra
+else
+  $(error GPU_RUNTIME is set to "$(GPU_RUNTIME)". GPU_RUNTIME must be either CUDA or HIP)
+endif
+
+ICXXFLAGS += $(CXXFLAGS)
+ICPPFLAGS += $(CPPFLAGS)
+ILDFLAGS  += $(LDFLAGS)
+ILDLIBS   += $(LDLIBS)
+
+$(EXAMPLE): main.hip
+	$(HIPCXX) $(ICXXFLAGS) $(ICPPFLAGS) $(ILDFLAGS) -o $@ $< $(ILDLIBS)
+
+clean:
+	$(RM) $(EXAMPLE)
+
+.PHONY: clean
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/README.md b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/README.md
new file mode 100644
index 000000000..5a1f369b7
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/README.md
@@ -0,0 +1,210 @@
+# HIP-Doc Lane Masks Bit-Shift Example
+
+## Description
+
+This example demonstrates how to write portable HIP code that correctly handles
+lane masks and bit-shift operations across different GPU architectures with
+varying warp sizes. It highlights a common portability issue when porting CUDA
+code to HIP and provides best-practice solutions.
+
+For more information on this topic, please refer to the
+[HIP Porting Guide - Lane masks bit-shift section](https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_porting_guide.html#lane-masks-bit-shift).
+
+### Background
+
+A thread in a warp is also called a lane, and a lane mask is a bitmask where
+each bit corresponds to a thread in a warp. Lane masks are commonly used in
+warp-level operations for tasks like warp reduction, ballot operations, and
+parallel algorithms.
+
+The critical portability issue arises from the fact that:
+
+- **AMD RDNA architectures** and **NVIDIA GPUs** have a warp size of 32
+- **AMD GCN/CDNA architectures** have a warp size of 64
+
+When using bit-shift operations to create lane masks, using 32-bit integer
+literals (like `1 << laneId`) will overflow on AMD GPUs with a warp size of 64
+and `laneId >= 32`, leading to incorrect results.
+
+### Application Flow
+
+The example demonstrates three approaches to handling lane masks:
+
+1. **Problematic Implementation** (32-bit, non-portable)
+   - Uses 32-bit integer literals for bit-shift operations
+   - Only safe for warp sizes ≤ 32
+   - Demonstrates the problem that needs to be avoided
+
+2. **Improved Implementation** (64-bit, explicit)
+   - Uses 64-bit integer literals (`1ull`) for bit-shift operations
+   - Works correctly for all warp sizes
+   - Explicit but not architecture-specific
+
+3. **Best Practice Implementation** (portable)
+   - Uses a portable `lane_mask_t` typedef that adapts to the architecture
+   - Provides architecture-specific optimizations
+   - Recommended approach for production code
+
+For each approach, the example:
+
+- Queries the device's warp size
+- Executes kernels performing warp reduction with bit-shift operations
+- Stores and displays the results
+- Verifies correctness by comparing results
+
+## Key APIs and Concepts
+
+- `warpSize` - Built-in constant that represents the warp size in device code
+- `hipDeviceGetAttribute` - Queries the device's warp size on the host side
+- **Lane masks** - Bitmasks representing active threads in a warp
+- **Bit-shift operations** - Creating masks and selecting threads using bit
+  shifting
+- **Architecture-specific typedefs** - Using conditional compilation for
+  portability
+- `hipcub::WarpReduce` - Warp-level reduction primitive from the hipCUB library
+
+### Problem Demonstration
+
+```cpp
+// PROBLEMATIC - Only works on warp size ≤ 32
+unsigned int laneId = threadIdx.x % warpSize;
+std::uint32_t mask = (1 << laneId) - 1;  // Overflow when laneId >= 32!
+```
+
+### Solution 1: Explicit 64-bit
+
+```cpp
+// IMPROVED - Works on all current GPUs
+unsigned int laneId = threadIdx.x % warpSize;
+std::uint64_t mask = (1ull << laneId) - 1;  // Uses 64-bit literal
+```
+
+### Solution 2: Portable Typedef (Recommended)
+
+```cpp
+// BEST PRACTICE - Architecture-specific optimization
+#if defined(__GFX8__) || defined(__GFX9__)
+typedef std::uint64_t lane_mask_t;
+#else
+typedef std::uint32_t lane_mask_t;
+#endif
+
+unsigned int laneId = threadIdx.x % warpSize;
+lane_mask_t mask = (lane_mask_t{1} << laneId) - 1;  // Portable!
+```
+
+## Building
+
+### Linux
+
+#### CMake
+
+```bash
+mkdir build && cd build
+cmake ..  # or cmake -D GPU_RUNTIME=CUDA .. for CUDA
+make
+```
+
+#### Make
+
+```bash
+make  # or make GPU_RUNTIME=CUDA for CUDA
+```
+
+### Windows
+
+#### Visual Studio
+
+Open one of the Visual Studio solution files and build the project:
+
+- `lane_masks_bit_shift_vs2017.sln` for Visual Studio 2017
+- `lane_masks_bit_shift_vs2019.sln` for Visual Studio 2019
+- `lane_masks_bit_shift_vs2022.sln` for Visual Studio 2022
+
+#### CMake
+
+```shell
+cmake -G Ninja -S . -B build
+cmake --build build
+```
+
+## Running
+
+After building, run the executable:
+
+```shell
+./hip_lane_masks_bit_shift  # Linux
+hip_lane_masks_bit_shift.exe  # Windows
+```
+
+### Expected Output
+
+On a GPU with warp size 64:
+
+```plaintext
+=== Device Information ===
+Device name: AMD Instinct GPU
+Warp size: 64
+
+=== Test 1: Skipped ===
+Problematic 32-bit kernel skipped (warpSize=64 would cause overflow)
+
+=== Test 2: Improved 64-bit Implementation ===
+Results: 9223372036854775808 9223372036854775808
+
+=== Test 3: Best Practice Portable Implementation ===
+Using lane_mask_t (64-bit on this architecture)
+Results: 9223372036854775808 9223372036854775808
+
+=== Verification ===
+64-bit and portable results match: YES
+
+SUCCESS: All kernels produced correct results!
+```
+
+On a GPU with warp size 32:
+
+```plaintext
+=== Device Information ===
+Device name: AMD Radeon GPU
+Warp size: 32
+
+=== Test 1: Problematic 32-bit Implementation ===
+Running on warpSize=32 (safe for this architecture)
+Results: 2147483648 2147483648 2147483648 2147483648
+
+=== Test 2: Improved 64-bit Implementation ===
+Results: 2147483648 2147483648 2147483648 2147483648
+
+=== Test 3: Best Practice Portable Implementation ===
+Using lane_mask_t (32-bit on this architecture)
+Results: 2147483648 2147483648 2147483648 2147483648
+
+=== Verification ===
+64-bit and portable results match: YES
+
+SUCCESS: All kernels produced correct results!
+```
+
+## Demonstrated API Calls
+
+### HIP Runtime
+
+#### Device Symbols
+
+- `threadIdx` - Thread index within a block
+- `warpSize` - Built-in constant for warp size in device code
+
+#### Host Symbols
+
+- `hipDeviceGetAttribute` - Query device attributes
+- `hipGetDeviceProperties` - Get device properties
+- `hipMalloc` - Allocate device memory
+- `hipMemcpy` - Copy memory between host and device
+- `hipFree` - Free device memory
+- `hipDeviceSynchronize` - Synchronize device execution
+- `hipGetLastError` - Get last error from runtime
+
+### hipCUB
+
+- `hipcub::WarpReduce` - Warp-level reduction operations
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.sln b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.sln
new file mode 100644
index 000000000..227aa99af
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.sln
@@ -0,0 +1,24 @@
+﻿Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 15
+VisualStudioVersion = 15.0.28307.1684
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2017", "lane_masks_bit_shift_vs2017.vcxproj", "{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.ActiveCfg = Debug|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.Build.0 = Debug|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.ActiveCfg = Release|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {F1E2D3C4-B5A6-4978-9012-3456789ABCDE}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj
new file mode 100644
index 000000000..a58eaee4b
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>15.0</VCProjectVersion>
+    <ProjectGuid>{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>lane_masks_bit_shift_vs2017</RootNamespace>
+    <WindowsTargetPlatformVersion>$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP clang 7.1</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP clang 7.1</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(PlatformToolset.Contains(`HIP clang `))'">
+    <HIPVersion>$(PlatformToolset.Replace(`HIP clang `, ``))</HIPVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(PlatformToolset.Contains(`HIP nvcc `))'">
+    <HIPVersion>$(PlatformToolset.Replace(`HIP nvcc `, ``))</HIPVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(PlatformToolset)'=='HIP clang $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP clang $(HIPVersion)\AMD.HIP.Clang.Common.props" />
+    <Import Condition="'$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP nvcc $(HIPVersion)\AMD.HIP.Nvcc.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64' and '$(PlatformToolset)'=='HIP clang $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <PreprocessorDefinitions>__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64' and '$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <PreprocessorDefinitions>__CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>cuda.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64' and '$(PlatformToolset)'=='HIP clang $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PreprocessorDefinitions>__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64' and '$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <WholeProgramOptimization>true</WholeProgramOptimization>
+      <PreprocessorDefinitions>__CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+      <AdditionalDependencies>cuda.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(PlatformToolset)'=='HIP clang $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP clang $(HIPVersion)\AMD.HIP.Clang.Common.targets" />
+    <Import Condition="'$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP nvcc $(HIPVersion)\AMD.HIP.Nvcc.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj.filters b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj.filters
new file mode 100644
index 000000000..441040352
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2017.vcxproj.filters
@@ -0,0 +1,6 @@
+﻿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+</Project>
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.sln b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.sln
new file mode 100644
index 000000000..35d743608
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.sln
@@ -0,0 +1,24 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 16
+VisualStudioVersion = 16.0.30717.126
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2019", "lane_masks_bit_shift_vs2019.vcxproj", "{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.ActiveCfg = Debug|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.Build.0 = Debug|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.ActiveCfg = Release|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {C2D3E4F5-A6B7-4890-C123-456789ABCDEF}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj
new file mode 100644
index 000000000..1a7e11a52
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>16.0</VCProjectVersion>
+    <ProjectGuid>{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>lane_masks_bit_shift_vs2019</RootNamespace>
+    <WindowsTargetPlatformVersion>$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP clang 7.1</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP clang 7.1</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(PlatformToolset.Contains(`HIP clang `))'">
+    <HIPVersion>$(PlatformToolset.Replace(`HIP clang `, ``))</HIPVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(PlatformToolset.Contains(`HIP nvcc `))'">
+    <HIPVersion>$(PlatformToolset.Replace(`HIP nvcc `, ``))</HIPVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(PlatformToolset)'=='HIP clang $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP clang $(HIPVersion)\AMD.HIP.Clang.Common.props" />
+    <Import Condition="'$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP nvcc $(HIPVersion)\AMD.HIP.Nvcc.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64' and '$(PlatformToolset)'=='HIP clang $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <PreprocessorDefinitions>__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64' and '$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <PreprocessorDefinitions>__CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>cuda.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64' and '$(PlatformToolset)'=='HIP clang $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PreprocessorDefinitions>__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64' and '$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <WholeProgramOptimization>true</WholeProgramOptimization>
+      <PreprocessorDefinitions>__CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+      <AdditionalDependencies>cuda.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(PlatformToolset)'=='HIP clang $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP clang $(HIPVersion)\AMD.HIP.Clang.Common.targets" />
+    <Import Condition="'$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP nvcc $(HIPVersion)\AMD.HIP.Nvcc.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj.filters b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj.filters
new file mode 100644
index 000000000..e52b0bb62
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2019.vcxproj.filters
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+</Project>
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.sln b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.sln
new file mode 100644
index 000000000..00896c8d5
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.sln
@@ -0,0 +1,24 @@
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.0.31903.59
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2022", "lane_masks_bit_shift_vs2022.vcxproj", "{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.ActiveCfg = Debug|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.Build.0 = Debug|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.ActiveCfg = Release|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {D3E4F5A6-B7C8-4901-D234-56789ABCDEF0}
+	EndGlobalSection
+EndGlobal
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj
new file mode 100644
index 000000000..964d345f7
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj
@@ -0,0 +1,131 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>17.0</VCProjectVersion>
+    <ProjectGuid>{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}</ProjectGuid>
+    <Keyword>Win32Proj</Keyword>
+    <RootNamespace>lane_masks_bit_shift_vs2022</RootNamespace>
+    <WindowsTargetPlatformVersion>$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>HIP clang 7.1</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>HIP clang 7.1</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(PlatformToolset.Contains(`HIP clang `))'">
+    <HIPVersion>$(PlatformToolset.Replace(`HIP clang `, ``))</HIPVersion>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(PlatformToolset.Contains(`HIP nvcc `))'">
+    <HIPVersion>$(PlatformToolset.Replace(`HIP nvcc `, ``))</HIPVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+    <Import Condition="'$(PlatformToolset)'=='HIP clang $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP clang $(HIPVersion)\AMD.HIP.Clang.Common.props" />
+    <Import Condition="'$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP nvcc $(HIPVersion)\AMD.HIP.Nvcc.Common.props" />
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <LinkIncremental>true</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <LinkIncremental>false</LinkIncremental>
+    <TargetName>hip_$(ProjectName)</TargetName>
+  </PropertyGroup>
+  <ItemDefinitionGroup>
+    <ClCompile>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+    </ClCompile>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64' and '$(PlatformToolset)'=='HIP clang $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <PreprocessorDefinitions>__clang__;__HIP__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64' and '$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <PreprocessorDefinitions>__CUDACC__;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>cuda.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64' and '$(PlatformToolset)'=='HIP clang $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <PreprocessorDefinitions>__clang__;__HIP__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64' and '$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'">
+    <ClCompile>
+      <WarningLevel>Level2</WarningLevel>
+      <WholeProgramOptimization>true</WholeProgramOptimization>
+      <PreprocessorDefinitions>__CUDACC__;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <AdditionalIncludeDirectories>%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
+      <LanguageStandard>stdcpp17</LanguageStandard>
+      <RuntimeTypeInfo>true</RuntimeTypeInfo>
+    </ClCompile>
+    <Link>
+      <SubSystem>Console</SubSystem>
+      <LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
+      <AdditionalDependencies>cuda.lib;%(AdditionalDependencies)</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Condition="'$(PlatformToolset)'=='HIP clang $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP clang $(HIPVersion)\AMD.HIP.Clang.Common.targets" />
+    <Import Condition="'$(PlatformToolset)'=='HIP nvcc $(HIPVersion)'" Project="$(VCTargetsPath)\Platforms\$(Platform)\PlatformToolsets\HIP nvcc $(HIPVersion)\AMD.HIP.Nvcc.Common.targets" />
+  </ImportGroup>
+</Project>
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj.filters b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj.filters
new file mode 100644
index 000000000..e52b0bb62
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/lane_masks_bit_shift_vs2022.vcxproj.filters
@@ -0,0 +1,6 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="main.hip" />
+  </ItemGroup>
+</Project>
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip
new file mode 100644
index 000000000..95ec02ee7
--- /dev/null
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip
@@ -0,0 +1,258 @@
+// MIT License
+//
+// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in all
+// copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+
+#include <hipcub/hipcub.hpp>
+
+#include <hip/hip_runtime.h>
+
+#include <cstdint>
+#include <cstdlib>
+#include <iostream>
+
+#define HIP_CHECK(expression)                                \
+{                                                            \
+    const hipError_t err = expression;                       \
+    if (err != hipSuccess)                                   \
+    {                                                        \
+        std::cout << "HIP Error: " << hipGetErrorString(err) \
+              << " at line " << __LINE__ << std::endl;       \
+        std::exit(EXIT_FAILURE);                             \
+    }                                                        \
+}
+
+// Problematic kernel: Uses 32-bit integer for lane masks, which fails on warp sizes > 32
+__global__ void initial_problematic(std::uint32_t* results)
+{
+    using WarpReduce = hipcub::WarpReduce<std::uint32_t>;
+    __shared__ typename WarpReduce::TempStorage tempStorage;
+    
+    const unsigned int tid = threadIdx.x;
+    unsigned int val = tid % warpSize;
+
+    // [sphinx-initial-start]
+    // Get the thread's position in the warp
+    unsigned int laneId = threadIdx.x % warpSize;
+
+    // Use lane ID for bit-shift - PROBLEM: 1 is a 32-bit literal
+    // On AMD GPUs with warpSize=64, this can overflow when laneId >= 32
+    std::uint32_t mask = (1 << laneId) - 1;
+
+    // Shift 32 bit integer with val variable - PROBLEM: can overflow
+    std::uint32_t shift_result = (val < warpSize) ? (1 << val) : 0;
+    std::uint32_t warp_sum = WarpReduce(tempStorage).Sum(shift_result);
+    // [sphinx-initial-end]
+
+    // Store result for the first thread in each warp
+    if (laneId == 0)
+        results[tid / warpSize] = warp_sum;
+}
+
+// Improved kernel: Uses 64-bit integers explicitly
+__global__ void improved_64bit(std::uint64_t* results)
+{
+    using WarpReduce = hipcub::WarpReduce<std::uint64_t>;
+    __shared__ typename WarpReduce::TempStorage tempStorage;
+
+    const unsigned int tid = threadIdx.x;
+    unsigned int val = tid % warpSize;
+
+    // [sphinx-improved-start]
+    // Get the thread's position in the warp
+    unsigned int laneId = threadIdx.x % warpSize;
+
+    // Use lane ID for bit-shift - FIXED: 1ull is a 64-bit literal
+    std::uint64_t mask = (1ull << laneId) - 1;
+
+    // Shift 64 bit integer with val variable - FIXED: works for warpSize up to 64
+    std::uint64_t shift_result = (val < warpSize) ? (1ull << val) : 0;
+    std::uint64_t warp_sum = WarpReduce(tempStorage).Sum(shift_result);
+    // [sphinx-improved-end]
+
+    // Store result for the first thread in each warp
+    if (laneId == 0)
+        results[tid / warpSize] = warp_sum;
+}
+
+// [sphinx-typedef-start]
+// Portable lane mask type: adjusts based on GPU architecture
+#ifdef __GFX9__
+using lane_mask_t = std::uint64_t;
+#else
+using lane_mask_t = std::uint32_t;
+#endif
+// [sphinx-typedef-end]
+
+// Best practice kernel: Uses portable lane_mask_t type
+__global__ void best_portable(lane_mask_t* results)
+{
+    using WarpReduce = hipcub::WarpReduce<lane_mask_t>;
+    __shared__ typename WarpReduce::TempStorage tempStorage;
+
+    const unsigned int tid = threadIdx.x;
+    unsigned int val = tid % warpSize;
+
+    // [sphinx-best-start]
+    // Get the thread's position in the warp
+    unsigned int laneId = threadIdx.x % warpSize;
+
+    // Use lane ID for bit-shift - PORTABLE: lane_mask_t adjusts to warp size
+    lane_mask_t mask = (lane_mask_t{1} << laneId) - 1;
+
+    // Shift 32 or 64 bit integer with val variable - PORTABLE
+    lane_mask_t shift_result = (val < warpSize) ? (lane_mask_t{1} << val) : 0;
+    lane_mask_t warp_sum = WarpReduce(tempStorage).Sum(shift_result);
+    // [sphinx-best-end]
+
+    // Store result for the first thread in each warp
+    if (laneId == 0)
+        results[tid / warpSize] = warp_sum;
+}
+
+int main()
+{
+    // Query device properties
+    int warpSizeHost;
+    HIP_CHECK(hipDeviceGetAttribute(&warpSizeHost, hipDeviceAttributeWarpSize, 0));
+    
+    hipDeviceProp_t deviceProp;
+    HIP_CHECK(hipGetDeviceProperties(&deviceProp, 0));
+    
+    std::cout << "=== Device Information ===" << std::endl;
+    std::cout << "Device name: " << deviceProp.name << std::endl;
+    std::cout << "Warp size: " << warpSizeHost << std::endl;
+    std::cout << std::endl;
+
+    const int num_threads = 128;
+    const int num_warps = num_threads / warpSizeHost;
+
+    // Test 1: Problematic kernel (only safe for warpSize <= 32)
+    if (warpSizeHost <= 32)
+    {
+        std::cout << "=== Test 1: Problematic 32-bit Implementation ===" << std::endl;
+        std::cout << "Running on warpSize=" << warpSizeHost 
+                  << " (safe for this architecture)" << std::endl;
+        
+        std::uint32_t* d_results_32;
+        HIP_CHECK(hipMalloc(&d_results_32, num_warps * sizeof(std::uint32_t)));
+        
+        initial_problematic<<<1, num_threads>>>(d_results_32);
+        HIP_CHECK(hipGetLastError());
+        HIP_CHECK(hipDeviceSynchronize());
+        
+        std::uint32_t h_results_32[num_warps];
+        HIP_CHECK(hipMemcpy(h_results_32, d_results_32, 
+                           num_warps * sizeof(std::uint32_t), 
+                           hipMemcpyDeviceToHost));
+        
+        std::cout << "Results: ";
+        for (int i = 0; i < num_warps; ++i)
+        {
+            std::cout << h_results_32[i] << " ";
+        }
+        std::cout << std::endl;
+        
+        HIP_CHECK(hipFree(d_results_32));
+    }
+    else
+    {
+        std::cout << "=== Test 1: Skipped ===" << std::endl;
+        std::cout << "Problematic 32-bit kernel skipped (warpSize=" << warpSizeHost 
+                  << " would cause overflow)" << std::endl;
+    }
+    std::cout << std::endl;
+
+    // Test 2: Improved 64-bit implementation
+    std::cout << "=== Test 2: Improved 64-bit Implementation ===" << std::endl;
+    std::uint64_t* d_results_64;
+    HIP_CHECK(hipMalloc(&d_results_64, num_warps * sizeof(std::uint64_t)));
+    
+    improved_64bit<<<1, num_threads>>>(d_results_64);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());
+    
+    std::uint64_t h_results_64[num_warps];
+    HIP_CHECK(hipMemcpy(h_results_64, d_results_64, 
+                       num_warps * sizeof(std::uint64_t), 
+                       hipMemcpyDeviceToHost));
+    
+    std::cout << "Results: ";
+    for (int i = 0; i < num_warps; ++i)
+    {
+        std::cout << h_results_64[i] << " ";
+    }
+    std::cout << std::endl << std::endl;
+    
+    HIP_CHECK(hipFree(d_results_64));
+
+    // Test 3: Best practice portable implementation
+    std::cout << "=== Test 3: Best Practice Portable Implementation ===" << std::endl;
+    std::cout << "Using lane_mask_t (";
+    std::cout << (sizeof(lane_mask_t) == 8 ? "64-bit" : "32-bit");
+    std::cout << " on this architecture)" << std::endl;
+    
+    lane_mask_t* d_results_portable;
+    HIP_CHECK(hipMalloc(&d_results_portable, num_warps * sizeof(lane_mask_t)));
+    
+    best_portable<<<1, num_threads>>>(d_results_portable);
+    HIP_CHECK(hipGetLastError());
+    HIP_CHECK(hipDeviceSynchronize());
+    
+    lane_mask_t h_results_portable[num_warps];
+    HIP_CHECK(hipMemcpy(h_results_portable, d_results_portable, 
+                       num_warps * sizeof(lane_mask_t), 
+                       hipMemcpyDeviceToHost));
+    
+    std::cout << "Results: ";
+    for (int i = 0; i < num_warps; ++i)
+    {
+        std::cout << h_results_portable[i] << " ";
+    }
+    std::cout << std::endl << std::endl;
+    
+    HIP_CHECK(hipFree(d_results_portable));
+
+    // Verify results match between 64-bit and portable versions
+    bool results_match = true;
+    for (int i = 0; i < num_warps; ++i)
+    {
+        if (h_results_64[i] != static_cast<std::uint64_t>(h_results_portable[i]))
+        {
+            results_match = false;
+            break;
+        }
+    }
+    
+    std::cout << "=== Verification ===" << std::endl;
+    std::cout << "64-bit and portable results match: " 
+              << (results_match ? "YES" : "NO") << std::endl;
+
+    if (results_match)
+    {
+        std::cout << "\nSUCCESS: All kernels produced correct results!" << std::endl;
+        return EXIT_SUCCESS;
+    }
+    else
+    {
+        std::cout << "\nFAILURE: Results do not match!" << std::endl;
+        return EXIT_FAILURE;
+    }
+}
diff --git a/README.md b/README.md
index 74f52badb..e376ac37a 100644
--- a/README.md
+++ b/README.md
@@ -183,6 +183,7 @@ The following options are available when building with CMake.
       - [warp_size_reduction](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/HIP-C++-Language-Extensions/warp_size_reduction): Shows how to perform a reduction while relying on the warp size as an early-folded constant.
     - [Porting-CUDA-code-to-HIP](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP) contains the examples from the [Porting NVIDIA CUDA code to HIP guide](https://rocm.docs.amd.com/projects/HIP/en/latest/how-to/hip_porting_guide.html) page.
       - [address_retrieval](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/address_retrieval): Shows how to obtain the address of a HIP runtime function.
+      - [lane_masks_bit_shift](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift): Demonstrates portable handling of lane masks and bit-shift operations across different warp sizes.
       - [device_code_feature_identification](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/device_code_feature_identification): Shows how to query the device's compute features in device code.
       - [host_code_feature_identification](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/host_code_feature_identification): Shows how to query the device's compute features in host code.
       - [identifying_compilation_target_platform](https://github.com/ROCm/rocm-examples/tree/amd-staging/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/identifying_compilation_target_platform): Shows how to distinguish between AMD and NVIDIA target platforms in code.
diff --git a/ROCm-Examples-Portable-VS2017.sln b/ROCm-Examples-Portable-VS2017.sln
index c2b59852e..5cea96a81 100644
--- a/ROCm-Examples-Portable-VS2017.sln
+++ b/ROCm-Examples-Portable-VS2017.sln
@@ -158,6 +158,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ordinary_memory_allocation_
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "HIP-C++-Language-Extensions", "HIP-C++-Language-Extensions", "{70A51D02-8ADF-4CFD-BE3E-6647E09AF45B}"
 EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP-Doc", "Porting-CUDA-code-to-HIP-Doc", "{A6B7C8D9-E0F1-4234-5678-9ABCDEF01234}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2017", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2017.vcxproj", "{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}"
+EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP", "Porting-CUDA-code-to-HIP", "{BB989D47-FA5A-4B3E-AB7E-C18C4F62036B}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "calling_global_functions_vs2017", "HIP-Doc\Programming-Guide\HIP-C++-Language-Extensions\calling_global_functions\calling_global_functions_vs2017.vcxproj", "{67EF330D-B8D8-4A07-971E-80E7A0AB32D5}"
@@ -655,6 +659,7 @@ Global
 		{4FCDF29A-65AB-406E-A28A-40EA3D73FB18} = {70A51D02-8ADF-4CFD-BE3E-6647E09AF45B}
 		{FAD05E9E-E29B-47BD-B136-4B39B583FCF6} = {70A51D02-8ADF-4CFD-BE3E-6647E09AF45B}
 		{328E4455-5F63-4050-A03A-5B572858C6F7} = {70A51D02-8ADF-4CFD-BE3E-6647E09AF45B}
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE} = {BB989D47-FA5A-4B3E-AB7E-C18C4F62036B}
 		{16A6CA98-768D-45D7-932E-04B42AD8A19F} = {BB989D47-FA5A-4B3E-AB7E-C18C4F62036B}
 		{11967426-7BAC-4129-AD17-82C5DCE5AC9D} = {BB989D47-FA5A-4B3E-AB7E-C18C4F62036B}
 		{9EF24773-61DD-448C-8CF3-1A7BE4F6D779} = {BB989D47-FA5A-4B3E-AB7E-C18C4F62036B}
diff --git a/ROCm-Examples-Portable-VS2019.sln b/ROCm-Examples-Portable-VS2019.sln
index 9bb13ca85..1ca7e6026 100644
--- a/ROCm-Examples-Portable-VS2019.sln
+++ b/ROCm-Examples-Portable-VS2019.sln
@@ -180,6 +180,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warp_size_reduction_vs2019"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "timer_vs2019", "HIP-Doc\Programming-Guide\HIP-C++-Language-Extensions\timer\timer_vs2019.vcxproj", "{33D89BE7-5A31-4325-86C1-FC7A9809C5F0}"
 EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP-Doc", "Porting-CUDA-code-to-HIP-Doc", "{F5A6B7C8-D9E0-4123-F456-789ABCDEF012}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2019", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2019.vcxproj", "{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}"
+EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP", "Porting-CUDA-code-to-HIP", "{9AE38387-AAF7-491C-93D5-003422E121DF}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "identifying_compilation_target_platform_vs2019", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\identifying_compilation_target_platform\identifying_compilation_target_platform_vs2019.vcxproj", "{F0D0D5CC-04D3-45B2-BCCF-79D38CD31238}"
@@ -658,6 +662,7 @@ Global
 		{86492341-E8DA-4A05-8BE5-3B34029AD412} = {EBE02186-C3E3-426C-B230-CC88230E5683}
 		{8C4D1B5F-CE89-4E53-979A-83BC50A218FE} = {EBE02186-C3E3-426C-B230-CC88230E5683}
 		{33D89BE7-5A31-4325-86C1-FC7A9809C5F0} = {EBE02186-C3E3-426C-B230-CC88230E5683}
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF} = {9AE38387-AAF7-491C-93D5-003422E121DF}
 		{9AE38387-AAF7-491C-93D5-003422E121DF} = {7CC0F577-AD85-4051-B589-5C837C2E155D}
 		{F0D0D5CC-04D3-45B2-BCCF-79D38CD31238} = {9AE38387-AAF7-491C-93D5-003422E121DF}
 		{C352D9CB-430F-4BE6-8A87-D73B9804E20C} = {9AE38387-AAF7-491C-93D5-003422E121DF}
diff --git a/ROCm-Examples-Portable-VS2022.sln b/ROCm-Examples-Portable-VS2022.sln
index bc771e78c..e819e5efd 100644
--- a/ROCm-Examples-Portable-VS2022.sln
+++ b/ROCm-Examples-Portable-VS2022.sln
@@ -180,6 +180,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "warp_size_reduction_vs2022"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "timer_vs2022", "HIP-Doc\Programming-Guide\HIP-C++-Language-Extensions\timer\timer_vs2022.vcxproj", "{C1957D25-4652-01D7-463A-086B05C9B9C5}"
 EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP-Doc", "Porting-CUDA-code-to-HIP-Doc", "{E4F5A6B7-C8D9-4012-E345-6789ABCDEF01}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2022", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2022.vcxproj", "{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}"
+EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Porting-CUDA-code-to-HIP", "Porting-CUDA-code-to-HIP", "{D913551B-498A-4177-A7D6-F2A22A90FC2C}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "identifying_compilation_target_platform_vs2022", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\identifying_compilation_target_platform\identifying_compilation_target_platform_vs2022.vcxproj", "{608D4B4B-6629-74E9-034A-D069EF2A68A6}"
@@ -658,6 +662,7 @@ Global
 		{8C91BCE4-7972-5257-EA2F-F3606BC54AAC} = {8C875693-29CC-43B9-8B41-107D26C155B4}
 		{46AFE6A7-6314-6CCE-9FDE-D8D6F8E2E884} = {8C875693-29CC-43B9-8B41-107D26C155B4}
 		{C1957D25-4652-01D7-463A-086B05C9B9C5} = {8C875693-29CC-43B9-8B41-107D26C155B4}
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0} = {D913551B-498A-4177-A7D6-F2A22A90FC2C}
 		{D913551B-498A-4177-A7D6-F2A22A90FC2C} = {4596F2CF-5E1E-4EF7-8339-4934D6BF1F00}
 		{608D4B4B-6629-74E9-034A-D069EF2A68A6} = {D913551B-498A-4177-A7D6-F2A22A90FC2C}
 		{53F8C1D9-E167-F401-BC95-79A9AC3ECCC7} = {D913551B-498A-4177-A7D6-F2A22A90FC2C}
diff --git a/ROCm-Examples-VS2017.sln b/ROCm-Examples-VS2017.sln
index eeeb4cddf..5bdfd1647 100644
--- a/ROCm-Examples-VS2017.sln
+++ b/ROCm-Examples-VS2017.sln
@@ -376,6 +376,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "per_thread_default_stream_v
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pointer_memory_type_vs2017", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\pointer_memory_type\pointer_memory_type_vs2017.vcxproj", "{35E3ACD9-51BD-49DF-99A4-DB745ACB491B}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2017", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2017.vcxproj", "{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}"
+EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "async_kernel_execution_vs2017", "HIP-Doc\Programming-Guide\Using-HIP-Runtime-API\Asynchronous-Concurrent-Execution\async_kernel_execution\async_kernel_execution_vs2017.vcxproj", "{97295C91-A455-4B78-89C9-BB6EA1791BBE}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "event_based_synchronization_vs2017", "HIP-Doc\Programming-Guide\Using-HIP-Runtime-API\Asynchronous-Concurrent-Execution\event_based_synchronization\event_based_synchronization_vs2017.vcxproj", "{54C05E77-0E5D-449E-B048-8166D2EE97E7}"
@@ -1056,6 +1058,10 @@ Global
 		{35E3ACD9-51BD-49DF-99A4-DB745ACB491B}.Debug|x64.Build.0 = Debug|x64
 		{35E3ACD9-51BD-49DF-99A4-DB745ACB491B}.Release|x64.ActiveCfg = Release|x64
 		{35E3ACD9-51BD-49DF-99A4-DB745ACB491B}.Release|x64.Build.0 = Release|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.ActiveCfg = Debug|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.Build.0 = Debug|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.ActiveCfg = Release|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.Build.0 = Release|x64
 		{97295C91-A455-4B78-89C9-BB6EA1791BBE}.Debug|x64.ActiveCfg = Debug|x64
 		{97295C91-A455-4B78-89C9-BB6EA1791BBE}.Debug|x64.Build.0 = Debug|x64
 		{97295C91-A455-4B78-89C9-BB6EA1791BBE}.Release|x64.ActiveCfg = Release|x64
diff --git a/ROCm-Examples-VS2019.sln b/ROCm-Examples-VS2019.sln
index 867d9b207..eac95ed2d 100644
--- a/ROCm-Examples-VS2019.sln
+++ b/ROCm-Examples-VS2019.sln
@@ -430,6 +430,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "per_thread_default_stream_v
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pointer_memory_type_vs2019", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\pointer_memory_type\pointer_memory_type_vs2019.vcxproj", "{38AD6CC6-867E-44BA-B117-F02686076F9F}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2019", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2019.vcxproj", "{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}"
+EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Programming-for-HIP-Runtime-Compiler", "Programming-for-HIP-Runtime-Compiler", "{D0056E51-E2CD-40B1-B542-02F90E2F0767}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "compilation_apis_vs2019", "HIP-Doc\Programming-Guide\Programming-for-HIP-Runtime-Compiler\compilation_apis\compilation_apis_vs2019.vcxproj", "{404200F8-EE3E-452B-A17F-B8CE8C142054}"
@@ -1164,6 +1166,10 @@ Global
 		{38AD6CC6-867E-44BA-B117-F02686076F9F}.Debug|x64.Build.0 = Debug|x64
 		{38AD6CC6-867E-44BA-B117-F02686076F9F}.Release|x64.ActiveCfg = Release|x64
 		{38AD6CC6-867E-44BA-B117-F02686076F9F}.Release|x64.Build.0 = Release|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.ActiveCfg = Debug|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.Build.0 = Debug|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.ActiveCfg = Release|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.Build.0 = Release|x64
 		{404200F8-EE3E-452B-A17F-B8CE8C142054}.Debug|x64.ActiveCfg = Debug|x64
 		{404200F8-EE3E-452B-A17F-B8CE8C142054}.Debug|x64.Build.0 = Debug|x64
 		{404200F8-EE3E-452B-A17F-B8CE8C142054}.Release|x64.ActiveCfg = Release|x64
diff --git a/ROCm-Examples-VS2022.sln b/ROCm-Examples-VS2022.sln
index bb6b34148..7a43c9672 100644
--- a/ROCm-Examples-VS2022.sln
+++ b/ROCm-Examples-VS2022.sln
@@ -430,6 +430,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "per_thread_default_stream_v
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "pointer_memory_type_vs2022", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\pointer_memory_type\pointer_memory_type_vs2022.vcxproj", "{7B0E56C2-C6D6-E245-7B21-A64767C87738}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "lane_masks_bit_shift_vs2022", "HIP-Doc\Programming-Guide\Porting-CUDA-code-to-HIP\lane_masks_bit_shift\lane_masks_bit_shift_vs2022.vcxproj", "{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}"
+EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Programming-for-HIP-Runtime-Compiler", "Programming-for-HIP-Runtime-Compiler", "{AA794309-1F62-4964-A6BE-7E09BE0C1875}"
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "compilation_apis_vs2022", "HIP-Doc\Programming-Guide\Programming-for-HIP-Runtime-Compiler\compilation_apis\compilation_apis_vs2022.vcxproj", "{17AD5745-81C3-6574-B3A7-BD5E407836CC}"
@@ -1164,6 +1166,10 @@ Global
 		{7B0E56C2-C6D6-E245-7B21-A64767C87738}.Debug|x64.Build.0 = Debug|x64
 		{7B0E56C2-C6D6-E245-7B21-A64767C87738}.Release|x64.ActiveCfg = Release|x64
 		{7B0E56C2-C6D6-E245-7B21-A64767C87738}.Release|x64.Build.0 = Release|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.ActiveCfg = Debug|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.Build.0 = Debug|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.ActiveCfg = Release|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.Build.0 = Release|x64
 		{17AD5745-81C3-6574-B3A7-BD5E407836CC}.Debug|x64.ActiveCfg = Debug|x64
 		{17AD5745-81C3-6574-B3A7-BD5E407836CC}.Debug|x64.Build.0 = Debug|x64
 		{17AD5745-81C3-6574-B3A7-BD5E407836CC}.Release|x64.ActiveCfg = Release|x64
@@ -1455,6 +1461,7 @@ Global
 		{919E29D3-0B7C-AFFF-8E2C-3CD179744131} = {C74FDF2D-7421-4680-9940-32FC63E96B96}
 		{2F28FDEC-F73B-C8E7-2166-6F00ED3AF12C} = {C74FDF2D-7421-4680-9940-32FC63E96B96}
 		{7B0E56C2-C6D6-E245-7B21-A64767C87738} = {C74FDF2D-7421-4680-9940-32FC63E96B96}
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0} = {C74FDF2D-7421-4680-9940-32FC63E96B96}
 		{AA794309-1F62-4964-A6BE-7E09BE0C1875} = {0D14E46B-FEBB-4A92-95E3-C46C9FD2A8F4}
 		{17AD5745-81C3-6574-B3A7-BD5E407836CC} = {AA794309-1F62-4964-A6BE-7E09BE0C1875}
 		{9CA95495-B513-E12B-C332-BBEE9CF93DAE} = {AA794309-1F62-4964-A6BE-7E09BE0C1875}

From 3b27770b6354640e95e7eab196bf61225ce5cbb7 Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Thu, 18 Dec 2025 15:18:53 +0100
Subject: [PATCH 02/11] Install hipcub

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .github/workflows/build_hip_documentation.yml      | 2 +-
 .github/workflows/build_hip_documentation_cuda.yml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build_hip_documentation.yml b/.github/workflows/build_hip_documentation.yml
index 549d3b478..4bf2cccdd 100644
--- a/.github/workflows/build_hip_documentation.yml
+++ b/.github/workflows/build_hip_documentation.yml
@@ -45,7 +45,7 @@ jobs:
                   wget https://repo.radeon.com/amdgpu-install/${{ env.ROCM_VERSION }}/ubuntu/jammy/amdgpu-install_${{ env.AMDGPU_INSTALLER_VERSION }}_all.deb
                   apt-get -y install ./amdgpu-install_${{ env.AMDGPU_INSTALLER_VERSION }}_all.deb &&
                   apt-get update -qq &&
-                  apt-get -y install rocm-dev rocm-llvm-dev hipfft-dev
+                  apt-get -y install rocm-dev rocm-llvm-dev hipfft-dev hipcub-dev
                   echo "/opt/rocm/bin" >> $GITHUB_PATH
                   echo "ROCM_PATH=/opt/rocm" >> $GITHUB_ENV
                   echo "LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
diff --git a/.github/workflows/build_hip_documentation_cuda.yml b/.github/workflows/build_hip_documentation_cuda.yml
index d4096d108..c39b1382f 100644
--- a/.github/workflows/build_hip_documentation_cuda.yml
+++ b/.github/workflows/build_hip_documentation_cuda.yml
@@ -46,7 +46,7 @@ jobs:
               wget https://repo.radeon.com/amdgpu-install/${{ env.ROCM_VERSION }}/ubuntu/jammy/amdgpu-install_${{ env.AMDGPU_INSTALLER_VERSION }}_all.deb
               apt-get -y install ./amdgpu-install_${{ env.AMDGPU_INSTALLER_VERSION }}_all.deb &&
               apt-get update -qq &&
-              apt-get install -y hip-dev hipify-clang hipfft-dev
+              apt-get install -y hip-dev hipify-clang hipfft-dev hipcub-dev
 
           - name: Configure and Build
             # The CMAKE_POLICY_VERSION_MINIMUM environment variable can be removed once the CMake updates from ROCm 7.0 are available

From 9a896b848e761d0ed1e9c0b01a8469fef1f5170e Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Thu, 18 Dec 2025 15:46:29 +0100
Subject: [PATCH 03/11] Install CCCL

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .github/workflows/build_hip_documentation_cuda.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/build_hip_documentation_cuda.yml b/.github/workflows/build_hip_documentation_cuda.yml
index c39b1382f..2e09f14eb 100644
--- a/.github/workflows/build_hip_documentation_cuda.yml
+++ b/.github/workflows/build_hip_documentation_cuda.yml
@@ -38,6 +38,7 @@ jobs:
                   apt-get update -qq &&
                   apt-get install -y build-essential g++ glslang-tools \
                     python3 python3-pip locales wget git libtiff-dev
+                    cuda-cccl-12.6
                   python3 -m pip install --upgrade pip
                   python3 -m pip install cmake
 

From 25df1fcf4766710f08b31d75c99dae8ab46d9746 Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Thu, 18 Dec 2025 16:55:16 +0100
Subject: [PATCH 04/11] CMake: Find hipCUB

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .../build_hip_documentation_cuda.yml          |  1 -
 .../Porting-CUDA-code-to-HIP/CMakeLists.txt   | 24 ++++++++++++++++++-
 .../lane_masks_bit_shift/CMakeLists.txt       |  8 ++++---
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/build_hip_documentation_cuda.yml b/.github/workflows/build_hip_documentation_cuda.yml
index 2e09f14eb..c39b1382f 100644
--- a/.github/workflows/build_hip_documentation_cuda.yml
+++ b/.github/workflows/build_hip_documentation_cuda.yml
@@ -38,7 +38,6 @@ jobs:
                   apt-get update -qq &&
                   apt-get install -y build-essential g++ glslang-tools \
                     python3 python3-pip locales wget git libtiff-dev
-                    cuda-cccl-12.6
                   python3 -m pip install --upgrade pip
                   python3 -m pip install cmake
 
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt
index 3369b5cb8..71df95225 100644
--- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/CMakeLists.txt
@@ -26,6 +26,22 @@ project(HIP-Doc-Programming-Guide-Porting-CUDA-code-to-HIP LANGUAGES CXX)
 file(RELATIVE_PATH folder_bin ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin/${folder_bin})
 
+if(CMAKE_SYSTEM_NAME MATCHES "Windows")
+    set(ROCM_ROOT
+        "$ENV{HIP_PATH}"
+        CACHE PATH
+        "Root directory of the ROCm installation"
+    )
+else()
+    set(ROCM_ROOT
+        "/opt/rocm"
+        CACHE PATH
+        "Root directory of the ROCm installation"
+    )
+endif()
+
+list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
+
 add_subdirectory(address_retrieval)
 add_subdirectory(device_code_feature_identification)
 add_subdirectory(host_code_feature_identification)
@@ -40,6 +56,12 @@ else()
     add_subdirectory(load_module_ex_cuda)
 endif()
 
-add_subdirectory(lane_masks_bit_shift)
+find_package(hipcub QUIET)
+if(NOT hipcub_FOUND)
+    message(STATUS "hipCUB not found. Skipping lane masks bit-shift example.")
+else()
+    add_subdirectory(lane_masks_bit_shift)
+endif()
+
 add_subdirectory(per_thread_default_stream)
 add_subdirectory(pointer_memory_type)
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
index 2da61faf4..bb1b05a44 100644
--- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
@@ -31,8 +31,8 @@ enable_language(${ROCM_EXAMPLES_GPU_LANGUAGE})
 select_hip_platform()
 
 if(ROCM_EXAMPLES_GPU_LANGUAGE STREQUAL "CUDA")
-    # The example calls hipDeviceGetAttribute which internally uses the CUDA driver API. We must explicitly link the
-    # driver library.
+    # The example calls hipDeviceGetAttribute which internally uses the CUDA
+    # driver API. We must explicitly link the driver library.
     find_package(CUDAToolkit REQUIRED)
 endif()
 
@@ -52,6 +52,8 @@ endif()
 
 list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 
+find_package(hipcub REQUIRED)
+
 add_executable(${example_name} main.hip)
 # Make example runnable using ctest
 add_test(NAME ${example_name} COMMAND ${example_name})
@@ -62,6 +64,6 @@ set_target_properties(${example_name} PROPERTIES $<$<COMPILE_LANGUAGE:CUDA>:CUDA
                                                  $<$<COMPILE_LANGUAGE:HIP>:HIP_EXTENSIONS OFF>)
 target_compile_features(${example_name} PRIVATE cuda_std_17 hip_std_17)
 target_include_directories(${example_name} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:${ROCM_ROOT}/include>)
-target_link_libraries(${example_name} PRIVATE $<$<LINK_LANGUAGE:CUDA>:CUDA::cuda_driver>)
+target_link_libraries(${example_name} PRIVATE $<$<LINK_LANGUAGE:CUDA>:CUDA::cuda_driver> hip::hipcub)
 
 install(TARGETS ${example_name})

From af979b6343d5628c6c534a97aef7e509c5c48246 Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Thu, 18 Dec 2025 17:03:28 +0100
Subject: [PATCH 05/11] Install CCCL

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .github/workflows/build_portable_sln.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml
index ee9774231..9bbeadc5e 100644
--- a/.github/workflows/build_portable_sln.yml
+++ b/.github/workflows/build_portable_sln.yml
@@ -71,7 +71,7 @@ jobs:
                   Write-Host "Downloading installer https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe"
                   Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe -OutFile CUDAToolkit.exe
                   Write-Host "Installing CUDA Toolkit"
-                  Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait
+                  Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cccl_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait
                   if (Test-Path -Path "${{ env.CUDA_PATH }}") {
                       Write-Host "CUDA_PATH directory exists: ${{ env.CUDA_PATH }}"
                       Write-Host "CUDA Toolkit installed successfully"

From cfa93194af0ff714783c520ca36a00a81d8a5082 Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Thu, 18 Dec 2025 17:12:51 +0100
Subject: [PATCH 06/11] Guard against double inclusion

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .../lane_masks_bit_shift/CMakeLists.txt                       | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
index bb1b05a44..822e6922f 100644
--- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/CMakeLists.txt
@@ -52,7 +52,9 @@ endif()
 
 list(APPEND CMAKE_PREFIX_PATH "${ROCM_ROOT}")
 
-find_package(hipcub REQUIRED)
+if(NOT TARGET hip::hipcub)
+    find_package(hipcub REQUIRED)
+endif()
 
 add_executable(${example_name} main.hip)
 # Make example runnable using ctest

From cba092ec5053353cc8e7fd717f7c03fb89a22daa Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Thu, 18 Dec 2025 17:19:54 +0100
Subject: [PATCH 07/11] Remove non-existing subpackage

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .github/workflows/build_portable_sln.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml
index 9bbeadc5e..ee9774231 100644
--- a/.github/workflows/build_portable_sln.yml
+++ b/.github/workflows/build_portable_sln.yml
@@ -71,7 +71,7 @@ jobs:
                   Write-Host "Downloading installer https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe"
                   Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe -OutFile CUDAToolkit.exe
                   Write-Host "Installing CUDA Toolkit"
-                  Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cccl_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait
+                  Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait
                   if (Test-Path -Path "${{ env.CUDA_PATH }}") {
                       Write-Host "CUDA_PATH directory exists: ${{ env.CUDA_PATH }}"
                       Write-Host "CUDA Toolkit installed successfully"

From 32d1d1fad19de84f8dd2b2f74f770541109f54a0 Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Mon, 2 Feb 2026 18:32:51 +0100
Subject: [PATCH 08/11] Update to CUDA 12.9.1

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .github/workflows/build_portable_sln.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml
index ee9774231..f8c5cf3d8 100644
--- a/.github/workflows/build_portable_sln.yml
+++ b/.github/workflows/build_portable_sln.yml
@@ -30,9 +30,9 @@ env:
     PLATFORM_TOOLSET_VERSION: 6.4
     HIP_PATH: C:\Program Files\AMD\ROCm\6.4\
     HIPSDK_INSTALLER_VERSION: 25.Q3
-    CUDA_VERSION: 12.8.1
-    CUDA_INSTALLER_VERSION: 572.61
-    CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8
+    CUDA_VERSION: 12.9.1
+    CUDA_INSTALLER_VERSION: 576.57
+    CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9
     GLFW_DIR: C:\glfw-3.4.bin.WIN64\
     VULKAN_SDK: C:\VulkanSDK\1.4.313.2\
 
@@ -71,7 +71,7 @@ jobs:
                   Write-Host "Downloading installer https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe"
                   Invoke-WebRequest -Uri https://developer.download.nvidia.com/compute/cuda/${{ env.CUDA_VERSION }}/local_installers/cuda_${{ env.CUDA_VERSION }}_${{ env.CUDA_INSTALLER_VERSION }}_windows.exe -OutFile CUDAToolkit.exe
                   Write-Host "Installing CUDA Toolkit"
-                  Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.8 cudart_12.8 nvcc_12.8 nvrtc_dev_12.8 cublas_dev_12.8 cufft_dev_12.8 curand_dev_12.8 cusolver_dev_12.8 cusparse_dev_12.8 visual_studio_integration_12.8','-n' -NoNewWindow -Wait
+                  Start-Process CUDAToolkit.exe -ArgumentList '-s cuda_profiler_api_12.9 cudart_12.9 nvcc_12.9 nvrtc_dev_12.9 cublas_dev_12.9 cufft_dev_12.9 curand_dev_12.9 cusolver_dev_12.9 cusparse_dev_12.9 visual_studio_integration_12.9','-n' -NoNewWindow -Wait
                   if (Test-Path -Path "${{ env.CUDA_PATH }}") {
                       Write-Host "CUDA_PATH directory exists: ${{ env.CUDA_PATH }}"
                       Write-Host "CUDA Toolkit installed successfully"

From c967b7c78b45faf6eda9241c8503fb1096870d4a Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Tue, 3 Feb 2026 12:51:13 +0100
Subject: [PATCH 09/11] Update to HIP SDK 7.1.1

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .github/workflows/build_applications_vs.yml      | 6 +++---
 .github/workflows/build_hip_basic_vs.yml         | 6 +++---
 .github/workflows/build_hip_documentation_vs.yml | 6 +++---
 .github/workflows/build_libraries_vs.yml         | 6 +++---
 .github/workflows/build_portable_sln.yml         | 6 +++---
 .github/workflows/build_programming_guide_vs.yml | 6 +++---
 6 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/build_applications_vs.yml b/.github/workflows/build_applications_vs.yml
index 442933b7f..7ed7fe1ff 100644
--- a/.github/workflows/build_applications_vs.yml
+++ b/.github/workflows/build_applications_vs.yml
@@ -19,9 +19,9 @@ concurrency:
   cancel-in-progress: true
 
 env:
-    PLATFORM_TOOLSET_VERSION: 6.4
-    HIP_PATH: C:\Program Files\AMD\ROCm\6.4\
-    HIPSDK_INSTALLER_VERSION: 25.Q3
+    PLATFORM_TOOLSET_VERSION: 7.1
+    HIP_PATH: C:\Program Files\AMD\ROCm\7.1\
+    HIPSDK_INSTALLER_VERSION: 26.Q1
     GLFW_DIR: C:\glfw-3.4.bin.WIN64\
 
 jobs:
diff --git a/.github/workflows/build_hip_basic_vs.yml b/.github/workflows/build_hip_basic_vs.yml
index cdb007f43..b96b026af 100644
--- a/.github/workflows/build_hip_basic_vs.yml
+++ b/.github/workflows/build_hip_basic_vs.yml
@@ -19,9 +19,9 @@ concurrency:
   cancel-in-progress: true
 
 env:
-    PLATFORM_TOOLSET_VERSION: 6.4
-    HIP_PATH: C:\Program Files\AMD\ROCm\6.4\
-    HIPSDK_INSTALLER_VERSION: 25.Q3
+    PLATFORM_TOOLSET_VERSION: 7.1
+    HIP_PATH: C:\Program Files\AMD\ROCm\7.1\
+    HIPSDK_INSTALLER_VERSION: 26.Q1
     VULKAN_SDK: C:\VulkanSDK\1.4.313.2\
     GLFW_DIR: C:\glfw-3.4.bin.WIN64\
 
diff --git a/.github/workflows/build_hip_documentation_vs.yml b/.github/workflows/build_hip_documentation_vs.yml
index da8447b6a..516c5a14b 100644
--- a/.github/workflows/build_hip_documentation_vs.yml
+++ b/.github/workflows/build_hip_documentation_vs.yml
@@ -19,9 +19,9 @@ concurrency:
   cancel-in-progress: true
 
 env:
-    PLATFORM_TOOLSET_VERSION: 6.4
-    HIP_PATH: C:\Program Files\AMD\ROCm\6.4\
-    HIPSDK_INSTALLER_VERSION: 25.Q3
+    PLATFORM_TOOLSET_VERSION: 7.1
+    HIP_PATH: C:\Program Files\AMD\ROCm\7.1\
+    HIPSDK_INSTALLER_VERSION: 26.Q1
     VULKAN_SDK: C:\VulkanSDK\1.4.313.2\
     GLFW_DIR: C:\glfw-3.4.bin.WIN64\
 
diff --git a/.github/workflows/build_libraries_vs.yml b/.github/workflows/build_libraries_vs.yml
index 232d57456..c5cbfa8a2 100644
--- a/.github/workflows/build_libraries_vs.yml
+++ b/.github/workflows/build_libraries_vs.yml
@@ -19,9 +19,9 @@ concurrency:
   cancel-in-progress: true
 
 env:
-    PLATFORM_TOOLSET_VERSION: 6.4
-    HIP_PATH: C:\Program Files\AMD\ROCm\6.4\
-    HIPSDK_INSTALLER_VERSION: 25.Q3
+    PLATFORM_TOOLSET_VERSION: 7.1
+    HIP_PATH: C:\Program Files\AMD\ROCm\7.1\
+    HIPSDK_INSTALLER_VERSION: 26.Q1
 
 jobs:
     build:
diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml
index f8c5cf3d8..d1c29e7ef 100644
--- a/.github/workflows/build_portable_sln.yml
+++ b/.github/workflows/build_portable_sln.yml
@@ -27,9 +27,9 @@ concurrency:
   cancel-in-progress: true
 
 env:
-    PLATFORM_TOOLSET_VERSION: 6.4
-    HIP_PATH: C:\Program Files\AMD\ROCm\6.4\
-    HIPSDK_INSTALLER_VERSION: 25.Q3
+    PLATFORM_TOOLSET_VERSION: 7.1
+    HIP_PATH: C:\Program Files\AMD\ROCm\7.1\
+    HIPSDK_INSTALLER_VERSION: 26.Q1
     CUDA_VERSION: 12.9.1
     CUDA_INSTALLER_VERSION: 576.57
     CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9
diff --git a/.github/workflows/build_programming_guide_vs.yml b/.github/workflows/build_programming_guide_vs.yml
index 79a7dfb19..890c313a6 100644
--- a/.github/workflows/build_programming_guide_vs.yml
+++ b/.github/workflows/build_programming_guide_vs.yml
@@ -19,9 +19,9 @@ concurrency:
   cancel-in-progress: true
 
 env:
-    PLATFORM_TOOLSET_VERSION: 6.4
-    HIP_PATH: C:\Program Files\AMD\ROCm\6.4\
-    HIPSDK_INSTALLER_VERSION: 25.Q3
+    PLATFORM_TOOLSET_VERSION: 7.1
+    HIP_PATH: C:\Program Files\AMD\ROCm\7.1\
+    HIPSDK_INSTALLER_VERSION: 26.Q1
     VULKAN_SDK: C:\VulkanSDK\1.4.313.2\
     GLFW_DIR: C:\glfw-3.4.bin.WIN64\
 

From d9dbd627d24a4c282ea732025045c5a6237fbfee Mon Sep 17 00:00:00 2001
From: Jan Stephan <jan.stephan@amd.com>
Date: Tue, 3 Feb 2026 12:56:15 +0100
Subject: [PATCH 10/11] Fix installer URL

Signed-off-by: Jan Stephan <jan.stephan@amd.com>
---
 .github/workflows/build_applications_vs.yml      | 4 ++--
 .github/workflows/build_hip_basic_vs.yml         | 4 ++--
 .github/workflows/build_hip_documentation_vs.yml | 4 ++--
 .github/workflows/build_libraries_vs.yml         | 4 ++--
 .github/workflows/build_portable_sln.yml         | 4 ++--
 .github/workflows/build_programming_guide_vs.yml | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/build_applications_vs.yml b/.github/workflows/build_applications_vs.yml
index 7ed7fe1ff..ba8a52345 100644
--- a/.github/workflows/build_applications_vs.yml
+++ b/.github/workflows/build_applications_vs.yml
@@ -52,8 +52,8 @@ jobs:
           - name: Install HIP SDK
             run: |
                   $ProgressPreference = 'SilentlyContinue'
-                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe"
-                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe
+                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe"
+                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe
                   Write-Host "Installing HIP SDK"
                   Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait
                   Write-Host "Installer log:"
diff --git a/.github/workflows/build_hip_basic_vs.yml b/.github/workflows/build_hip_basic_vs.yml
index b96b026af..2edc90e5e 100644
--- a/.github/workflows/build_hip_basic_vs.yml
+++ b/.github/workflows/build_hip_basic_vs.yml
@@ -57,8 +57,8 @@ jobs:
           - name: Install HIP SDK
             run: |
                   $ProgressPreference = 'SilentlyContinue'
-                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe"
-                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe
+                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe"
+                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe
                   Write-Host "Installing HIP SDK"
                   Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait
                   Write-Host "Installer log:"
diff --git a/.github/workflows/build_hip_documentation_vs.yml b/.github/workflows/build_hip_documentation_vs.yml
index 516c5a14b..755dd5925 100644
--- a/.github/workflows/build_hip_documentation_vs.yml
+++ b/.github/workflows/build_hip_documentation_vs.yml
@@ -57,8 +57,8 @@ jobs:
           - name: Install HIP SDK
             run: |
                   $ProgressPreference = 'SilentlyContinue'
-                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe"
-                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe
+                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe"
+                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe
                   Write-Host "Installing HIP SDK"
                   Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait
                   Write-Host "Installer log:"
diff --git a/.github/workflows/build_libraries_vs.yml b/.github/workflows/build_libraries_vs.yml
index c5cbfa8a2..477f457c2 100644
--- a/.github/workflows/build_libraries_vs.yml
+++ b/.github/workflows/build_libraries_vs.yml
@@ -46,8 +46,8 @@ jobs:
           - name: Install HIP SDK
             run: |
                   $ProgressPreference = 'SilentlyContinue'
-                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe"
-                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe
+                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe"
+                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe
                   Write-Host "Installing HIP SDK"
                   Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait
                   Write-Host "Installer log:"
diff --git a/.github/workflows/build_portable_sln.yml b/.github/workflows/build_portable_sln.yml
index d1c29e7ef..be781148a 100644
--- a/.github/workflows/build_portable_sln.yml
+++ b/.github/workflows/build_portable_sln.yml
@@ -83,8 +83,8 @@ jobs:
           - name: Install HIP SDK
             run: |
                   $ProgressPreference = 'SilentlyContinue'
-                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe"
-                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe
+                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe"
+                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe
                   Write-Host "Installing HIP SDK"
                   Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait
                   Write-Host "Installer log:"
diff --git a/.github/workflows/build_programming_guide_vs.yml b/.github/workflows/build_programming_guide_vs.yml
index 890c313a6..334f51234 100644
--- a/.github/workflows/build_programming_guide_vs.yml
+++ b/.github/workflows/build_programming_guide_vs.yml
@@ -47,8 +47,8 @@ jobs:
           - name: Install HIP SDK
             run: |
                   $ProgressPreference = 'SilentlyContinue'
-                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe"
-                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win10-Win11-For-HIP.exe -OutFile HIPSDK.exe
+                  Write-Host "Downloading installer https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe"
+                  Invoke-WebRequest -Uri https://download.amd.com/developer/eula/rocm-hub/AMD-Software-PRO-Edition-${{ env.HIPSDK_INSTALLER_VERSION }}-Win11-For-HIP.exe -OutFile HIPSDK.exe
                   Write-Host "Installing HIP SDK"
                   Start-Process HIPSDK.exe -ArgumentList '-install','-log',"$((Get-Location).Path)\installer_log.txt" -NoNewWindow -Wait
                   Write-Host "Installer log:"

From dd9045e01c056453f5c4134607bae79758d8292c Mon Sep 17 00:00:00 2001
From: Adel Johar <adel.johar@amd.com>
Date: Tue, 3 Mar 2026 11:03:12 +0100
Subject: [PATCH 11/11] Rebase fixes

---
 .github/workflows/build_applications_cuda.yml |  2 +-
 .github/workflows/build_hip_basic_cuda.yml    |  2 +-
 .../build_hip_documentation_cuda.yml          |  2 +-
 .github/workflows/build_libraries_cuda.yml    |  2 +-
 .../build_programming_guide_cuda.yml          |  2 +-
 .../lane_masks_bit_shift/main.hip             | 64 +++++++++----------
 ROCm-Examples-Portable-VS2017.sln             |  4 ++
 ROCm-Examples-Portable-VS2019.sln             |  4 ++
 ROCm-Examples-Portable-VS2022.sln             |  4 ++
 9 files changed, 49 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/build_applications_cuda.yml b/.github/workflows/build_applications_cuda.yml
index 800132abb..85d98c6b3 100644
--- a/.github/workflows/build_applications_cuda.yml
+++ b/.github/workflows/build_applications_cuda.yml
@@ -26,7 +26,7 @@ jobs:
         name: "Build Applications CUDA"
         runs-on: ubuntu-latest
         container:
-            image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
+            image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
         defaults:
             run:
                 shell: bash
diff --git a/.github/workflows/build_hip_basic_cuda.yml b/.github/workflows/build_hip_basic_cuda.yml
index 652ed6c6d..99ad84358 100644
--- a/.github/workflows/build_hip_basic_cuda.yml
+++ b/.github/workflows/build_hip_basic_cuda.yml
@@ -26,7 +26,7 @@ jobs:
         name: "Build HIP Basic CUDA"
         runs-on: ubuntu-latest
         container:
-            image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
+            image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
         defaults:
             run:
                 shell: bash
diff --git a/.github/workflows/build_hip_documentation_cuda.yml b/.github/workflows/build_hip_documentation_cuda.yml
index c39b1382f..06879e4dc 100644
--- a/.github/workflows/build_hip_documentation_cuda.yml
+++ b/.github/workflows/build_hip_documentation_cuda.yml
@@ -26,7 +26,7 @@ jobs:
         name: "Build HIP Documentation CUDA"
         runs-on: ubuntu-latest
         container:
-            image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
+            image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
         defaults:
             run:
                 shell: bash
diff --git a/.github/workflows/build_libraries_cuda.yml b/.github/workflows/build_libraries_cuda.yml
index ca012960e..6d43c3851 100644
--- a/.github/workflows/build_libraries_cuda.yml
+++ b/.github/workflows/build_libraries_cuda.yml
@@ -31,7 +31,7 @@ jobs:
       run:
         shell: bash
     container:
-      image: ghcr.io/rocm/rocm-examples-hip-libraries-cuda-ubuntu
+      image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
       options: --user root
 
     steps:
diff --git a/.github/workflows/build_programming_guide_cuda.yml b/.github/workflows/build_programming_guide_cuda.yml
index 43cbd3e36..94e9ab60f 100644
--- a/.github/workflows/build_programming_guide_cuda.yml
+++ b/.github/workflows/build_programming_guide_cuda.yml
@@ -26,7 +26,7 @@ jobs:
         name: "Build ROCm Programming Guide CUDA"
         runs-on: ubuntu-latest
         container:
-            image: nvidia/cuda:12.8.1-cudnn-devel-ubuntu22.04
+            image: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
         defaults:
             run:
                 shell: bash
diff --git a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip
index 95ec02ee7..93d816bc4 100644
--- a/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip
+++ b/HIP-Doc/Programming-Guide/Porting-CUDA-code-to-HIP/lane_masks_bit_shift/main.hip
@@ -20,13 +20,13 @@
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 // SOFTWARE.
 
-#include <hipcub/hipcub.hpp>
-
 #include <hip/hip_runtime.h>
+#include <hipcub/warp/warp_reduce.hpp>
 
 #include <cstdint>
 #include <cstdlib>
 #include <iostream>
+#include <vector>
 
 #define HIP_CHECK(expression)                                \
 {                                                            \
@@ -44,7 +44,7 @@ __global__ void initial_problematic(std::uint32_t* results)
 {
     using WarpReduce = hipcub::WarpReduce<std::uint32_t>;
     __shared__ typename WarpReduce::TempStorage tempStorage;
-    
+
     const unsigned int tid = threadIdx.x;
     unsigned int val = tid % warpSize;
 
@@ -132,10 +132,10 @@ int main()
     // Query device properties
     int warpSizeHost;
     HIP_CHECK(hipDeviceGetAttribute(&warpSizeHost, hipDeviceAttributeWarpSize, 0));
-    
+
     hipDeviceProp_t deviceProp;
     HIP_CHECK(hipGetDeviceProperties(&deviceProp, 0));
-    
+
     std::cout << "=== Device Information ===" << std::endl;
     std::cout << "Device name: " << deviceProp.name << std::endl;
     std::cout << "Warp size: " << warpSizeHost << std::endl;
@@ -148,34 +148,34 @@ int main()
     if (warpSizeHost <= 32)
     {
         std::cout << "=== Test 1: Problematic 32-bit Implementation ===" << std::endl;
-        std::cout << "Running on warpSize=" << warpSizeHost 
+        std::cout << "Running on warpSize=" << warpSizeHost
                   << " (safe for this architecture)" << std::endl;
-        
+
         std::uint32_t* d_results_32;
         HIP_CHECK(hipMalloc(&d_results_32, num_warps * sizeof(std::uint32_t)));
-        
+
         initial_problematic<<<1, num_threads>>>(d_results_32);
         HIP_CHECK(hipGetLastError());
         HIP_CHECK(hipDeviceSynchronize());
-        
-        std::uint32_t h_results_32[num_warps];
-        HIP_CHECK(hipMemcpy(h_results_32, d_results_32, 
-                           num_warps * sizeof(std::uint32_t), 
+
+        std::vector<std::uint32_t> h_results_32(num_warps);
+        HIP_CHECK(hipMemcpy(h_results_32.data(), d_results_32,
+                           num_warps * sizeof(std::uint32_t),
                            hipMemcpyDeviceToHost));
-        
+
         std::cout << "Results: ";
         for (int i = 0; i < num_warps; ++i)
         {
             std::cout << h_results_32[i] << " ";
         }
         std::cout << std::endl;
-        
+
         HIP_CHECK(hipFree(d_results_32));
     }
     else
     {
         std::cout << "=== Test 1: Skipped ===" << std::endl;
-        std::cout << "Problematic 32-bit kernel skipped (warpSize=" << warpSizeHost 
+        std::cout << "Problematic 32-bit kernel skipped (warpSize=" << warpSizeHost
                   << " would cause overflow)" << std::endl;
     }
     std::cout << std::endl;
@@ -184,23 +184,23 @@ int main()
     std::cout << "=== Test 2: Improved 64-bit Implementation ===" << std::endl;
     std::uint64_t* d_results_64;
     HIP_CHECK(hipMalloc(&d_results_64, num_warps * sizeof(std::uint64_t)));
-    
+
     improved_64bit<<<1, num_threads>>>(d_results_64);
     HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipDeviceSynchronize());
-    
-    std::uint64_t h_results_64[num_warps];
-    HIP_CHECK(hipMemcpy(h_results_64, d_results_64, 
-                       num_warps * sizeof(std::uint64_t), 
+
+    std::vector<std::uint32_t> h_results_64(num_warps);
+    HIP_CHECK(hipMemcpy(h_results_64.data(), d_results_64,
+                       num_warps * sizeof(std::uint64_t),
                        hipMemcpyDeviceToHost));
-    
+
     std::cout << "Results: ";
     for (int i = 0; i < num_warps; ++i)
     {
         std::cout << h_results_64[i] << " ";
     }
     std::cout << std::endl << std::endl;
-    
+
     HIP_CHECK(hipFree(d_results_64));
 
     // Test 3: Best practice portable implementation
@@ -208,26 +208,26 @@ int main()
     std::cout << "Using lane_mask_t (";
     std::cout << (sizeof(lane_mask_t) == 8 ? "64-bit" : "32-bit");
     std::cout << " on this architecture)" << std::endl;
-    
+
     lane_mask_t* d_results_portable;
     HIP_CHECK(hipMalloc(&d_results_portable, num_warps * sizeof(lane_mask_t)));
-    
+
     best_portable<<<1, num_threads>>>(d_results_portable);
     HIP_CHECK(hipGetLastError());
     HIP_CHECK(hipDeviceSynchronize());
-    
-    lane_mask_t h_results_portable[num_warps];
-    HIP_CHECK(hipMemcpy(h_results_portable, d_results_portable, 
-                       num_warps * sizeof(lane_mask_t), 
+
+    std::vector<lane_mask_t> h_results_portable(num_warps);
+    HIP_CHECK(hipMemcpy(h_results_portable.data(), d_results_portable,
+                       num_warps * sizeof(lane_mask_t),
                        hipMemcpyDeviceToHost));
-    
+
     std::cout << "Results: ";
     for (int i = 0; i < num_warps; ++i)
     {
         std::cout << h_results_portable[i] << " ";
     }
     std::cout << std::endl << std::endl;
-    
+
     HIP_CHECK(hipFree(d_results_portable));
 
     // Verify results match between 64-bit and portable versions
@@ -240,9 +240,9 @@ int main()
             break;
         }
     }
-    
+
     std::cout << "=== Verification ===" << std::endl;
-    std::cout << "64-bit and portable results match: " 
+    std::cout << "64-bit and portable results match: "
               << (results_match ? "YES" : "NO") << std::endl;
 
     if (results_match)
diff --git a/ROCm-Examples-Portable-VS2017.sln b/ROCm-Examples-Portable-VS2017.sln
index 5cea96a81..e798967ef 100644
--- a/ROCm-Examples-Portable-VS2017.sln
+++ b/ROCm-Examples-Portable-VS2017.sln
@@ -572,6 +572,10 @@ Global
 		{2F455A27-44B0-4962-AA36-79F7053D320E}.Debug|x64.Build.0 = Debug|x64
 		{2F455A27-44B0-4962-AA36-79F7053D320E}.Release|x64.ActiveCfg = Release|x64
 		{2F455A27-44B0-4962-AA36-79F7053D320E}.Release|x64.Build.0 = Release|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.ActiveCfg = Debug|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Debug|x64.Build.0 = Debug|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.ActiveCfg = Release|x64
+		{A1B2C3D4-E5F6-4789-A012-3456789ABCDE}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/ROCm-Examples-Portable-VS2019.sln b/ROCm-Examples-Portable-VS2019.sln
index 1ca7e6026..dd2a24531 100644
--- a/ROCm-Examples-Portable-VS2019.sln
+++ b/ROCm-Examples-Portable-VS2019.sln
@@ -572,6 +572,10 @@ Global
 		{D9432180-B518-41B6-8528-9B376AC64E38}.Debug|x64.Build.0 = Debug|x64
 		{D9432180-B518-41B6-8528-9B376AC64E38}.Release|x64.ActiveCfg = Release|x64
 		{D9432180-B518-41B6-8528-9B376AC64E38}.Release|x64.Build.0 = Release|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.ActiveCfg = Debug|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Debug|x64.Build.0 = Debug|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.ActiveCfg = Release|x64
+		{B2C3D4E5-F6A7-4890-B123-456789ABCDEF}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/ROCm-Examples-Portable-VS2022.sln b/ROCm-Examples-Portable-VS2022.sln
index e819e5efd..e0009ccaa 100644
--- a/ROCm-Examples-Portable-VS2022.sln
+++ b/ROCm-Examples-Portable-VS2022.sln
@@ -572,6 +572,10 @@ Global
 		{B7E76F08-1DA6-4520-86D7-1986CF809179}.Debug|x64.Build.0 = Debug|x64
 		{B7E76F08-1DA6-4520-86D7-1986CF809179}.Release|x64.ActiveCfg = Release|x64
 		{B7E76F08-1DA6-4520-86D7-1986CF809179}.Release|x64.Build.0 = Release|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.ActiveCfg = Debug|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Debug|x64.Build.0 = Debug|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.ActiveCfg = Release|x64
+		{C3D4E5F6-A7B8-4901-C234-56789ABCDEF0}.Release|x64.Build.0 = Release|x64
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE