JuliaGPU · christiangnrd · Nov 22, 2025 · vchuravy · Nov 22, 2025 · christiangnrd
diff --git a/src/intrinsics.jl b/src/intrinsics.jl
@@ -116,6 +116,32 @@ Declare memory that is local to a workgroup.
 """
 localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))
 
+"""
+    shfl_down(val::T, offset::Integer)::T where T
+
+Read `val` from a lane with higher id given by `offset`.
+
+!!! note
+    Backend implementations **must** implement:
+    ```
+    @device_override shfl_down(val::T, offset::Integer)::T where T
+    ```
+    As well as the on-device functionality.
+"""
+function shfl_down end
+
+"""
+    shfl_down_types(::Backend)::Vector{DataType}
+
+Returns a vector of `DataType`s supported on `backend`
+
+!!! note
+    Backend implementations **must** implement this function
+    only if they support `shfl_down` for any types.
+"""
+shfl_down_types(::Backend) = DataType[]
+
+
 """
     barrier()
 

diff --git a/test/intrinsics.jl b/test/intrinsics.jl
@@ -20,6 +20,32 @@ function test_intrinsics_kernel(results)
     return
 end
 
+# Do NOT use this kernel as an example for your code.
+#  It was written assuming one workgroup of size 32 and
+#  is only valid for those
+function shfl_down_test_kernel(a, b)
+    # This is not valid
+    idx = KI.get_local_id().x
+
+    temp = KI.localmemory(eltype(b), 32)
+    temp[idx] = a[idx]
+
+    KI.barrier()
+
+    if idx == 1
+        value = temp[idx]
+
+        value = value + KI.shfl_down(value, 16)
+        value = value + KI.shfl_down(value,  8)
+        value = value + KI.shfl_down(value,  4)
+        value = value + KI.shfl_down(value,  2)
+        value = value + KI.shfl_down(value,  1)
+
+        b[idx] = value
+    end
+    return
+end
+
 function intrinsics_testsuite(backend, AT)
     @testset "KernelIntrinsics Tests" begin
         @testset "Launch parameters" begin
@@ -119,6 +145,18 @@ function intrinsics_testsuite(backend, AT)
                 @test local_id_x == expected_local
             end
         end
+        @testset "shfl_down(::$T)" for T in KI.shfl_down_types(backend())
+            a = zeros(T, 32)
+            rand!(a, (1:4))
+
+            dev_a = AT(a)
+            dev_b = AT(zeros(T, 32))
+
+            KI.@kernel backend() workgroupsize=32 shfl_down_test_kernel(dev_a, dev_b)
+
+            b = Array(dev_b)
+            @test sum(a) ≈ b[1]
+        end
     end
     return nothing
 end