Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions src/intrinsics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,32 @@ Declare memory that is local to a workgroup.
"""
localmemory(::Type{T}, dims) where {T} = localmemory(T, Val(dims))

"""
shfl_down(val::T, offset::Integer)::T where T

Read `val` from a lane with higher id given by `offset`.

!!! note
Backend implementations **must** implement:
```
@device_override shfl_down(val::T, offset::Integer)::T where T
```
As well as the on-device functionality.
"""
function shfl_down end

"""
shfl_down_types(::Backend)::Vector{DataType}

Returns a vector of `DataType`s supported on `backend`

!!! note
Backend implementations **must** implement this function
only if they support `shfl_down` for any types.
"""
shfl_down_types(::Backend) = DataType[]


"""
barrier()

Expand Down
38 changes: 38 additions & 0 deletions test/intrinsics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,32 @@ function test_intrinsics_kernel(results)
return
end

# Do NOT use this kernel as an example for your code.
# It was written assuming one workgroup of size 32 and
# is only valid for those
function shfl_down_test_kernel(a, b)
# This is not valid
idx = KI.get_local_id().x

temp = KI.localmemory(eltype(b), 32)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So we need a query function to find the subgroup size? Then pass that to a Val?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently this is like #559 where it assumes that subgroup size is always 32.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "This is not valid" is because it's using the local_id but we could do like #559 and modulo 32 to find subgroup position and stuff

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think AMD has some chips where subgroup size is 64. So we should have some way for the use to query this (even if it is just on the host)

Copy link
Member Author

@christiangnrd christiangnrd Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

GPU Backend Host-Side Method Device-Side Method (Intrinsic)
Metal thread_execution_width property of MTLComputePipelineState (need compiled kernel) [[threads_per_simdgroup()
AMDGPU wavefrontsize(dev::HIPDevice) wavefrontsize()
CUDA warpsize(dev::CuDevice) warpsize()
OpenCL get_sub_group_size()
oneAPI get_sub_group_size()?

Copy link
Member Author

@christiangnrd christiangnrd Nov 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is Metal the only backend that currently lacks dynamic local memory?

Copy link

@VarLad VarLad Nov 23, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On OpenCL and oneAPI, the host side methods are probably CL_DEVICE_SUB_GROUP_SIZES_INTEL + clDeviceInfo and subGroupSizes + zeDeviceGetComputeProperties, respectively

refs:
OpenCL extension doc
Intel levelZero docs
pocl cuda driver

temp[idx] = a[idx]

KI.barrier()

if idx == 1
value = temp[idx]

value = value + KI.shfl_down(value, 16)
value = value + KI.shfl_down(value, 8)
value = value + KI.shfl_down(value, 4)
value = value + KI.shfl_down(value, 2)
value = value + KI.shfl_down(value, 1)

b[idx] = value
end
return
end

function intrinsics_testsuite(backend, AT)
@testset "KernelIntrinsics Tests" begin
@testset "Launch parameters" begin
Expand Down Expand Up @@ -119,6 +145,18 @@ function intrinsics_testsuite(backend, AT)
@test local_id_x == expected_local
end
end
@testset "shfl_down(::$T)" for T in KI.shfl_down_types(backend())
a = zeros(T, 32)
rand!(a, (1:4))

dev_a = AT(a)
dev_b = AT(zeros(T, 32))

KI.@kernel backend() workgroupsize=32 shfl_down_test_kernel(dev_a, dev_b)

b = Array(dev_b)
@test sum(a) ≈ b[1]
end
end
return nothing
end
Loading