[CUDA] columnwise quantize with TMA (#3157)
Conversation
| #if (CUDART_VERSION >= 12080) && (__CUDA_ARCH__ >= 1000) && \ | ||
| defined(__CUDA_ARCH_SPECIFIC__) | ||
|
|
||
| __device__ __forceinline__ void mbarrier_init(uint64_t* mbar, uint32_t count) { |
There was a problem hiding this comment.
Should we use the cuda::ptx APIs, such as cuda::ptx::mbarrier_init, instead? They don't have good documentation and you would have to search https://github.com/NVIDIA/cccl to find the API names, though.
| @@ -10,11 +10,6 @@ namespace mlx::core { | |||
|
|
|||
| namespace cg = cooperative_groups; | |||
There was a problem hiding this comment.
This should be moved to namespace cu too.
| auto tidy = block_idx.y * block_size.y + idx_in_block.y; | ||
| auto grid_dim_x = cg::this_grid().dim_blocks().x * block_size.x; | ||
|
|
||
| size_t thread_idx = tidx + grid_dim_x * size_t(tidy); |
There was a problem hiding this comment.
I think we can just use cg::this_grid().thread_rank()?
| int in_size_bytes, // itemsize | ||
| int bits) { | ||
| dim3 grid; | ||
| grid.x = (grid_dim_x_size + block_size_x - 1) / block_size_x; |
There was a problem hiding this comment.
Can you use cuda::ceil_div when possible?
| grid.x = (grid_dim_x_size + block_size_x - 1) / block_size_x; | |
| grid.x = cuda::ceil_div(grid_dim_x_size, block_size_x); |
|
|
||
| constexpr size_t out_tile_elems = BUFF_ELEMS / elem_per_byte; | ||
| constexpr size_t out_tile_size = out_tile_elems; | ||
| constexpr size_t out_buff_size_aligned = |
There was a problem hiding this comment.
This is not used anywhere.
| (reinterpret_cast<uintptr_t>(shared_mem) + TMA_SHMEM_ALIGNMENT - 1) & | ||
| ~(static_cast<uintptr_t>(TMA_SHMEM_ALIGNMENT - 1)); | ||
|
|
||
| T* in_sh = reinterpret_cast<T*>(aligned_shared); |
There was a problem hiding this comment.
You can make sure you get the necessary alignment with dynamically allocated shared memory like this:
extern __shared__ uint128_t shared_mem[];
T* in_sh = reinterpret_cast<T*>(shared_mem);
or:
extern __shared__ alignas(128) char shared_mem[];
Also, I think in_smem would be an easier-to-understand name.
| ((out_tile_elems * BUFFS_NUM + TMA_SHMEM_ALIGNMENT - 1) / | ||
| TMA_SHMEM_ALIGNMENT) * | ||
| TMA_SHMEM_ALIGNMENT; | ||
| const size_t smem_size = |
There was a problem hiding this comment.
It appears that the size of shared memory is static. I don't think you need to use dynamic shared memory in this case; you can ensure alignment with:
__shared__ alignas(128) T smem[SIZE];
Columnwise quantization with tma (mxfp8), bfloat16:
This PR:
Move get_swizzle_launch_args into the cu:: namespace for consistency.
TODO:
nvfp4 requires a separate columnwise kernel due to TMA tile size constraints. In the proposed kernel each thread processes a tile of size (N, M) and stores a transposed result. M is equal to group_size — 32 bytes for mxfp8, but only 8 bytes for nvfp4. Since TMA requires the innermost tile dimension to be at least 128 bits (16 bytes), for nvfp4 the kernel would need to load a larger tile and iterate over multiple groups.