/**
 * GGML CUDA Backend for PagedAttention
 *
 * This file provides the CUDA backend implementation for the GGML_OP_PAGED_ATTENTION operation.
 * It bridges GGML's operation framework with the PagedAttention CUDA kernels.
 *
 * NOTE: PagedAttention is currently experimental and only supported on CUDA.
 * MUSA support is disabled due to compiler compatibility issues.
 */

// PagedAttention is not yet supported on MUSA
#ifndef GGML_USE_MUSA

#include "common.cuh"
#include "paged-attention.cuh"
#include "paged-attention-backend.cuh"

// Extract parameters from GGML tensor
static void ggml_cuda_op_paged_attention_get_params(
    const ggml_tensor * dst,
    float * scale,
    int32_t * block_size) {

    const float * params = (const float *)dst->op_params;
    *scale = params[0];
    *block_size = (int32_t)params[1];
}
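
// NOTE(assumption): this layout mirrors how the graph-building side is expected to pack
// the parameters - both as 32-bit floats, block_size included. A hypothetical builder
// (not part of this file) would do roughly:
//
//     float params[2] = { scale, (float) block_size };
//     memcpy(result->op_params, params, sizeof(params));
//
// If block_size were written as an int32_t instead, the float cast above would read a
// reinterpreted bit pattern, so the reader and the writer of op_params must agree on
// this float-only layout.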

// Main CUDA backend function for PagedAttention
void ggml_cuda_op_paged_attention(
    ggml_backend_cuda_context & ctx,
    ggml_tensor * dst) {

    const ggml_tensor * q = dst->src[0];            // query
    const ggml_tensor * k_cache = dst->src[1];      // key cache (paged)
    const ggml_tensor * v_cache = dst->src[2];      // value cache (paged)
    const ggml_tensor * block_tables = dst->src[3]; // block tables
    const ggml_tensor * seq_lens = dst->src[4];     // sequence lengths
    const ggml_tensor * alibi_slopes = dst->src[5]; // optional ALiBi slopes (can be nullptr)

    // Extract parameters
    float scale;
    int32_t block_size;
    ggml_cuda_op_paged_attention_get_params(dst, &scale, &block_size);

    // Get tensor dimensions
    const int64_t head_size = q->ne[0];
    const int64_t n_heads = q->ne[1];
    const int64_t n_tokens = q->ne[2];
    const int64_t n_seqs = q->ne[3];

    const int64_t n_kv_heads = k_cache->ne[2];
    const int64_t num_blocks = k_cache->ne[0];
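    // Note: n_heads may be an integer multiple of n_kv_heads (grouped-query / multi-query
    // attention); the launchers are assumed to map each query head to its KV head by
    // dividing by the n_heads / n_kv_heads ratio.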

    const int64_t max_blocks_per_seq = block_tables->ne[0];

    // Validate tensor dimensions
    GGML_ASSERT(n_tokens > 0 && "Number of query tokens must be positive");
    GGML_ASSERT(n_seqs > 0 && "Number of sequences must be positive");
    GGML_ASSERT(num_blocks > 0 && "Number of KV cache blocks must be positive");
    GGML_ASSERT(max_blocks_per_seq > 0 && "Max blocks per sequence must be positive");

    // Sanity-check that the cache holds at least as many blocks as one sequence can reference
    // Note: this is only a coarse check - actual block usage depends on the per-sequence lengths
    GGML_ASSERT(num_blocks >= max_blocks_per_seq &&
                "Total number of blocks should be >= max blocks per sequence");

    // For PagedAttention, typically we have one query per sequence (decode mode)
    // or multiple queries per sequence (prefill mode)
    GGML_ASSERT(n_tokens <= n_seqs * 1024 &&
                "Number of tokens seems unusually large relative to batch size");

    // Get pointers
    void * out_ptr = dst->data;
    const void * q_ptr = q->data;
    const void * k_cache_ptr = k_cache->data;
    const void * v_cache_ptr = v_cache->data;
    const int32_t * block_tables_ptr = (const int32_t *)block_tables->data;
    const int32_t * seq_lens_ptr = (const int32_t *)seq_lens->data;

    // Debug: Check for null pointers
    GGML_ASSERT(out_ptr != nullptr && "Output pointer is null");
    GGML_ASSERT(q_ptr != nullptr && "Query pointer is null");
    GGML_ASSERT(k_cache_ptr != nullptr && "K cache pointer is null");
    GGML_ASSERT(v_cache_ptr != nullptr && "V cache pointer is null");
    GGML_ASSERT(block_tables_ptr != nullptr && "Block tables pointer is null");
    GGML_ASSERT(seq_lens_ptr != nullptr && "Sequence lengths pointer is null");

    // Get ALiBi slopes pointer if provided
    const float * alibi_slopes_ptr = nullptr;
    if (alibi_slopes != nullptr) {
        // ALiBi slopes should be a 1D tensor with one slope per attention head
        GGML_ASSERT(alibi_slopes->type == GGML_TYPE_F32 &&
                    "ALiBi slopes must be float32");
        GGML_ASSERT(alibi_slopes->ne[0] == n_heads &&
                    "ALiBi slopes tensor must have one value per head");
        alibi_slopes_ptr = (const float *)alibi_slopes->data;
    }
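    // If no slopes tensor was provided, alibi_slopes_ptr stays nullptr and the launchers
    // are expected to skip the ALiBi bias entirely.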

    // Calculate max sequence length (needed to decide V1 vs V2)
    int max_seq_len = 0;
    for (int i = 0; i < n_seqs; i++) {
        if (seq_lens_ptr[i] > max_seq_len) {
            max_seq_len = seq_lens_ptr[i];
        }
    }
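    // NOTE(assumption): the loop above reads seq_lens on the host, which relies on the
    // sequence-lengths tensor living in host-accessible memory. If seq_lens is allocated
    // in device memory, it would need to be copied back (or reduced on the device)
    // before this point.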

    // Get CUDA stream
    cudaStream_t stream = ctx.stream();

    // Decide whether to use V1 or V2
    const bool use_v1 = ggml_cuda_paged_attention::should_use_v1(
        max_seq_len, n_seqs, n_heads);
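    // Rough intent, mirroring the original PagedAttention design: V1 handles each sequence
    // in a single kernel launch and suits shorter contexts, while V2 splits long sequences
    // into partitions plus a reduction pass - which is why the V2 launcher below also takes
    // the CUDA memory pool for its temporary partition buffers. The exact cutoff lives in
    // should_use_v1().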

    // Launch appropriate kernel
    if (use_v1) {
        ggml_cuda_paged_attention::paged_attention_v1_launcher(
            out_ptr,
            q_ptr,
            k_cache_ptr,
            v_cache_ptr,
            n_seqs,
            n_heads,
            n_kv_heads,
            head_size,
            block_size,
            max_blocks_per_seq,
            block_tables_ptr,
            seq_lens_ptr,
            max_seq_len,
            scale,
            alibi_slopes_ptr,
            q->type,
            k_cache->type,
            stream);
    } else {
        ggml_cuda_paged_attention::paged_attention_v2_launcher(
            out_ptr,
            q_ptr,
            k_cache_ptr,
            v_cache_ptr,
            n_seqs,
            n_heads,
            n_kv_heads,
            head_size,
            block_size,
            max_blocks_per_seq,
            block_tables_ptr,
            seq_lens_ptr,
            max_seq_len,
            scale,
            alibi_slopes_ptr,
            q->type,
            k_cache->type,
            ctx.pool(),
            stream);
    }

    // Check for errors
    CUDA_CHECK(cudaGetLastError());
}

// Check whether PagedAttention is supported for the given configuration
bool ggml_cuda_can_paged_attention(const ggml_tensor * dst) {
    const ggml_tensor * q = dst->src[0];
    const ggml_tensor * k_cache = dst->src[1];

    // Check data types
    if (q->type != GGML_TYPE_F16 && q->type != GGML_TYPE_F32) {
        return false;
    }

    if (k_cache->type != GGML_TYPE_F16 && k_cache->type != GGML_TYPE_F32) {
        return false;
    }

    // Check head size is supported
    const int64_t head_size = q->ne[0];
    const int supported_head_sizes[] = {32, 64, 80, 96, 112, 120, 128, 192, 256};
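    // This list is assumed to mirror the head sizes the PagedAttention kernels are
    // instantiated for; adding a new size here requires a matching kernel instantiation.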
    bool head_size_supported = false;

    for (int hs : supported_head_sizes) {
        if (head_size == hs) {
            head_size_supported = true;
            break;
        }
    }

    if (!head_size_supported) {
        return false;
    }

    // Extract block size and check it's supported
    float scale;
    int32_t block_size;
    ggml_cuda_op_paged_attention_get_params(dst, &scale, &block_size);

    if (block_size != 8 && block_size != 16 && block_size != 32) {
        return false;
    }

    return true;
}
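
// Minimal sketch of the intended call site (assumption - the exact dispatch code in the
// CUDA backend is not part of this file): the backend's supports_op hook would consult
// this check before claiming the op, along the lines of:
//
//     case GGML_OP_PAGED_ATTENTION:
//         return ggml_cuda_can_paged_attention(op);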

#else // GGML_USE_MUSA

// Stub implementations for MUSA (PagedAttention not yet supported)
#include "common.cuh"

void ggml_cuda_op_paged_attention(
    ggml_backend_cuda_context & ctx,
    ggml_tensor * dst) {
    GGML_UNUSED(ctx);
    GGML_UNUSED(dst);
    GGML_ABORT("PagedAttention is not yet supported on MUSA");
}

bool ggml_cuda_can_paged_attention(const ggml_tensor * dst) {
    GGML_UNUSED(dst);
    return false;
}

#endif // GGML_USE_MUSA