
Commit 19466fb

Add PagedAttention support (experimental, CUDA only)
Implement the PagedAttention algorithm for memory-efficient KV cache management. This feature reduces memory fragmentation by storing the KV cache in fixed-size blocks (similar to virtual memory paging) and enables efficient memory sharing between sequences through copy-on-write semantics. The implementation is experimental and disabled by default; enable it with the --pagedattention flag.

Signed-off-by: Eric Curtin <eric.curtin@docker.com>
1 parent 7f8ef50 commit 19466fb

25 files changed: 3,229 additions, 15 deletions
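
The commit message above summarizes the core idea: the KV cache is carved into fixed-size blocks, and each sequence keeps a block table that maps its logical positions to physical blocks, much like a page table. The helper below is a minimal sketch of that address translation (illustrative only, not part of this commit; the name `slot_for_token` is hypothetical), using the shapes the new API exposes (`block_tables`, `seq_lens`, `block_size`, and the flat slot indices consumed by `ggml_paged_cpy`).

```cpp
// Illustrative sketch (not from this commit): translating a token's logical
// position within a sequence into a physical slot of a paged KV cache.
#include <cstdint>
#include <vector>

// block_table is one row of block_tables [n_seqs, max_blocks_per_seq]:
// logical block index -> physical block index.
static int32_t slot_for_token(const std::vector<int32_t> & block_table,
                              int32_t pos, int32_t block_size) {
    const int32_t logical_block  = pos / block_size;           // which block the token falls into
    const int32_t offset         = pos % block_size;           // position inside that block
    const int32_t physical_block = block_table[logical_block]; // indirection through the table
    return physical_block * block_size + offset;               // flat cache slot (cf. slot_idxs)
}
```

Because blocks are allocated on demand and reached only through this indirection, sequences that share a prefix can reference the same physical blocks until one of them writes, which is where the copy-on-write sharing mentioned above comes from.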

common/arg.cpp

Lines changed: 8 additions & 0 deletions
@@ -1017,6 +1017,14 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                 string_format("error: unkown value for --flash-attn: '%s'\n", value.c_str()));
             }
         }).set_env("LLAMA_ARG_FLASH_ATTN"));
+    add_opt(common_arg(
+        {"--pagedattention"},
+        "enable PagedAttention for KV cache (experimental, requires CUDA)",
+        [](common_params & params) {
+            fprintf(stderr, "DEBUG: --pagedattention flag parsed, setting params.use_paged_attention = true\n");
+            params.use_paged_attention = true;
+        }
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}));
     add_opt(common_arg(
         {"-p", "--prompt"}, "PROMPT",
         "prompt to start generation with; for system message, use -sys",

common/common.cpp

Lines changed: 4 additions & 0 deletions
@@ -1249,6 +1249,9 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
 struct llama_context_params common_context_params_to_llama(const common_params & params) {
     auto cparams = llama_context_default_params();
 
+    fprintf(stderr, "DEBUG common_context_params_to_llama: params.use_paged_attention = %s\n",
+            params.use_paged_attention ? "true" : "false");
+
     cparams.n_ctx     = params.n_ctx;
     cparams.n_seq_max = params.n_parallel;
     cparams.n_batch   = params.n_batch;
@@ -1275,6 +1278,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.op_offload = !params.no_op_offload;
     cparams.swa_full   = params.swa_full;
     cparams.kv_unified = params.kv_unified;
+    cparams.use_paged_attention = params.use_paged_attention;
 
     cparams.type_k = params.cache_type_k;
     cparams.type_v = params.cache_type_v;

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -406,6 +406,7 @@ struct common_params {
     bool ctx_shift  = false; // context shift on infinite text generation
     bool swa_full   = false; // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
     bool kv_unified = false; // enable unified KV cache
+    bool use_paged_attention = false; // enable PagedAttention (experimental, requires CUDA)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool use_mmap = true; // use mmap for faster loads

ggml/include/ggml.h

Lines changed: 28 additions & 0 deletions
@@ -537,6 +537,8 @@ extern "C" {
 
     GGML_OP_FLASH_ATTN_EXT,
     GGML_OP_FLASH_ATTN_BACK,
+    GGML_OP_PAGED_ATTENTION,
+    GGML_OP_PAGED_CPY,
     GGML_OP_SSM_CONV,
     GGML_OP_SSM_SCAN,
     GGML_OP_WIN_PART,
@@ -2312,6 +2314,32 @@ extern "C" {
         struct ggml_tensor * a,
         struct ggml_tensor * sinks);
 
+    // PagedAttention (paged KV cache attention)
+    // q:            [n_tokens, n_heads, head_size]
+    // k_cache:      [num_blocks, block_size, n_kv_heads, head_size] (paged)
+    // v_cache:      [num_blocks, block_size, n_kv_heads, head_size] (paged)
+    // block_tables: [n_seqs, max_blocks_per_seq] (int32)
+    // seq_lens:     [n_seqs] (int32)
+    GGML_API struct ggml_tensor * ggml_paged_attention(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k_cache,
+            struct ggml_tensor  * v_cache,
+            struct ggml_tensor  * block_tables,
+            struct ggml_tensor  * seq_lens,
+            int32_t               block_size,
+            float                 scale);
+
+    // Copy K/V data to paged cache blocks (similar to vLLM's reshape_and_cache)
+    // kv_cur:    [head_size, n_heads, n_tokens] - K or V current
+    // kv_cache:  [num_blocks, n_kv_heads, head_size, block_size] - paged K or V cache
+    // slot_idxs: [n_tokens] (int32) - cache slot index for each token
+    GGML_API struct ggml_tensor * ggml_paged_cpy(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * kv_cur,
+            struct ggml_tensor  * kv_cache,
+            struct ggml_tensor  * slot_idxs);
+
     // TODO: needs to be adapted to ggml_flash_attn_ext
     GGML_API struct ggml_tensor * ggml_flash_attn_back(
             struct ggml_context * ctx,

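The two declarations above are enough to sketch how a caller might wire the new operations into a graph. The following is a hedged example, not code from this commit: tensor sizes are arbitrary, and the dimension order follows how the CUDA backend reads the tensors (`q->ne[0] = head_size`, `q->ne[3] = n_seqs`, `k_cache->ne[0] = num_blocks`, `block_tables->ne[0] = max_blocks_per_seq`).

```cpp
// Hedged sketch of building a GGML_OP_PAGED_ATTENTION node; sizes are examples only.
#include <math.h>
#include "ggml.h"

static struct ggml_cgraph * build_paged_attention_graph(struct ggml_context * ctx) {
    const int64_t head_size = 128, n_heads = 32, n_kv_heads = 8;
    const int64_t n_tokens = 1, n_seqs = 1;                       // single-token decode step
    const int64_t block_size = 16, num_blocks = 256, max_blocks_per_seq = 64;

    // query and paged K/V caches
    struct ggml_tensor * q       = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, head_size, n_heads, n_tokens, n_seqs);
    struct ggml_tensor * k_cache = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, num_blocks, block_size, n_kv_heads, head_size);
    struct ggml_tensor * v_cache = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, num_blocks, block_size, n_kv_heads, head_size);

    // per-sequence block tables and current lengths
    struct ggml_tensor * block_tables = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, max_blocks_per_seq, n_seqs);
    struct ggml_tensor * seq_lens     = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_seqs);

    const float scale = 1.0f / sqrtf((float) head_size);

    struct ggml_tensor * out = ggml_paged_attention(ctx, q, k_cache, v_cache,
                                                    block_tables, seq_lens,
                                                    (int32_t) block_size, scale);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);
    return gf;
}
```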
ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 4 additions & 0 deletions
@@ -2062,6 +2062,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
             {
                 // nop
             } break;
+        case GGML_OP_PAGED_ATTENTION:
+            {
+                // nop (CUDA-only operation)
+            } break;
         case GGML_OP_COUNT:
             {
                 GGML_ABORT("fatal error");

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 8 additions & 0 deletions
@@ -32,6 +32,8 @@
 #include "ggml-cuda/opt-step-sgd.cuh"
 #include "ggml-cuda/out-prod.cuh"
 #include "ggml-cuda/pad.cuh"
+#include "ggml-cuda/paged-attention-backend.cuh"
+#include "ggml-cuda/paged-cpy.cuh"
 #include "ggml-cuda/pool2d.cuh"
 #include "ggml-cuda/quantize.cuh"
 #include "ggml-cuda/rope.cuh"
@@ -2719,6 +2721,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_OPT_STEP_SGD:
             ggml_cuda_opt_step_sgd(ctx, dst);
             break;
+        case GGML_OP_PAGED_ATTENTION:
+            ggml_cuda_op_paged_attention(ctx, dst);
+            break;
+        case GGML_OP_PAGED_CPY:
+            ggml_cuda_op_paged_cpy(ctx, dst);
+            break;
         case GGML_OP_SOLVE_TRI:
             ggml_cuda_op_solve_tri(ctx, dst);
             break;
Lines changed: 223 additions & 0 deletions
@@ -0,0 +1,223 @@
/**
 * GGML CUDA Backend for PagedAttention
 *
 * This file provides the CUDA backend implementation for the GGML_OP_PAGED_ATTENTION operation.
 * It bridges GGML's operation framework with the PagedAttention CUDA kernels.
 *
 * NOTE: PagedAttention is currently experimental and only supported on CUDA.
 * MUSA support is disabled due to compiler compatibility issues.
 */

// PagedAttention is not yet supported on MUSA
#ifndef GGML_USE_MUSA

#include "common.cuh"
#include "paged-attention.cuh"
#include "paged-attention-backend.cuh"

// Extract parameters from GGML tensor
static void ggml_cuda_op_paged_attention_get_params(
        const ggml_tensor * dst,
        float             * scale,
        int32_t           * block_size) {

    const float * params = (const float *) dst->op_params;
    *scale      = params[0];
    *block_size = (int32_t) params[1];
}

// Main CUDA backend function for PagedAttention
void ggml_cuda_op_paged_attention(
        ggml_backend_cuda_context & ctx,
        ggml_tensor               * dst) {

    const ggml_tensor * q            = dst->src[0]; // query
    const ggml_tensor * k_cache      = dst->src[1]; // key cache (paged)
    const ggml_tensor * v_cache      = dst->src[2]; // value cache (paged)
    const ggml_tensor * block_tables = dst->src[3]; // block tables
    const ggml_tensor * seq_lens     = dst->src[4]; // sequence lengths
    const ggml_tensor * alibi_slopes = dst->src[5]; // optional ALiBi slopes (can be nullptr)

    // Extract parameters
    float   scale;
    int32_t block_size;
    ggml_cuda_op_paged_attention_get_params(dst, &scale, &block_size);

    // Get tensor dimensions
    const int64_t head_size = q->ne[0];
    const int64_t n_heads   = q->ne[1];
    const int64_t n_tokens  = q->ne[2];
    const int64_t n_seqs    = q->ne[3];

    const int64_t n_kv_heads = k_cache->ne[2];
    const int64_t num_blocks = k_cache->ne[0];

    const int64_t max_blocks_per_seq = block_tables->ne[0];

    // Validate tensor dimensions
    GGML_ASSERT(n_tokens > 0 && "Number of query tokens must be positive");
    GGML_ASSERT(n_seqs > 0 && "Number of sequences must be positive");
    GGML_ASSERT(num_blocks > 0 && "Number of KV cache blocks must be positive");
    GGML_ASSERT(max_blocks_per_seq > 0 && "Max blocks per sequence must be positive");

    // Validate that we have enough blocks available
    // Note: This is a soft check - actual usage depends on sequence lengths
    GGML_ASSERT(num_blocks >= max_blocks_per_seq &&
                "Total number of blocks should be >= max blocks per sequence");

    // For PagedAttention, typically we have one query per sequence (decode mode)
    // or multiple queries per sequence (prefill mode)
    GGML_ASSERT(n_tokens <= n_seqs * 1024 &&
                "Number of tokens seems unusually large relative to batch size");

    // Get pointers
    void          * out_ptr          = dst->data;
    const void    * q_ptr            = q->data;
    const void    * k_cache_ptr      = k_cache->data;
    const void    * v_cache_ptr      = v_cache->data;
    const int32_t * block_tables_ptr = (const int32_t *) block_tables->data;
    const int32_t * seq_lens_ptr     = (const int32_t *) seq_lens->data;

    // Debug: Check for null pointers
    GGML_ASSERT(out_ptr          != nullptr && "Output pointer is null");
    GGML_ASSERT(q_ptr            != nullptr && "Query pointer is null");
    GGML_ASSERT(k_cache_ptr      != nullptr && "K cache pointer is null");
    GGML_ASSERT(v_cache_ptr      != nullptr && "V cache pointer is null");
    GGML_ASSERT(block_tables_ptr != nullptr && "Block tables pointer is null");
    GGML_ASSERT(seq_lens_ptr     != nullptr && "Sequence lengths pointer is null");

    // Get ALiBi slopes pointer if provided
    const float * alibi_slopes_ptr = nullptr;
    if (alibi_slopes != nullptr) {
        // ALiBi slopes should be a 1D tensor with one slope per attention head
        GGML_ASSERT(alibi_slopes->type == GGML_TYPE_F32 &&
                    "ALiBi slopes must be float32");
        GGML_ASSERT(alibi_slopes->ne[0] == n_heads &&
                    "ALiBi slopes tensor must have one value per head");
        alibi_slopes_ptr = (const float *) alibi_slopes->data;
    }

    // Calculate max sequence length (needed to decide V1 vs V2)
    int max_seq_len = 0;
    for (int i = 0; i < n_seqs; i++) {
        if (seq_lens_ptr[i] > max_seq_len) {
            max_seq_len = seq_lens_ptr[i];
        }
    }

    // Get CUDA stream
    cudaStream_t stream = ctx.stream();

    // Decide whether to use V1 or V2
    const bool use_v1 = ggml_cuda_paged_attention::should_use_v1(
        max_seq_len, n_seqs, n_heads);

    // Launch appropriate kernel
    if (use_v1) {
        ggml_cuda_paged_attention::paged_attention_v1_launcher(
            out_ptr,
            q_ptr,
            k_cache_ptr,
            v_cache_ptr,
            n_seqs,
            n_heads,
            n_kv_heads,
            head_size,
            block_size,
            max_blocks_per_seq,
            block_tables_ptr,
            seq_lens_ptr,
            max_seq_len,
            scale,
            alibi_slopes_ptr,
            q->type,
            k_cache->type,
            stream);
    } else {
        ggml_cuda_paged_attention::paged_attention_v2_launcher(
            out_ptr,
            q_ptr,
            k_cache_ptr,
            v_cache_ptr,
            n_seqs,
            n_heads,
            n_kv_heads,
            head_size,
            block_size,
            max_blocks_per_seq,
            block_tables_ptr,
            seq_lens_ptr,
            max_seq_len,
            scale,
            alibi_slopes_ptr,
            q->type,
            k_cache->type,
            ctx.pool(),
            stream);
    }

    // Check for errors
    CUDA_CHECK(cudaGetLastError());
}

// Check if PagedAttention is supported for given configuration
bool ggml_cuda_can_paged_attention(const ggml_tensor * dst) {
    const ggml_tensor * q       = dst->src[0];
    const ggml_tensor * k_cache = dst->src[1];

    // Check data types
    if (q->type != GGML_TYPE_F16 && q->type != GGML_TYPE_F32) {
        return false;
    }

    if (k_cache->type != GGML_TYPE_F16 && k_cache->type != GGML_TYPE_F32) {
        return false;
    }

    // Check head size is supported
    const int64_t head_size = q->ne[0];
    const int supported_head_sizes[] = {32, 64, 80, 96, 112, 120, 128, 192, 256};
    bool head_size_supported = false;

    for (int hs : supported_head_sizes) {
        if (head_size == hs) {
            head_size_supported = true;
            break;
        }
    }

    if (!head_size_supported) {
        return false;
    }

    // Extract block size and check it's supported
    float   scale;
    int32_t block_size;
    ggml_cuda_op_paged_attention_get_params(dst, &scale, &block_size);

    if (block_size != 8 && block_size != 16 && block_size != 32) {
        return false;
    }

    return true;
}

#else // GGML_USE_MUSA

// Stub implementations for MUSA (PagedAttention not yet supported)
#include "common.cuh"

void ggml_cuda_op_paged_attention(
        ggml_backend_cuda_context & ctx,
        ggml_tensor               * dst) {
    GGML_UNUSED(ctx);
    GGML_UNUSED(dst);
    GGML_ABORT("PagedAttention is not yet supported on MUSA");
}

// must match the declaration in the backend header (ggml_cuda_can_paged_attention)
bool ggml_cuda_can_paged_attention(const ggml_tensor * dst) {
    GGML_UNUSED(dst);
    return false;
}

#endif // GGML_USE_MUSA
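
One detail worth noting from the backend above: `ggml_cuda_op_paged_attention_get_params()` reads `scale` and `block_size` back from `dst->op_params` as two consecutive floats. The graph-building side is not shown on this page, but whatever constructs the node has to pack the parameters the same way; the snippet below is a hedged sketch of that mirror image (the helper name is hypothetical, not from this commit).

```cpp
// Hypothetical mirror of ggml_cuda_op_paged_attention_get_params(): store scale
// and block_size as two floats in op_params, which is exactly how the CUDA
// backend reads them back. Block sizes of 8/16/32 are exactly representable as float.
#include <string.h>
#include "ggml.h"

static void paged_attention_set_op_params(struct ggml_tensor * node, float scale, int32_t block_size) {
    const float params[2] = { scale, (float) block_size };
    memcpy(node->op_params, params, sizeof(params)); // fits well within GGML_MAX_OP_PARAMS
}
```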
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
#include "common.cuh"

void ggml_cuda_op_paged_attention(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

bool ggml_cuda_can_paged_attention(const ggml_tensor * dst);
