18 changes: 9 additions & 9 deletions dlinfer/ops/llm.py
@@ -110,8 +110,8 @@ def prefill_attention(
     value: Tensor,
     q_start_loc: Tensor,
     q_seq_len: Tensor,
-    max_q_seq_len: int,
-    num_q_heads: int,
+    max_q_seq_len: Tensor,
+    num_q_heads: Tensor,
     num_kv_heads: int,
     attn_mask: Sequence[Optional[Tensor]],
     softmax_scale: Optional[float],
@@ -128,7 +128,7 @@ def prefill_attention(
         value (Tensor): The value tensor.
         q_start_loc (Tensor): The start location of each query sequence.
         q_seq_len (Tensor): The length of each query sequence.
-        max_q_seq_len (int): The maximum length of any query sequence.
+        max_q_seq_len (Tensor): The maximum length of any query sequence.
         num_q_heads (int): The number of query heads.
         num_kv_heads (int): The number of key/value heads.
         attn_mask (Sequence[Optional[Tensor]]): A sequence of optional attention masks, one for each batch.
@@ -223,7 +223,7 @@ def paged_decode_attention(
     block_table: Tensor,
     block_size: int,
     kv_seq_len: Tensor,
-    max_kv_seq_len: int,
+    max_kv_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     softmax_scale: Optional[float],
@@ -245,7 +245,7 @@ def paged_decode_attention(
             block in the key/value cache.
         block_size (int): The size of each block in the input sequence.
         kv_seq_len (Tensor): The length of each key/value sequence.
-        max_kv_seq_len (int): The maximum length of any key/value sequence.
+        max_kv_seq_len (Tensor): The maximum length of any key/value sequence.
         num_q_heads (int): The number of query heads.
         num_kv_heads (int): The number of key/value heads.
         softmax_scale (Optional[float]): The scale factor to apply to the attention logits before the softmax.
@@ -301,8 +301,8 @@ def paged_prefill_attention(
     q_seq_len: Tensor,
     kv_seq_len: Tensor,
     cu_seq_lens_kv: Tensor,
-    max_q_seq_len: int,
-    max_kv_seq_len: int,
+    max_q_seq_len: Tensor,
+    max_kv_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     attn_mask: Sequence[Optional[Tensor]],
@@ -328,8 +328,8 @@ def paged_prefill_attention(
         q_seq_len (Tensor): The length of each query sequence.
         kv_seq_len (Tensor): The length of each key/value sequence.
         cu_seq_lens_kv (Tensor): The cumulative sequence lengths of the key/value sequences.
-        max_q_seq_len (int): The maximum length of any query sequence.
-        max_kv_seq_len (int): The maximum length of any key/value sequence.
+        max_q_seq_len (Tensor): The maximum length of any query sequence.
+        max_kv_seq_len (Tensor): The maximum length of any key/value sequence.
         num_q_heads (int): The number of query heads.
         num_kv_heads (int): The number of key/value heads.
         attn_mask (Sequence[Optional[Tensor]]): A sequence of optional attention masks, one for each batch.
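With this change, max_q_seq_len and max_kv_seq_len are Tensors rather than Python ints, so callers can keep the maximum length on-device instead of synchronizing it to a host scalar. A minimal caller-side sketch; the use of torch.max/torch.cumsum to build these arguments is an illustrative assumption, not code from this diff:

    import torch

    # Per-sequence query lengths for a batch of 3 prompts (illustrative values).
    q_seq_len = torch.tensor([7, 12, 5])

    # Start offset of each sequence in the packed query tensor: [0, 7, 19].
    q_start_loc = torch.cumsum(q_seq_len, dim=0) - q_seq_len

    # Before this PR: max_q_seq_len = int(q_seq_len.max())  (host-side int)
    # After this PR:  the 0-dim Tensor is passed through as-is.
    max_q_seq_len = q_seq_len.max()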
8 changes: 4 additions & 4 deletions dlinfer/vendor/ascend/torch_npu_ops.py
@@ -72,7 +72,7 @@ def prefill_attention(
     value: Tensor,
     q_start_loc: Tensor,
     q_seq_len: Tensor,
-    max_q_seq_len: int,
+    max_q_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     attn_mask: Sequence[Optional[Tensor]],
@@ -174,7 +174,7 @@ def paged_decode_attention(
     block_table: Optional[Tensor],
     block_size: int,
     kv_seq_len: Tensor,
-    max_kv_seq_len: int,
+    max_kv_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     softmax_scale: Optional[float],
@@ -236,8 +236,8 @@ def paged_prefill_attention(
     q_seq_len: Tensor,
     kv_seq_len: Tensor,
     cu_seq_lens_kv: Tensor,
-    max_q_seq_len: int,
-    max_kv_seq_len: int,
+    max_q_seq_len: Tensor,
+    max_kv_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     attn_mask: Sequence[Optional[Tensor]],
8 changes: 4 additions & 4 deletions dlinfer/vendor/camb/camb_ops.py
@@ -145,7 +145,7 @@ def prefill_attention(
     value: Tensor,
     q_start_loc: Tensor,
     q_seq_len: Tensor,
-    max_q_seq_len: int,
+    max_q_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     attn_mask: Sequence[Optional[Tensor]],
@@ -195,7 +195,7 @@ def paged_decode_attention(
     block_table: Optional[Tensor],
     block_size: int,
     kv_seq_len: Tensor,
-    max_kv_seq_len: int,
+    max_kv_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     softmax_scale: Optional[float],
@@ -260,8 +260,8 @@ def paged_prefill_attention(
     q_seq_len: Tensor,
     kv_seq_len: Tensor,
     cu_seq_lens_kv: Tensor,
-    max_q_seq_len: int,
-    max_kv_seq_len: int,
+    max_q_seq_len: Tensor,
+    max_kv_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     attn_mask: Sequence[Optional[Tensor]],
1 change: 1 addition & 0 deletions dlinfer/vendor/maca/csrc/CMakeLists.txt
@@ -17,6 +17,7 @@ find_library(torch_python_LIBRARY torch_python PATHS
 
 set(DLINFER_VLLM_SRC
     "pybind.cpp"
+    "attention/attention_kernels.cu"
    "pos_encoding_kernels.cu"
    "moe_align_block_size_kernels.cu"
    "moe/topk_softmax_kernels.cu"
7 changes: 4 additions & 3 deletions dlinfer/vendor/maca/csrc/attention/attention_kernels.cu
@@ -1463,7 +1463,7 @@ template <typename T, typename CACHE_T, int BLOCK_SIZE,
 void paged_attention_v1_launcher(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, torch::Tensor& max_seq_len,
     const c10::optional<torch::Tensor>& alibi_slopes, float k_scale,
     float v_scale, const int tp_rank, const int blocksparse_local_blocks,
     const int blocksparse_vert_stride, const int blocksparse_block_size,
@@ -1494,8 +1494,9 @@ void paged_attention_v1_launcher(
 
   constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
   int padded_max_seq_len =
-      DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+      DIVIDE_ROUND_UP(512, BLOCK_SIZE) * BLOCK_SIZE;
   int logits_size = padded_max_seq_len * sizeof(float);
+
   int V_VEC_SIZE = 16 / sizeof(CACHE_T);
   int NUM_V_VECS_PER_THREAD = head_size / V_VEC_SIZE;
   int NUM_COLS_PER_ITER = MAX(WARP_SIZE / NUM_V_VECS_PER_THREAD, 1);
@@ -1590,7 +1591,7 @@ void paged_attention_v1(
     double scale,
     torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
     torch::Tensor& seq_lens,      // [num_seqs]
-    int64_t block_size, int64_t max_seq_len,
+    int64_t block_size, torch::Tensor& max_seq_len,
     const c10::optional<torch::Tensor>& alibi_slopes,
     const std::string& kv_cache_dtype, double k_scale, double v_scale,
     const int64_t tp_rank, const int64_t blocksparse_local_blocks,
2 changes: 1 addition & 1 deletion dlinfer/vendor/maca/csrc/ops.h
@@ -6,7 +6,7 @@
 #include <optional>
 
 void paged_attention_v1(torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
-                        torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, int64_t max_seq_len,
+                        torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, torch::Tensor& max_seq_len,
                         const c10::optional<torch::Tensor>& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale,
                         const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride,
                         const int64_t blocksparse_block_size, const int64_t blocksparse_head_sliding_step);
15 changes: 15 additions & 0 deletions dlinfer/vendor/maca/csrc/pybind.cpp
@@ -18,6 +18,21 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   // vLLM custom ops
   pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
 
+  // Attention ops
+  // Compute the attention between an input query and the cached
+  // keys/values using PagedAttention.
+  ops.def("paged_attention_v1",
+          &paged_attention_v1,
+          "paged_attention_v1("
+          " Tensor! out, Tensor query, Tensor key_cache,"
+          " Tensor value_cache, int num_kv_heads, float scale,"
+          " Tensor block_tables, Tensor seq_lens, int block_size,"
+          " Tensor max_seq_len, Tensor? alibi_slopes,"
+          " str kv_cache_dtype, float k_scale, float v_scale,"
+          " int tp_rank, int blocksparse_local_blocks,"
+          " int blocksparse_vert_stride, int blocksparse_block_size,"
+          " int blocksparse_head_sliding_step) -> ()");
+
   // Rotary embedding
   // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
   ops.def("rotary_embedding",
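Together with the CMakeLists change that compiles attention/attention_kernels.cu into the extension, this binding is what the maca_ops.py change below calls. A quick smoke-check sketch; the import path is an assumption inferred from how maca_ops.py refers to the extension, and the exact __doc__ contents depend on pybind11:

    # Verify the new binding is present after rebuilding the extension.
    from dlinfer.vendor.maca import maca_ext_ops  # assumed import path

    assert hasattr(maca_ext_ops, "paged_attention_v1")
    # Should include the schema string registered above.
    print(maca_ext_ops.paged_attention_v1.__doc__)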
10 changes: 5 additions & 5 deletions dlinfer/vendor/maca/maca_ops.py
@@ -102,7 +102,7 @@ def prefill_attention(
     value: Tensor,
     q_start_loc: Tensor,
     q_seq_len: Tensor,
-    max_q_seq_len: int,
+    max_q_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     attn_mask: Sequence[Optional[Tensor]],
@@ -207,7 +207,7 @@ def paged_decode_attention(
     block_table: Optional[Tensor],
     block_size: int,
     kv_seq_len: Tensor,
-    max_kv_seq_len: int,
+    max_kv_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     softmax_scale: Optional[float],
@@ -225,7 +225,7 @@ def paged_decode_attention(
     num_kv_heads = value_cache.size(1)
     block_size = value_cache.size(2)
     output = torch.empty_like(query)
-    custom_ops.paged_attention_v1(
+    maca_ext_ops.paged_attention_v1(
         output,
         query,
         key_cache,
@@ -263,8 +263,8 @@ def paged_prefill_attention(
     q_seq_len: Tensor,
     kv_seq_len: Tensor,
     cu_seq_lens_kv: Tensor,
-    max_q_seq_len: int,
-    max_kv_seq_len: int,
+    max_q_seq_len: Tensor,
+    max_kv_seq_len: Tensor,
     num_q_heads: int,
     num_kv_heads: int,
     attn_mask: Sequence[Optional[Tensor]],
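In paged_decode_attention above, num_kv_heads and block_size are read off the value cache's shape (value_cache.size(1) and value_cache.size(2)). A small sketch of the cache layout this implies; only dimensions 1 and 2 are pinned down by this diff, the leading num_blocks and trailing head_size dimensions are illustrative assumptions:

    import torch

    num_blocks, num_kv_heads, block_size, head_size = 128, 8, 16, 64  # illustrative sizes
    value_cache = torch.empty(num_blocks, num_kv_heads, block_size, head_size)

    assert value_cache.size(1) == num_kv_heads  # what the wrapper reads for num_kv_heads
    assert value_cache.size(2) == block_size    # what the wrapper reads for block_size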