@@ -119,7 +119,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
 
     const int *cache_indices = params.cache_indices_ptr == nullptr ? nullptr
                                 : reinterpret_cast<int *>(params.cache_indices_ptr);
-    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
+    const int cache_index = cache_indices == nullptr ? batch_id : cache_indices[batch_id];
     // cache_index == params.pad_slot_id is defined as padding, so we exit early
     if (cache_index == params.pad_slot_id) {
         return;
@@ -133,9 +133,18 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
     input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
-    typename Ktraits::state_t *ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
-        cache_index * params.ssm_states_batch_stride +
-        dim_id * kNRows * params.ssm_states_dim_stride;
+
+    typename Ktraits::state_t *ssm_states;
+    if (params.cache_enabled) {
+        // APC mode: ssm_states points to the base, we'll use absolute cache slots later
+        ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
+            dim_id * kNRows * params.ssm_states_dim_stride;
+    } else {
+        // Non-APC mode: offset by cache_index as before
+        ssm_states = reinterpret_cast<typename Ktraits::state_t *>(params.ssm_states_ptr) +
+            cache_index * params.ssm_states_batch_stride +
+            dim_id * kNRows * params.ssm_states_dim_stride;
+    }
 
     float D_val[kNRows] = {0};
     if (params.D_ptr != nullptr) {
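A minimal host-side sketch of the two addressing schemes this branch selects between (plain C++ with made-up stride values, not the kernel's real types): the non-APC path folds cache_index into the base pointer once, while the APC path keeps only the dim offset in the pointer and adds an absolute cache slot at each load or store.

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Illustrative strides and indices only; real values come from SSMParamsBase.
        const size_t batch_stride = 4096, dim_stride = 16, dstate_stride = 1;
        const size_t cache_index = 3, dim_id = 2, kNRows = 1, state_idx = 5;

        // Non-APC: the slot (cache_index) is baked into the base pointer up front.
        size_t non_apc_elem = cache_index * batch_stride
                            + dim_id * kNRows * dim_stride
                            + state_idx * dstate_stride;

        // APC: the base keeps only the dim offset; a cache slot is applied per access.
        size_t apc_base = dim_id * kNRows * dim_stride;
        size_t apc_elem = apc_base + cache_index * batch_stride + state_idx * dstate_stride;

        // When the slot chosen later equals cache_index, both reach the same element.
        std::printf("%zu == %zu\n", non_apc_elem, apc_elem);
        return 0;
    }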
@@ -159,7 +168,22 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     // }
 
     constexpr int kChunkSize = kNThreads * kNItems;
-    const int n_chunks = (seqlen + 2048 - 1) / 2048;
+
+    // Use block_size for chunking when APC is enabled, otherwise use 2048 for backwards compatibility
+    const int iteration_chunk_size = params.cache_enabled ? params.block_size : 2048;
+    const int n_chunks = (seqlen + iteration_chunk_size - 1) / iteration_chunk_size;
+
+    const int *batch_cache_indices = cache_indices != nullptr ?
+        cache_indices + batch_id * params.cache_indices_stride : nullptr;
+    const int *block_idx_first_scheduled = params.block_idx_first_scheduled_token_ptr != nullptr ?
+        reinterpret_cast<const int *>(params.block_idx_first_scheduled_token_ptr) : nullptr;
+    const int *block_idx_last_scheduled = params.block_idx_last_scheduled_token_ptr != nullptr ?
+        reinterpret_cast<const int *>(params.block_idx_last_scheduled_token_ptr) : nullptr;
+    const int *initial_state_idx = params.initial_state_idx_ptr != nullptr ?
+        reinterpret_cast<const int *>(params.initial_state_idx_ptr) : nullptr;
+
+    const size_t load_cache_slot = params.cache_enabled && batch_cache_indices != nullptr ? batch_cache_indices[initial_state_idx[batch_id]] : cache_index;
+
     for (int chunk = 0; chunk < n_chunks; ++chunk) {
         input_t u_vals[kNRows][kNItems], delta_vals_load[kNRows][kNItems];
 
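A quick check of the ceil-division above (standalone sketch, the example lengths are assumptions): with APC enabled each loop iteration covers one cache block of tokens, so the chunk count is the number of blocks the sequence spans.

    #include <cassert>

    // Number of chunks needed to cover seqlen at a given chunk width.
    constexpr int ceil_div(int seqlen, int chunk) { return (seqlen + chunk - 1) / chunk; }

    int main() {
        assert(ceil_div(2500, 1024) == 3);  // APC with 1024-token blocks: last chunk is partial
        assert(ceil_div(2500, 2048) == 2);  // legacy 2048-wide chunking
        return 0;
    }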
@@ -219,7 +243,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
             if constexpr (kIsVariableC) {
                 auto &smem_load_weight_C = !kIsVariableB ? smem_load_weight : smem_load_weight1;
                 load_weight<Ktraits>(Cvar + state_idx * params.C_dstate_stride, C_vals,
-                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1));
+                    smem_load_weight_C, (seqlen - chunk * kChunkSize) * (1));
                 if constexpr (!kIsVariableB) {
                     #pragma unroll
                     for (int r = 0; r < kNRows; ++r) {
@@ -242,16 +266,31 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 for (int i = 0; i < kNItems; ++i) {
                     thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                                  !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
-
                     if (seqlen % (kNItems * kNThreads) != 0) {  // So that the last state is correct
                         if (threadIdx.x * kNItems + i >= seqlen - chunk * kChunkSize) {
                             thread_data[i] = make_float2(1.f, 0.f);
                         }
                     }
                 }
                 // Initialize running total
-
-                scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]) : 0.0);
+                scan_t running_prefix;
+                if (chunk > 0) {
+                    running_prefix = smem_running_prefix[state_idx + r * MAX_DSTATE];
+                } else {
+                    // Load initial state
+                    if (params.cache_enabled && has_initial_state && batch_cache_indices != nullptr) {
+                        size_t state_offset = load_cache_slot * params.ssm_states_batch_stride +
+                                              r * params.ssm_states_dim_stride +
+                                              state_idx * params.ssm_states_dstate_stride;
+                        running_prefix = make_float2(1.0, float(ssm_states[state_offset]));
+                    } else if (has_initial_state) {
+                        // Non-APC mode: load from current batch position
+                        running_prefix = make_float2(1.0, float(ssm_states[state_idx * params.ssm_states_dstate_stride]));
+                    } else {
+                        // No initial state
+                        running_prefix = make_float2(1.0, 0.0);
+                    }
+                }
 
                 SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
                 typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
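A small sketch of the initial-state lookup the APC branch performs above (plain C++, hypothetical data; names mirror the kernel's arrays): initial_state_idx names the logical block of this request that holds the prior state, the request's row of cache_indices maps it to a physical slot, and the usual strides locate the element read into running_prefix.

    #include <cstddef>
    #include <vector>

    // Flat index into ssm_states that chunk 0 loads its prefix from in APC mode.
    size_t initial_state_elem(const std::vector<int> &batch_cache_indices,  // one request's row of cache_indices
                              int initial_state_block,                      // initial_state_idx[batch_id]
                              size_t r, size_t state_idx,
                              size_t batch_stride, size_t dim_stride, size_t dstate_stride) {
        const size_t load_cache_slot = batch_cache_indices[initial_state_block];
        return load_cache_slot * batch_stride + r * dim_stride + state_idx * dstate_stride;
    }

    int main() {
        std::vector<int> row = {7, 12, 3};  // hypothetical physical slots for one request
        // Prior state lives in logical block 1 -> physical slot 12.
        return initial_state_elem(row, 1, 0, 5, 4096, 16, 1) == 12 * 4096 + 5 ? 0 : 1;
    }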
@@ -260,8 +299,25 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 // There's a syncthreads in the scan op, so we don't need to sync here.
                 // Unless there's only 1 warp, but then it's the same thread (0) reading and writing.
                 if (threadIdx.x == 0) {
-                    smem_running_prefix[state_idx] = prefix_op.running_prefix;
-                    if (chunk == n_chunks - 1) {
+                    smem_running_prefix[state_idx + r * MAX_DSTATE] = prefix_op.running_prefix;
+
+                    // Store state at the end of each chunk when cache is enabled
+                    if (params.cache_enabled && batch_cache_indices != nullptr) {
+
+                        size_t cache_slot;
+                        if (chunk == n_chunks - 1) {
+                            cache_slot = batch_cache_indices[block_idx_last_scheduled[batch_id]];
+                        } else {
+                            cache_slot = batch_cache_indices[block_idx_first_scheduled[batch_id] + chunk];
+                        }
+
+                        size_t state_offset = cache_slot * params.ssm_states_batch_stride +
+                                              r * params.ssm_states_dim_stride +
+                                              state_idx * params.ssm_states_dstate_stride;
+
+                        ssm_states[state_offset] = typename Ktraits::state_t(prefix_op.running_prefix.y);
+                    } else if (!params.cache_enabled && chunk == n_chunks - 1) {
+                        // Non-APC mode: store only final state at current batch position
                         ssm_states[state_idx * params.ssm_states_dstate_stride] = typename Ktraits::state_t(prefix_op.running_prefix.y);
                     }
                 }
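A sketch of the slot selection used above (standalone C++; the array contents are made up, the names mirror the kernel's): every intermediate chunk writes its state to the cache block it just filled, counted from block_idx_first_scheduled_token, while the final, possibly partial, chunk writes to block_idx_last_scheduled_token.

    #include <cstdio>
    #include <vector>

    // Physical ssm_states slot a chunk's state is written to in APC mode.
    int pick_cache_slot(const std::vector<int> &batch_cache_indices,
                        int block_idx_first, int block_idx_last,
                        int chunk, int n_chunks) {
        const int block_idx = (chunk == n_chunks - 1) ? block_idx_last
                                                      : block_idx_first + chunk;
        return batch_cache_indices[block_idx];
    }

    int main() {
        std::vector<int> row = {7, 12, 3, 9};  // hypothetical slots for one request
        // Three chunks, first scheduled block 0, last scheduled block 2.
        for (int chunk = 0; chunk < 3; ++chunk)
            std::printf("chunk %d -> slot %d\n", chunk, pick_cache_slot(row, 0, 2, chunk, 3));
        return 0;
    }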
@@ -274,7 +330,6 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                 }
             }
         }
-
         input_t *out = reinterpret_cast<input_t *>(params.out_ptr) + sequence_start_index * params.out_batch_stride
             + dim_id * kNRows * params.out_d_stride + chunk * kChunkSize;
         __syncthreads();
@@ -346,7 +401,9 @@ template<typename input_t, typename weight_t, typename state_t>
 void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
 
     #ifndef USE_ROCM
-        if (params.seqlen <= 128) {
+        if (params.cache_enabled && params.block_size == 1024) {
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+        } else if (params.seqlen <= 128) {
             selective_scan_fwd_launch<32, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 256) {
             selective_scan_fwd_launch<32, 8, input_t, weight_t, state_t>(params, stream);
@@ -358,7 +415,9 @@ void selective_scan_fwd_cuda(SSMParamsBase &params, cudaStream_t stream) {
         selective_scan_fwd_launch<128, 16, input_t, weight_t, state_t>(params, stream);
     }
     #else
-        if (params.seqlen <= 256) {
+        if (params.cache_enabled && params.block_size == 1024) {
+            selective_scan_fwd_launch<64, 16, input_t, weight_t, state_t>(params, stream);
+        } else if (params.seqlen <= 256) {
             selective_scan_fwd_launch<64, 4, input_t, weight_t, state_t>(params, stream);
         } else if (params.seqlen <= 512) {
             selective_scan_fwd_launch<64, 8, input_t, weight_t, state_t>(params, stream);
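The 64-thread, 16-item launch pinned above is what makes the per-iteration chunk width match the cache block: 64 × 16 = 1024 tokens per chunk, so each chunk boundary in the scan loop coincides with a block boundary. A minimal compile-time statement of that assumption (a sketch, not code from this change):

    // kNThreads x kNItems must equal the cache block size for the per-chunk
    // state writes to land exactly on block boundaries.
    constexpr int kNThreads = 64, kNItems = 16, kBlockSize = 1024;
    static_assert(kNThreads * kNItems == kBlockSize,
                  "APC dispatch assumes the chunk width matches the cache block size");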
@@ -437,13 +496,17 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                         const std::optional<at::Tensor>& D,
                         const std::optional<at::Tensor>& delta_bias,
                         const torch::Tensor ssm_states,
-                        bool has_z,
+                        bool has_z,
                         bool delta_softplus,
                         const std::optional<at::Tensor>& query_start_loc,
                         const std::optional<at::Tensor>& cache_indices,
                         const std::optional<at::Tensor>& has_initial_state,
                         bool varlen,
-                        int64_t pad_slot_id) {
+                        int64_t pad_slot_id,
+                        int64_t block_size,
+                        const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
+                        const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
+                        const std::optional<torch::Tensor> &initial_state_idx) {
 
     // Reset the parameters
     memset(&params, 0, sizeof(params));
@@ -477,6 +540,14 @@ void set_ssm_params_fwd(SSMParamsBase &params,
     params.cache_indices_ptr = cache_indices.has_value() ? cache_indices.value().data_ptr() : nullptr;
     params.has_initial_state_ptr = has_initial_state.has_value() ? has_initial_state.value().data_ptr() : nullptr;
 
+    // Set cache parameters - cache is enabled if we have direct cache writing params
+    params.cache_enabled = block_idx_first_scheduled_token.has_value();
+    params.block_size = static_cast<int>(block_size);
+
+    // Set direct cache writing pointers
+    params.block_idx_first_scheduled_token_ptr = block_idx_first_scheduled_token.has_value() ? block_idx_first_scheduled_token.value().data_ptr() : nullptr;
+    params.block_idx_last_scheduled_token_ptr = block_idx_last_scheduled_token.has_value() ? block_idx_last_scheduled_token.value().data_ptr() : nullptr;
+    params.initial_state_idx_ptr = initial_state_idx.has_value() ? initial_state_idx.value().data_ptr() : nullptr;
 
     // All stride are in elements, not bytes.
     params.A_d_stride = A.stride(0);
@@ -504,9 +575,11 @@ void set_ssm_params_fwd(SSMParamsBase &params,
         params.out_d_stride = out.stride(0);
 
         params.ssm_states_batch_stride = ssm_states.stride(0);
-        params.ssm_states_dim_stride = ssm_states.stride(1);
+        params.ssm_states_dim_stride = ssm_states.stride(1);
         params.ssm_states_dstate_stride = ssm_states.stride(2);
 
+        params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0;
+
     }
     else {
         if (!is_variable_B) {
@@ -537,8 +610,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
         params.out_d_stride = out.stride(1);
 
         params.ssm_states_batch_stride = ssm_states.stride(0);
-        params.ssm_states_dim_stride = ssm_states.stride(1);
+        params.ssm_states_dim_stride = ssm_states.stride(1);
         params.ssm_states_dstate_stride = ssm_states.stride(2);
+
+        params.cache_indices_stride = cache_indices.has_value() ? cache_indices.value().stride(0) : 0;
     }
 }
 
@@ -554,7 +629,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                         const torch::Tensor &ssm_states,
                         // used to identify padding entries if cache_indices provided
                         // in case of padding, the kernel will return early
-                        int64_t pad_slot_id) {
+                        int64_t pad_slot_id,
+                        int64_t block_size,
+                        const std::optional<torch::Tensor> &block_idx_first_scheduled_token,
+                        const std::optional<torch::Tensor> &block_idx_last_scheduled_token,
+                        const std::optional<torch::Tensor> &initial_state_idx) {
     auto input_type = u.scalar_type();
     auto weight_type = A.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -646,7 +725,16 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
         auto cache_indices_ = cache_indices.value();
         TORCH_CHECK(cache_indices_.scalar_type() == at::ScalarType::Int);
         TORCH_CHECK(cache_indices_.is_cuda());
-        CHECK_SHAPE(cache_indices_, batch_size);
+
+        // cache_indices can be either 1D (batch_size,) for non-APC mode
+        // or 2D (batch_size, max_positions) for APC mode
+        const bool is_apc_mode = block_idx_first_scheduled_token.has_value();
+        if (is_apc_mode) {
+            TORCH_CHECK(cache_indices_.dim() == 2, "cache_indices must be 2D for APC mode");
+            TORCH_CHECK(cache_indices_.size(0) == batch_size, "cache_indices first dimension must match batch_size");
+        } else {
+            CHECK_SHAPE(cache_indices_, batch_size);
+        }
     }
 
 
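A plain-C++ restatement of the shape contract enforced above (a sketch with a stand-in struct, not the ATen checks): non-APC callers pass one slot per request, APC callers pass one row of candidate slots per request.

    #include <cassert>
    #include <cstdint>

    struct IndicesShape { int64_t dim; int64_t size0; };  // stand-in for tensor metadata

    void check_cache_indices(const IndicesShape &idx, int64_t batch_size, bool is_apc_mode) {
        if (is_apc_mode) {
            assert(idx.dim == 2 && idx.size0 == batch_size);  // (batch_size, max_positions)
        } else {
            assert(idx.dim == 1 && idx.size0 == batch_size);  // (batch_size,)
        }
    }

    int main() {
        check_cache_indices({2, 8}, 8, true);   // APC: 2D, rows match batch
        check_cache_indices({1, 8}, 8, false);  // non-APC: flat (batch_size,)
        return 0;
    }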
@@ -686,7 +774,11 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
                        cache_indices,
                        has_initial_state,
                        varlen,
-                       pad_slot_id
+                       pad_slot_id,
+                       block_size,
+                       block_idx_first_scheduled_token,
+                       block_idx_last_scheduled_token,
+                       initial_state_idx
                        );
 
 