
Commit 23ff744

Bump tvm ffi version to 0.1.4 (#2155)
## 📌 Description

Bump tvm ffi version to 0.1.4 and use `ffi::CUDADeviceGuard` instead of `cudaSetDevice`.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

## Summary by CodeRabbit

* **Refactor**
  * Improved GPU device management across CUDA operations for more reliable multi-GPU support and automatic resource cleanup.
* **Chores**
  * Updated `apache-tvm-ffi` dependency to version 0.1.4 or later.
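For context, a minimal sketch of the RAII pattern the call sites switch to. The guard body below is a stand-in, assuming tvm-ffi's `ffi::CUDADeviceGuard` follows the usual save/set/restore convention; `PlanExample` and its parameter are hypothetical names for illustration, not code from this commit.

```cpp
#include <cuda_runtime.h>

namespace ffi {
// Stand-in for tvm-ffi's guard, shown only to illustrate the assumed
// behavior: remember the caller's current device, switch to the requested
// one, and restore the original when the guard leaves scope.
class CUDADeviceGuard {
 public:
  explicit CUDADeviceGuard(int device_id) {
    cudaGetDevice(&prev_device_);
    if (device_id != prev_device_) cudaSetDevice(device_id);
  }
  ~CUDADeviceGuard() { cudaSetDevice(prev_device_); }
  CUDADeviceGuard(const CUDADeviceGuard&) = delete;
  CUDADeviceGuard& operator=(const CUDADeviceGuard&) = delete;

 private:
  int prev_device_{0};
};
}  // namespace ffi

// Hypothetical call site mirroring the diffs below: a bare cudaSetDevice
// leaked the device change to the caller on every path out of the function;
// the guard scopes the change and undoes it automatically.
void PlanExample(int workspace_device_id) {
  ffi::CUDADeviceGuard device_guard(workspace_device_id);
  // ... build plan / launch kernels on workspace_device_id ...
}  // caller's original device restored here
```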
1 parent e59226b · commit 23ff744


43 files changed: +77 −76 lines (only part of the diff is shown below)

csrc/batch_attention.cu

Lines changed: 2 additions & 2 deletions

@@ -48,7 +48,7 @@ Array<int64_t> BatchPagedAttentionPlan(TensorView float_workspace_buffer,
 
   HolisticPlanInfo<2> plan_info;
 
-  cudaSetDevice(float_workspace_buffer.device().device_id);
+  ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id);
   const cudaStream_t stream = get_stream(float_workspace_buffer.device());
 
   cudaError_t status = TwoStageHolisticPlan<IdType>(
@@ -102,7 +102,7 @@ void BatchPagedAttentionRun(TensorView float_workspace_buffer, TensorView int_wo
     v_stride_n = v_cache.stride(2);
   }
 
-  cudaSetDevice(q.device().device_id);
+  ffi::CUDADeviceGuard device_guard(q.device().device_id);
   const cudaStream_t stream = get_stream(q.device());
 
   DISPATCH_context(

csrc/batch_decode.cu

Lines changed: 2 additions & 2 deletions

@@ -53,7 +53,7 @@ Array<int64_t> BatchDecodeWithPagedKVCachePlan(
       << "CUDA cores template only supports equal head dim for QK and VO, please use tensor "
          "cores template for different head dim";
 
-  cudaSetDevice(float_workspace_buffer.device().device_id);
+  ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id);
   const cudaStream_t stream = get_stream(float_workspace_buffer.device());
   DISPATCH_context(
       DTypeQ, DTypeKV, DTypeO, IdType, HEAD_DIM_QK, HEAD_DIM_VO, POS_ENCODING_MODE,
@@ -130,7 +130,7 @@ void BatchDecodeWithPagedKVCacheRun(TensorView float_workspace_buffer,
   }
   kv_cache_strides = k_strides.data();
 
-  cudaSetDevice(q.device().device_id);
+  ffi::CUDADeviceGuard device_guard(q.device().device_id);
   const cudaStream_t stream = get_stream(q.device());
 
   DISPATCH_context(

csrc/batch_decode_mla_cute_sm80.cu

Lines changed: 2 additions & 2 deletions

@@ -23,7 +23,7 @@ Array<int64_t> BatchDecodeWithPagedKVCachePlanMLA(ffi::TensorView float_workspac
       int_workspace_buffer.size(0) * get_element_size(int_workspace_buffer);
 
   DecodePlanInfo plan_info;
-  cudaSetDevice(float_workspace_buffer.device().device_id);
+  ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id);
   const cudaStream_t stream = get_stream(float_workspace_buffer.device());
 
   auto work_estimation_func = BatchDecodeWithPagedKVCacheWorkEstimationDispatchedMlaCuteSM80<
@@ -103,7 +103,7 @@ void BatchDecodeWithPagedKVCacheRunMLA(
   }
   params.padded_batch_size = plan_info.padded_batch_size;
 
-  cudaSetDevice(paged_ckv_cache.device().device_id);
+  ffi::CUDADeviceGuard device_guard(paged_ckv_cache.device().device_id);
   const cudaStream_t stream = get_stream(paged_ckv_cache.device());
   cudaError_t status = BatchDecodeWithPagedKVCacheDispatchedMlaCuteSM80<HEAD_DIM_CKV, HEAD_DIM_KPE,
                                                                         QO_TILE_LEN, Params>(

csrc/batch_decode_mla_plan.cu

Lines changed: 1 addition & 1 deletion

@@ -15,7 +15,7 @@ Array<int64_t> BatchDecodeWithPagedKVCachePlanMLA(TensorView float_workspace_buf
                                                   TensorView indptr, int64_t batch_size,
                                                   int64_t num_qo_heads, int64_t page_size,
                                                   bool enable_cuda_graph) {
-  cudaSetDevice(float_workspace_buffer.device().device_id);
+  ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id);
   const cudaStream_t stream = get_stream(float_workspace_buffer.device());
 
   size_t float_workspace_size_in_bytes =

csrc/batch_decode_mla_run.cu

Lines changed: 1 addition & 1 deletion

@@ -35,7 +35,7 @@ void BatchDecodeWithPagedKVCacheRunMLA(
   void* float_buffer = static_cast<void*>(float_workspace_buffer.data_ptr());
   void* int_buffer = static_cast<void*>(int_workspace_buffer.data_ptr());
 
-  cudaSetDevice(q_nope.device().device_id);
+  ffi::CUDADeviceGuard device_guard(q_nope.device().device_id);
   const cudaStream_t stream = get_stream(q_nope.device());
 
   paged_kv_mla_t<DTypeKV, IdType> paged_kv(

csrc/batch_mla_plan.cu

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ Array<int64_t> BatchMLAPagedAttentionPlan(TensorView float_workspace_buffer,
 
   int batch_size = kv_len.size(0);
 
-  cudaSetDevice(float_workspace_buffer.device().device_id);
+  ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id);
   const cudaStream_t stream = get_stream(float_workspace_buffer.device());
 
   cudaError_t status =

csrc/batch_mla_run.cu

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ void BatchMLAPagedAttentionRun(TensorView float_workspace_buffer, TensorView int
   unsigned int o_stride_n = o.stride(0);
   unsigned int o_stride_h = o.stride(1);
 
-  cudaSetDevice(q_nope.device().device_id);
+  ffi::CUDADeviceGuard device_guard(q_nope.device().device_id);
   const cudaStream_t stream = get_stream(q_nope.device());
 
   DISPATCH_context(

csrc/batch_mla_sm90_plan.cu

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ Array<int64_t> BatchMLAPagedAttentionSM90Plan(TensorView float_workspace_buffer,
 
   int batch_size = kv_len.size(0);
 
-  cudaSetDevice(float_workspace_buffer.device().device_id);
+  ffi::CUDADeviceGuard device_guard(float_workspace_buffer.device().device_id);
   const cudaStream_t stream = get_stream(float_workspace_buffer.device());
 
   cudaError_t status =

csrc/batch_mla_sm90_run.cu

Lines changed: 1 addition & 1 deletion

@@ -56,7 +56,7 @@ void BatchMLAPagedAttentionSM90Run(TensorView float_workspace_buffer,
   unsigned int o_stride_n = o.stride(0);
   unsigned int o_stride_h = o.stride(1);
 
-  cudaSetDevice(q_nope.device().device_id);
+  ffi::CUDADeviceGuard device_guard(q_nope.device().device_id);
   const cudaStream_t stream = get_stream(q_nope.device());
 
   DISPATCH_context(

csrc/batch_pod.cu

Lines changed: 2 additions & 2 deletions

@@ -100,7 +100,7 @@ void batch_pod_with_kv_cache_tensor(
   }
   kv_cache_strides_p = k_strides_p.data();
 
-  cudaSetDevice(float_workspace_buffer_p.device().device_id);
+  ffi::CUDADeviceGuard device_guard(float_workspace_buffer_p.device().device_id);
   const cudaStream_t stream = get_stream(float_workspace_buffer_p.device());
 
   // Decode setup (TensorView decode = batched prefill)
@@ -152,7 +152,7 @@ void batch_pod_with_kv_cache_tensor(
   kv_cache_strides_d = k_strides_d.data();
 
   // Already handled by prefill
-  // cudaSetDevice(float_workspace_buffer_d.device().device_id);
+  // ffi::CUDADeviceGuard device_guard(float_workspace_buffer_d.device().device_id);
   // const cudaStream_t stream = get_stream(float_workspace_buffer_d.device());
 
   DISPATCH_context(
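The batch_pod.cu hunk above keeps the decode-side guard commented out ("Already handled by prefill"): prefill and decode run in the same function body, so the prefill guard's scope already covers both phases. A small sketch of that reasoning, reusing the hypothetical guard stand-in from the sketch after the description (`RunPrefill`/`RunDecode` are illustrative names, not functions from this commit):

```cpp
// Illustrative stubs standing in for the prefill and decode phases.
void RunPrefill() { /* ... prefill kernel launches ... */ }
void RunDecode() { /* ... decode kernel launches ... */ }

// One function-scope guard covers both phases; a second guard for decode
// would set the same device again and restore it mid-function, gaining nothing.
void BatchPodSketch(int device_id) {
  ffi::CUDADeviceGuard device_guard(device_id);  // set once, at function scope
  RunPrefill();  // executes with device_id current
  RunDecode();   // still device_id; no second guard needed
}  // caller's original device restored exactly once, here
```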
