pytorch
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
Lines changed: 3 additions & 1 deletion
diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
Lines changed: 4 additions & 0 deletions
@@ -38,6 +38,7 @@ def get_device_name(cls) -> str:
     def get_supported_fallback_kernels(cls) -> Dict[str, Any]:
         return {
             "at::_ops::_weight_int4pack_mm::call": None,
+            "at::_ops::_scaled_dot_product_flash_attention::call": None,
         }
 
     @classmethod
@@ -49,7 +50,8 @@ def get_decomposition_table(cls) -> Dict[Any, Any]:
     @classmethod
     def get_custom_passes(cls) -> List[typing.Any]:
         """Return CUDA-specific passes: ReplaceEdgeOpWithTritonOpPass"""
-        return [ReplaceEdgeOpWithTritonOpPass()]
+        return []
+        # return [ReplaceEdgeOpWithTritonOpPass()]
 
     @classmethod
     def get_aoti_compile_options(
 
@@ -53,6 +53,7 @@ runtime.cxx_library(
         "shims/cuda_guard.cpp",
         "shims/int4mm.cu",
         "shims/memory.cpp",
+        "shims/sdpa.cu",
         "shims/tensor_attribute.cpp",
     ],
     headers = [
@@ -61,6 +62,8 @@ runtime.cxx_library(
         "shims/int4mm.cuh",
         "shims/int4mm.h",
         "shims/memory.h",
+        "shims/sdpa.cuh",
+        "shims/sdpa.h",
         "shims/tensor_attribute.h",
         "utils.h",
     ],
@@ -84,6 +87,7 @@ runtime.cxx_library(
     ],
     external_deps = [
         ("cuda", None, "cuda-lazy"),
+        ("cuda", None, "cublas-lazy"),
     ],
 )