Commit 5a4a08a

Merge branch 'main' into kernel_mapping_error_resolve
2 parents: 04e27cb + d08b98b

1,157 files changed: +30,524 additions, -65,638 deletions


.github/workflows/get-pr-info.yml

Lines changed: 1 addition & 1 deletion
@@ -40,7 +40,7 @@ on:
       description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
       value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
     PR_MERGE_COMMIT_BASE_SHA:
-      description: "The sha of the parent commit of the the merge commit on the target branch in the base repository"
+      description: "The sha of the parent commit of the merge commit on the target branch in the base repository"
       value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_BASE_SHA }}
     PR_HEAD_COMMIT_DATE:
       description: "The date of the head sha of the pull request branch in the head repository"

.github/workflows/self-comment-ci.yml

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@ env:
 jobs:
   get-pr-number:
     name: Get PR number
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap", "3outeille"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
     uses: ./.github/workflows/get-pr-number.yml

   get-pr-info:

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
@@ -125,9 +125,9 @@ If you're contributing a **vision-language model** (or any multimodal model that
 All new models should use the modular architecture pattern. Create a `modular_<model_name>.py` file using the modular model converter:

 - Use the CLI, [`transformers add-new-model-like`](https://github.com/huggingface/transformers/blob/main/src/transformers/cli/add_new_model_like.py) to generate a modular skeleton and get started
-- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well. [Modular guide](./modular_transformers#implementing-a-modular-file) shows a quick way to set up a modular file.
+- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well. [Modular guide](./docs/source/en/modular_transformers.md#implementing-a-modular-file) shows a quick way to set up a modular file.
 - Reuse existing patterns from similar models as much as possible
-- You can make the model compatible with inference engines such as vLLM or SGLang, and enable zero-effort integration. See specific requirements for model implementation in ["Transformers modeling backend"](./transformers_as_backend#multimodal-models)
+- You can make the model compatible with inference engines such as vLLM or SGLang, and enable zero-effort integration. See specific requirements for model implementation in ["Transformers modeling backend"](./docs/source/en/transformers_as_backend.md#multimodal-models)

 To verify your modular file is correct, run:

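As a rough illustration of the modular pattern described in the bullets above (not part of this diff), a `modular_<model_name>.py` file typically subclasses components of an existing model and overrides only what differs; the `MyNewModel*` names below are hypothetical placeholders, and the converter expands such a file into full modeling code:

```python
# Hypothetical sketch of a modular_my_new_model.py file. The MyNewModel* class
# names are placeholders; the Llama imports are real transformers classes used
# here only to show the "reuse existing patterns" guidance above.
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaAttention, LlamaForCausalLM


class MyNewModelConfig(LlamaConfig):
    # Keeping the configuration in the modular file, as recommended above.
    model_type = "my_new_model"


class MyNewModelAttention(LlamaAttention):
    # Override only the pieces that differ from the reused Llama implementation.
    pass


class MyNewModelForCausalLM(LlamaForCausalLM):
    config_class = MyNewModelConfig
```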
MIGRATION_GUIDE_V5.md

Lines changed: 485 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 1 addition & 1 deletion
@@ -134,7 +134,7 @@ pipeline("the secret to baking a really good cake is ")
 To chat with a model, the usage pattern is the same. The only difference is you need to construct a chat history (the input to `Pipeline`) between you and the system.

 > [!TIP]
-> You can also chat with a model directly from the command line.
+> You can also chat with a model directly from the command line, as long as [`transformers serve` is running](https://huggingface.co/docs/transformers/main/en/serving).
 > ```shell
 > transformers chat Qwen/Qwen2.5-0.5B-Instruct
 > ```
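As a companion to the chat usage described in the context line above, here is a minimal Python sketch (not part of this diff) of passing a chat history to `pipeline`, reusing the same model as the command-line snippet:

```python
# Minimal sketch of chatting through the Python API, mirroring the README text
# above; assumes transformers and a compatible model are available locally.
from transformers import pipeline

chatbot = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")

# The chat history is a list of role/content messages between you and the system.
chat = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the secret to baking a really good cake?"},
]

outputs = chatbot(chat, max_new_tokens=128)
# The pipeline returns the extended chat; the last message is the model's reply.
print(outputs[0]["generated_text"][-1]["content"])
```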

benchmark_v2/framework/benchmark_config.py

Lines changed: 37 additions & 37 deletions
@@ -2,9 +2,10 @@
 import itertools
 import json
 import logging
+from functools import lru_cache
 from typing import Any

-from transformers.utils.import_utils import is_flash_attn_2_available
+from transformers.utils.import_utils import is_flash_attn_2_available, is_kernels_available


 KERNELIZATION_AVAILABLE = False
@@ -18,17 +19,36 @@
 logger = logging.getLogger(__name__)


+@lru_cache
+def is_fa2_or_kernel_available() -> bool:
+    """Returns True if the flash_attn_2 or a fallback kernel is available"""
+    # Early return if flash_attn_2 is available
+    if is_flash_attn_2_available():
+        return True
+    # Early return if kernels is not available
+    if not is_kernels_available():
+        logger.warning(
+            "flash_attention_2 is not available. kernels is not installed. Benchmarking flash_attention_2 will not "
+            "be possible."
+        )
+        return False
+    # If kernels is available, try to get the flash_attn_2 kernel
+    try:
+        from kernels import get_kernel
+
+        get_kernel("kernels-community/flash-attn")
+    except Exception as _:
+        logger.warning(
+            "flash_attention_2 is not available. kernels is installed, but the flash_attn kernel is not available."
+            "Benchmarking flash_attention_2 will not be possible."
+        )
+        return False
+
+
 class BenchmarkConfig:
     """Configuration for a single benchmark scenario."""

-    all_attn_implementations = [
-        ("flash_attention_2", None),
-        ("eager", None),
-        ("sdpa", "math"),
-        ("sdpa", "flash_attention"),
-        ("flex_attention", None),
-    ]
-
+    all_attn_implementations = ["flash_attention_2", "eager", "sdpa", "flex_attention"]
     all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]

     def __init__(
@@ -41,7 +61,6 @@ def __init__(
         sequence_length: int = 128,
         num_tokens_to_generate: int = 128,
         attn_implementation: str = "eager",
-        sdpa_backend: str | None = None,
         compile_mode: str | None = None,
         compile_options: dict[str, Any] | None = None,
         kernelize: bool = False,
@@ -59,7 +78,6 @@ def __init__(
         self.num_tokens_to_generate = num_tokens_to_generate
         # Generation parameters
         self.attn_implementation = attn_implementation
-        self.sdpa_backend = sdpa_backend
         # Optimization parameters
         self.compile_mode = compile_mode
         self.compile_options = compile_options if compile_options is not None else {}
@@ -75,34 +93,21 @@ def check_validity(self, skip_validity_check: bool = False) -> None:
         if skip_validity_check:
             return
         # Check FA is installed
-        if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
-            logger.warning(
-                "Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
-            )
+        is_fa = self.attn_implementation == "flash_attention_2"
+        if is_fa and not is_fa2_or_kernel_available():
+            logger.warning("Flash attention is not available. Defaulting to SDPA.")
             self.attn_implementation = "sdpa"
-            self.sdpa_backend = "flash_attention"
         # Flash attention does not support compile mode, so we turn it off # FIXME: it would be better to support it
-        is_fa = self.attn_implementation == "flash_attention_2"
-        is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
-        if is_fa:
+        if is_fa and self.compile_mode is not None:
             logger.warning("Flash attention does not support compile mode. Turning off compile mode.")
             self.compile_mode = None
-        # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
-        if self.attn_implementation == "sdpa" and self.sdpa_backend is None:
-            default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
-            logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
-            self.sdpa_backend = default_backend
+        # Handle continuous batching cases
         if self.continuous_batching:
             if self.attn_implementation == "flex_attention":
                 logger.error(
-                    "disabling continuous batching because of invalid configuration: flex attention is not supported"
+                    "Disabling continuous batching because of invalid configuration: flex attention is not supported."
                 )
                 self.continuous_batching = False
-            elif self.attn_implementation == "sdpa" and self.sdpa_backend is not None:
-                logger.warning(
-                    "when continuous batching is enabled, sdpa_backend must be None because of the attention mask, setting it to None"
-                )
-                self.sdpa_backend = "math"

     @property
     def hash(self) -> str:
@@ -115,7 +120,6 @@ def infer_name(self, compact: bool = True) -> str:
             gpu_monitor_str = "monitored" if self.gpu_monitoring else "unmonitored"
             dimensions_str = f"b{self.batch_size}_s{self.sequence_length}_n{self.num_tokens_to_generate}"
             attn_code = self.attn_implementation
-            attn_code += f"_{self.sdpa_backend}" if self.attn_implementation == "sdpa" else ""
             compile_str = f"compiled_{self.compile_mode}" if self.compile_mode is not None else "uncompiled"
             kernelize_str = "kernelized" if self.kernelize else "unkernelized"
             continuous_batching_str = "cb" if self.continuous_batching else "generate"
@@ -125,7 +129,6 @@ def infer_name(self, compact: bool = True) -> str:
             gpu_monitor_str = ("with" if self.gpu_monitoring else "no") + " GPU monitoring"
             dimensions_str = f"batch size {self.batch_size}, sequence length {self.sequence_length}, {self.num_tokens_to_generate} generated tokens"
             attn_code = f"{self.attn_implementation} attention"
-            attn_code += f" with {self.sdpa_backend} backend" if self.attn_implementation == "sdpa" else ""
             compile_str = "compiled" if self.compile_mode is not None else "not compiled"
             kernelize_str = "kernelized" if self.kernelize else "not kernelized"
             continuous_batching_str = "continuous batching" if self.continuous_batching else "regular generate"
@@ -145,7 +148,6 @@ def to_dict(self) -> dict[str, Any]:
             "sequence_length": self.sequence_length,
             "num_tokens_to_generate": self.num_tokens_to_generate,
             "attn_implementation": self.attn_implementation,
-            "sdpa_backend": self.sdpa_backend,
             "compile_mode": self.compile_mode,
             "compile_options": self.compile_options | {},  # to avoid inplace modification of the original dict
             "kernelize": self.kernelize,
@@ -162,7 +164,6 @@ def from_dict(cls, data: dict[str, Any], skip_validity_check: bool = False) -> "BenchmarkConfig":
             sequence_length=data.get("sequence_length", 128),
             num_tokens_to_generate=data.get("num_tokens_to_generate", 128),
             attn_implementation=data.get("attn_implementation", "eager"),
-            sdpa_backend=data.get("sdpa_backend"),
             compile_mode=data.get("compile_mode"),
             compile_options=data.get("compile_options"),
             kernelize=data.get("kernelize", False),
@@ -213,7 +214,7 @@ def get_config_by_level(level: int) -> list[BenchmarkConfig]:
     configs = []
     # Early return if level is greater than 3: we generate all combinations of configs, maybe even w/ all compile modes
     if level >= 3:
-        for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
+        for attn_implementation in BenchmarkConfig.all_attn_implementations:
             # Usually there is not much to gain by compiling with other modes, but we allow it for level 4
             compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
             for cm in compile_modes:
@@ -222,7 +223,6 @@ def get_config_by_level(level: int) -> list[BenchmarkConfig]:
                         configs.append(
                             BenchmarkConfig(
                                 attn_implementation=attn_implementation,
-                                sdpa_backend=sdpa_backend,
                                 compile_mode=cm,
                                 kernelize=kernelize_on,
                                 continuous_batching=cb_on,
@@ -240,5 +240,5 @@ def get_config_by_level(level: int) -> list[BenchmarkConfig]:
         configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
         configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
         configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
-        configs.append(BenchmarkConfig(attn_implementation="paged|sdpa", continuous_batching=True))
+        configs.append(BenchmarkConfig(attn_implementation="sdpa", continuous_batching=True))
     return configs
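To make the shape of the new API concrete, here is a short usage sketch based only on the signatures visible in this diff (the import path is an assumption about running from `benchmark_v2/`, not something this commit specifies): `attn_implementation` is now a plain string, `sdpa_backend` is gone, and flash-attention availability is resolved through the new `is_fa2_or_kernel_available` helper during validity checking.

```python
# Sketch of using BenchmarkConfig after this change; the import path below is an
# assumption (run from benchmark_v2/), not shown in the diff itself.
from framework.benchmark_config import BenchmarkConfig, get_config_by_level

# No sdpa_backend argument anymore; the attention choice is a single string.
config = BenchmarkConfig(attn_implementation="flash_attention_2")
config.check_validity()  # warns and falls back to "sdpa" if neither FA2 nor a kernels fallback is available
print(config.infer_name(compact=True))

# Level 3 expands the simplified all_attn_implementations list shown above.
for cfg in get_config_by_level(3):
    print(cfg.infer_name())
```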
