20 changes: 20 additions & 0 deletions examples/llm_eval/lm_eval_hf.py
@@ -43,9 +43,11 @@
from lm_eval.api.model import T
from lm_eval.models.huggingface import HFLM
from quantization_utils import quantize_model
from sparse_attention_utils import sparsify_model

import modelopt.torch.opt as mto
from modelopt.torch.quantization.utils import is_quantized
from modelopt.torch.sparsity.attention_sparsity.conversion import is_attn_sparsified


def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict | None = None) -> T:
@@ -60,6 +62,9 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
calib_size = arg_dict.pop("calib_size", 512)
compress = arg_dict.pop("compress", False)

# Sparse attention arguments
sparse_cfg = arg_dict.pop("sparse_cfg", None)

additional_config = {} if additional_config is None else additional_config
additional_config = {k: v for k, v in additional_config.items() if v is not None}

@@ -91,6 +96,15 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
auto_quantize_checkpoint=auto_quantize_checkpoint,
)

if sparse_cfg:
if is_attn_sparsified(model_obj.model):
warnings.warn("Skipping sparse attention: model already has sparse attention applied.")
else:
sparsify_model(
model=model_obj,
sparse_cfg=sparse_cfg,
)

return model_obj


Expand Down Expand Up @@ -152,6 +166,11 @@ def setup_parser_with_modelopt_args():
action="store_true",
help="Compress the model after quantization",
)
parser.add_argument(
"--sparse_cfg",
type=str,
help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)",
)
return parser


@@ -177,6 +196,7 @@ def setup_parser_with_modelopt_args():
"calib_batch_size": args.calib_batch_size,
"calib_size": args.calib_size,
"compress": args.compress,
"sparse_cfg": args.sparse_cfg,
}
)

17 changes: 17 additions & 0 deletions examples/llm_eval/mmlu.py
@@ -48,6 +48,7 @@
from fire import Fire
from modeling import EvalModel, select_model
from quantization_utils import MAX_SEQ_LEN, get_tokenizer, quantize_model
from sparse_attention_utils import sparsify_model
from tqdm import tqdm

try:
@@ -56,6 +57,7 @@
LLM = None # type: ignore[misc]
import modelopt.torch.opt as mto
from modelopt.torch.quantization.utils import is_quantized
from modelopt.torch.sparsity.attention_sparsity.conversion import is_attn_sparsified

os.environ["TOKENIZERS_PARALLELISM"] = "false"

@@ -230,6 +232,7 @@ def main(
auto_quantize_method: str = "gradient",
auto_quantize_score_size: int = 128,
auto_quantize_checkpoint: str | None = None,
sparse_cfg: str | None = None,
**kwargs,
):
random.seed(RAND_SEED)
@@ -289,6 +292,20 @@ def main(
auto_quantize_checkpoint=auto_quantize_checkpoint,
)

# Apply sparse attention if requested
if sparse_cfg:
model.load()

if is_attn_sparsified(model.model):
warnings.warn(
"Skipping sparse attention: model already has sparse attention applied."
)
else:
sparsify_model(
model=model,
sparse_cfg=sparse_cfg,
)

for subject in tqdm(subjects):
dev_df = pd.read_csv(os.path.join(data_dir, "dev", subject + "_dev.csv"), header=None)[
:ntrain
5 changes: 5 additions & 0 deletions examples/llm_eval/modeling.py
Contributor: same as https://github.com/NVIDIA/Model-Optimizer/pull/538/files#r2646356349 and avoid repeated attention modification

Contributor Author: I've removed this check.

@@ -179,6 +179,7 @@ class SeqToSeqModel(EvalModel):
lora_path: str = ""
device: str = "cuda"
load_8bit: bool = False
attn_implementation: str | None = None

def load(self):
if self.model is None:
@@ -188,6 +189,8 @@ def load(self):
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
if self.attn_implementation:
args["attn_implementation"] = self.attn_implementation
self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args)
print_gpu_utilization()
if self.lora_path:
@@ -241,6 +244,8 @@ def load(self):
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
if self.attn_implementation:
args["attn_implementation"] = self.attn_implementation
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path, trust_remote_code=True, **args
)
78 changes: 78 additions & 0 deletions examples/llm_eval/sparse_attention_utils.py
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for sparse attention integration with llm_eval."""

import modelopt.torch.sparsity.attention_sparsity as mtsa


def _extract_model(model_obj):
"""Extract actual model from wrapper (HFLM or EvalModel)."""
if hasattr(model_obj, "gpt2"):
return model_obj.gpt2
elif hasattr(model_obj, "model"):
return model_obj.model
else:
return model_obj


def sparsify_model(
model,
sparse_cfg: str,
backend=None,
):
"""Apply sparse attention to model with optional RULER calibration.

Args:
model: Model wrapper (HFLM or EvalModel) or raw model
sparse_cfg: Sparse attention config name or dict
backend: Backend to use (optional, overrides config backend)

Returns:
The model with sparse attention applied

Note:
Calibration is automatically triggered if the config contains a 'calibration' field.
It auto-generates the RULER dataset using the model's tokenizer.
"""
# Extract actual model
net = _extract_model(model)

# Resolve config
if isinstance(sparse_cfg, str):
# Get config from mtsa module (e.g., SKIP_SOFTMAX_CALIB, SKIP_SOFTMAX_DEFAULT)
mtsa_cfg = getattr(mtsa, sparse_cfg, None)
if mtsa_cfg is None:
raise ValueError(f"Unknown sparse_cfg: {sparse_cfg}.")
else:
mtsa_cfg = sparse_cfg

# Override backend if specified
if backend:
if isinstance(mtsa_cfg, dict) and "sparse_cfg" in mtsa_cfg:
modified_sparse_cfg = {}
for pattern, cfg in mtsa_cfg["sparse_cfg"].items():
modified_cfg = cfg.copy() if isinstance(cfg, dict) else cfg
if isinstance(modified_cfg, dict):
modified_cfg["backend"] = backend
modified_sparse_cfg[pattern] = modified_cfg
mtsa_cfg = {"sparse_cfg": modified_sparse_cfg}

# Apply sparsification
print(f"\nApplying sparse attention with config: {sparse_cfg}")
mtsa.sparsify(net, mtsa_cfg)
print("Sparse attention applied successfully!")

return model
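For reference, a minimal sketch of how this helper could be called on a raw HuggingFace model (the model name is illustrative; `SKIP_SOFTMAX_DEFAULT` is resolved by name from `mtsa`, as the function above does):

```python
import torch
from transformers import AutoModelForCausalLM

from sparse_attention_utils import sparsify_model

# Eager attention is needed so the softmax patching can take effect.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B",
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
)

# Pass a config name (resolved via getattr on mtsa) or a config dict.
model = sparsify_model(model, sparse_cfg="SKIP_SOFTMAX_DEFAULT")
```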
2 changes: 2 additions & 0 deletions examples/llm_sparsity/attention_sparsity/.gitignore
@@ -0,0 +1,2 @@
# Data directory for calibration
data
165 changes: 165 additions & 0 deletions examples/llm_sparsity/attention_sparsity/README.md
@@ -0,0 +1,165 @@
# Attention Sparsity for HuggingFace Models

In this tutorial, we demonstrate how to use NVIDIA TensorRT Model Optimizer to apply attention sparsity to HuggingFace models. Attention sparsity reduces computational cost by skipping near-zero attention scores during the softmax computation.

## Getting Started

### Quick Example

```python
import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT

# Load your model
model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3-8B",
attn_implementation="eager", # Required for sparse attention
torch_dtype=torch.bfloat16,
)

# Apply sparse attention
model = mtsa.sparsify(model, config=SKIP_SOFTMAX_DEFAULT)
```
Comment on lines +9 to +22

Contributor:

⚠️ Potential issue | 🟡 Minor

Add missing imports in the quick example.

The code example is missing necessary imports for `AutoModelForCausalLM` and `torch`, which would cause the example to fail if copied as-is.

📝 Suggested fix

```diff
+import torch
+from transformers import AutoModelForCausalLM
+
 import modelopt.torch.sparsity.attention_sparsity as mtsa
 from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT
```


> [!NOTE]
> `attn_implementation="eager"` is required for sparse attention to work properly. Flash Attention 2 or SDPA would bypass the softmax patching needed for stats collection.

## Configuration Options

Two pre-defined configurations are available:

### 1. Fixed Threshold (SKIP_SOFTMAX_DEFAULT)

Uses a fixed threshold value. Simple but may not be optimal for all sequence lengths.

```python
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT

model = mtsa.sparsify(model, config=SKIP_SOFTMAX_DEFAULT)
```

### 2. Calibrated Threshold (SKIP_SOFTMAX_CALIB)

Uses RULER-based calibration to determine an optimal dynamic threshold that adapts to sequence length. Recommended for production use.

```python
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_CALIB

model = mtsa.sparsify(model, config=SKIP_SOFTMAX_CALIB)
```

## Prerequisites

### Local Installation

For Hugging Face models, install Model Optimizer with `hf` dependencies using `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/) and install the requirements for the example:

```bash
pip install nvidia-modelopt[hf]
```

### Download RULER Calibration Data (Required for Calibration)

If using `SKIP_SOFTMAX_CALIB`, you need to download the RULER calibration dataset first:

```bash
bash modelopt/torch/sparsity/attention_sparsity/calibration/download_ruler_data.sh
```

This downloads the Paul Graham essays dataset used for generating calibration samples.

## Run Sparse Attention on HuggingFace Models

### Basic Usage (Without Calibration)

Apply sparse attention with a fixed threshold:

```bash
python examples/llm_sparsity/attention_sparsity/hf_sa.py \
--pyt_ckpt_path Qwen/Qwen3-8B \
--sparse_attn skip_softmax
```

### With RULER Calibration

Apply sparse attention with calibrated thresholds for optimal sparsity:

```bash
python examples/llm_sparsity/attention_sparsity/hf_sa.py \
--pyt_ckpt_path Qwen/Qwen3-8B \
--sparse_attn skip_softmax_calib
```

The calibration process (a toy sketch of the threshold search follows this list):

1. Generates RULER calibration samples
2. Collects attention statistics during forward passes
3. Determines optimal threshold scale factor for target sparsity ratio
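
The threshold search in step 3 can be pictured with a toy example. This is only an illustration, not the library's calibration code; `measured_sparsity` here simply counts softmax scores below a candidate threshold:

```python
import torch


def measured_sparsity(attn_scores: torch.Tensor, threshold: float) -> float:
    """Fraction of softmax scores that a given threshold would skip."""
    probs = torch.softmax(attn_scores, dim=-1)
    return (probs < threshold).float().mean().item()


def calibrate_threshold(attn_scores, threshold_trials, target_sparse_ratio):
    """Pick the trial threshold whose measured sparsity is closest to the target."""
    return min(
        threshold_trials,
        key=lambda t: abs(measured_sparsity(attn_scores, t) - target_sparse_ratio),
    )


# Toy usage: random scores stand in for statistics collected on RULER samples.
scores = torch.randn(2, 8, 128, 128)  # (batch, heads, query_len, kv_len)
print(calibrate_threshold(scores, [1e-4, 1e-3, 1e-2, 1e-1], target_sparse_ratio=0.5))
```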

### Command Line Arguments

| Argument | Default | Description |
|----------|---------|-------------|
| `--pyt_ckpt_path` | Required | HuggingFace model path or name |
| `--sparse_attn` | `skip_softmax` | Configuration: `skip_softmax` or `skip_softmax_calib` |
| `--backend` | `pytorch` | Backend: `pytorch` (only supported backend) |
| `--seq_len` | `2048` | Maximum sequence length for input prompts |
| `--export_dir` | `None` | Directory to export the sparsified model |

## Output Comparison

The script automatically compares outputs before and after applying sparse attention (a minimal sketch of this flow follows the list):

1. Loads a test sample from the NarrativeQA dataset
2. Generates text before sparse attention is applied
3. Applies sparse attention (with optional calibration)
4. Generates text after sparse attention is applied
5. Compares and displays both outputs
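
A minimal sketch of that before/after flow, using the same `mtsa.sparsify` entry point as the Quick Example (the model name and prompt are placeholders; the actual script reads its sample from NarrativeQA):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B", attn_implementation="eager", torch_dtype=torch.bfloat16
).to("cuda")

inputs = tokenizer("A long passage standing in for a NarrativeQA sample ...", return_tensors="pt").to("cuda")

# Steps 1-2: generate before sparse attention is applied.
before = tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True)

# Steps 3-4: apply sparse attention, then generate again with the same prompt.
model = mtsa.sparsify(model, config=SKIP_SOFTMAX_DEFAULT)
after = tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True)

# Step 5: compare both outputs.
print("=== before ===\n", before)
print("=== after ===\n", after)
```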

## Export Model

Export the sparsified model to a HuggingFace checkpoint:

```bash
python examples/llm_sparsity/attention_sparsity/hf_sa.py \
--pyt_ckpt_path Qwen/Qwen3-8B \
--sparse_attn skip_softmax_calib \
--export_dir ./exported_sparse_model
```

The exported model can be loaded and used with standard HuggingFace APIs.
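
For example, a minimal sketch of loading the exported checkpoint with the standard HuggingFace API (the path matches the command above):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the checkpoint exported by hf_sa.py like any other HuggingFace model.
tokenizer = AutoTokenizer.from_pretrained("./exported_sparse_model")
model = AutoModelForCausalLM.from_pretrained(
    "./exported_sparse_model", torch_dtype=torch.bfloat16
).to("cuda")

inputs = tokenizer("The capital of France is", return_tensors="pt").to("cuda")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0], skip_special_tokens=True))
```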

## Custom Configuration

You can create custom sparse attention configurations:

```python
custom_config = {
"sparse_cfg": {
"calibration": { # Optional: omit for fixed threshold
"target_sparse_ratio": {"prefill": 0.5, "decode": 0.5}, # Target 50% sparsity
"samples": 128, # Number of calibration samples
"max_seqlen": 8192, # Maximum sequence length
# Optional: customize threshold trials for calibration
"threshold_trials": [1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 3e-1, 5e-1, 7e-1],
},
"*attn*": { # Pattern to match attention modules
"method": "flash_skip_softmax",
"threshold": {"prefill": 1e-3, "decode": 1e-4}, # Phase-specific thresholds (ignored if calibration is used)
"br": 128, # Flash Attention block rows
"bc": 128, # Flash Attention block columns
"backend": "pytorch",
"collect_stats": True,
"enable": True,
},
"default": {"enable": False},
},
}

model = mtsa.sparsify(model, config=custom_config)
```

## References

- [TensorRT Model Optimizer Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/)
- [RULER: What's the Real Context Size of Your Long-Context Language Models?](https://github.com/NVIDIA/RULER)