20 changes: 20 additions & 0 deletions examples/llm_eval/lm_eval_hf.py
@@ -43,9 +43,11 @@
from lm_eval.api.model import T
from lm_eval.models.huggingface import HFLM
from quantization_utils import quantize_model
from sparse_attention_utils import sparsify_model

import modelopt.torch.opt as mto
from modelopt.torch.quantization.utils import is_quantized
from modelopt.torch.sparsity.attention_sparsity.conversion import is_attn_sparsified


def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict | None = None) -> T:
@@ -60,6 +62,9 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
calib_size = arg_dict.pop("calib_size", 512)
compress = arg_dict.pop("compress", False)

# Sparse attention arguments
sparse_cfg = arg_dict.pop("sparse_cfg", None)

additional_config = {} if additional_config is None else additional_config
additional_config = {k: v for k, v in additional_config.items() if v is not None}

@@ -91,6 +96,15 @@ def create_from_arg_obj(cls: type[T], arg_dict: dict, additional_config: dict |
auto_quantize_checkpoint=auto_quantize_checkpoint,
)

if sparse_cfg:
if is_attn_sparsified(model_obj.model):
warnings.warn("Skipping sparse attention: model already has sparse attention applied.")
else:
sparsify_model(
model=model_obj,
sparse_cfg=sparse_cfg,
)

return model_obj


Expand Down Expand Up @@ -152,6 +166,11 @@ def setup_parser_with_modelopt_args():
action="store_true",
help="Compress the model after quantization",
)
parser.add_argument(
"--sparse_cfg",
type=str,
help="Sparse attention configuration (e.g., SKIP_SOFTMAX_DEFAULT, SKIP_SOFTMAX_CALIB)",
)
return parser


@@ -177,6 +196,7 @@ def setup_parser_with_modelopt_args():
"calib_batch_size": args.calib_batch_size,
"calib_size": args.calib_size,
"compress": args.compress,
"sparse_cfg": args.sparse_cfg,
}
)

17 changes: 17 additions & 0 deletions examples/llm_eval/mmlu.py
@@ -48,6 +48,7 @@
from fire import Fire
from modeling import EvalModel, select_model
from quantization_utils import MAX_SEQ_LEN, get_tokenizer, quantize_model
from sparse_attention_utils import sparsify_model
from tqdm import tqdm

try:
@@ -56,6 +57,7 @@
LLM = None # type: ignore[misc]
import modelopt.torch.opt as mto
from modelopt.torch.quantization.utils import is_quantized
from modelopt.torch.sparsity.attention_sparsity.conversion import is_attn_sparsified

os.environ["TOKENIZERS_PARALLELISM"] = "false"

@@ -230,6 +232,7 @@ def main(
auto_quantize_method: str = "gradient",
auto_quantize_score_size: int = 128,
auto_quantize_checkpoint: str | None = None,
sparse_cfg: str | None = None,
**kwargs,
):
random.seed(RAND_SEED)
@@ -289,6 +292,20 @@ def main(
auto_quantize_checkpoint=auto_quantize_checkpoint,
)

# Apply sparse attention if requested
if sparse_cfg:
model.load()

if is_attn_sparsified(model.model):
warnings.warn(
"Skipping sparse attention: model already has sparse attention applied."
)
else:
sparsify_model(
model=model,
sparse_cfg=sparse_cfg,
)

for subject in tqdm(subjects):
dev_df = pd.read_csv(os.path.join(data_dir, "dev", subject + "_dev.csv"), header=None)[
:ntrain
5 changes: 5 additions & 0 deletions examples/llm_eval/modeling.py
Contributor: same as https://github.com/NVIDIA/Model-Optimizer/pull/538/files#r2646356349 and avoid repeated attention modification

Contributor Author: I've removed this check.

@@ -179,6 +179,7 @@ class SeqToSeqModel(EvalModel):
lora_path: str = ""
device: str = "cuda"
load_8bit: bool = False
attn_implementation: str | None = None

def load(self):
if self.model is None:
@@ -188,6 +189,8 @@ def load(self):
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
if self.attn_implementation:
args["attn_implementation"] = self.attn_implementation
self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_path, **args)
print_gpu_utilization()
if self.lora_path:
@@ -241,6 +244,8 @@ def load(self):
if self.load_8bit:
args.update(device_map="auto", load_in_8bit=True)
args.update(torch_dtype=getattr(torch, self.dtype) if self.dtype != "auto" else "auto")
if self.attn_implementation:
args["attn_implementation"] = self.attn_implementation
self.model = AutoModelForCausalLM.from_pretrained(
self.model_path, trust_remote_code=True, **args
)
78 changes: 78 additions & 0 deletions examples/llm_eval/sparse_attention_utils.py
@@ -0,0 +1,78 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Utilities for sparse attention integration with llm_eval."""

import modelopt.torch.sparsity.attention_sparsity as mtsa


def _extract_model(model_obj):
"""Extract actual model from wrapper (HFLM or EvalModel)."""
if hasattr(model_obj, "gpt2"):
return model_obj.gpt2
elif hasattr(model_obj, "model"):
return model_obj.model
else:
return model_obj


def sparsify_model(
model,
sparse_cfg: str,
backend=None,
):
"""Apply sparse attention to model with optional RULER calibration.

Args:
model: Model wrapper (HFLM or EvalModel) or raw model
sparse_cfg: Sparse attention config name or dict
backend: Backend to use (optional, overrides config backend)

Returns:
The model with sparse attention applied

Note:
Calibration is automatically triggered if the config contains a 'calibration' field.
It auto-generates the RULER dataset using the model's tokenizer.
"""
# Extract actual model
net = _extract_model(model)

# Resolve config
if isinstance(sparse_cfg, str):
# Get config from mtsa module (e.g., SKIP_SOFTMAX_CALIB, SKIP_SOFTMAX_DEFAULT)
mtsa_cfg = getattr(mtsa, sparse_cfg, None)
if mtsa_cfg is None:
raise ValueError(f"Unknown sparse_cfg: {sparse_cfg}.")
else:
mtsa_cfg = sparse_cfg

# Override backend if specified
if backend:
if isinstance(mtsa_cfg, dict) and "sparse_cfg" in mtsa_cfg:
modified_sparse_cfg = {}
for pattern, cfg in mtsa_cfg["sparse_cfg"].items():
modified_cfg = cfg.copy() if isinstance(cfg, dict) else cfg
if isinstance(modified_cfg, dict):
modified_cfg["backend"] = backend
modified_sparse_cfg[pattern] = modified_cfg
mtsa_cfg = {"sparse_cfg": modified_sparse_cfg}

# Apply sparsification
print(f"\nApplying sparse attention with config: {sparse_cfg}")
mtsa.sparsify(net, mtsa_cfg)
print("Sparse attention applied successfully!")

return model
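For reference, a minimal sketch of how this helper could be called on a raw HuggingFace model (the model name is illustrative; `SKIP_SOFTMAX_DEFAULT` is resolved by name from `mtsa`, as the function above does):

```python
import torch
from transformers import AutoModelForCausalLM

from sparse_attention_utils import sparsify_model

# Eager attention is needed so the softmax patching can take effect.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B",
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
)

# Pass a config name (resolved via getattr on mtsa) or a config dict.
model = sparsify_model(model, sparse_cfg="SKIP_SOFTMAX_DEFAULT")
```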
2 changes: 2 additions & 0 deletions examples/llm_sparsity/attention_sparsity/.gitignore
@@ -0,0 +1,2 @@
# Data directory for calibration
data
165 changes: 165 additions & 0 deletions examples/llm_sparsity/attention_sparsity/README.md
@@ -0,0 +1,165 @@
# Attention Sparsity for HuggingFace Models

In this tutorial, we demonstrate how to use NVIDIA TensorRT Model Optimizer to apply attention sparsity to HuggingFace models. Attention sparsity reduces computational cost by skipping near-zero attention scores during the softmax computation.

## Getting Started

### Quick Example

```python
import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT

# Load your model
model = AutoModelForCausalLM.from_pretrained(
"Qwen/Qwen3-8B",
attn_implementation="eager", # Required for sparse attention
torch_dtype=torch.bfloat16,
)

# Apply sparse attention
model = mtsa.sparsify(model, config=SKIP_SOFTMAX_DEFAULT)
```
Comment on lines +9 to +22

Contributor:

⚠️ Potential issue | 🟡 Minor

Add missing imports in the quick example.

The code example is missing necessary imports for `AutoModelForCausalLM` and `torch`, which would cause the example to fail if copied as-is.

📝 Suggested fix

```diff
+import torch
+from transformers import AutoModelForCausalLM
+
 import modelopt.torch.sparsity.attention_sparsity as mtsa
 from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT
```


> [!NOTE]
> `attn_implementation="eager"` is required for sparse attention to work properly. Flash Attention 2 or SDPA would bypass the softmax patching needed for stats collection.

## Configuration Options

Two pre-defined configurations are available:

### 1. Fixed Threshold (SKIP_SOFTMAX_DEFAULT)

Uses a fixed threshold value. Simple but may not be optimal for all sequence lengths.

```python
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT

model = mtsa.sparsify(model, config=SKIP_SOFTMAX_DEFAULT)
```

### 2. Calibrated Threshold (SKIP_SOFTMAX_CALIB)

Uses RULER-based calibration to determine an optimal dynamic threshold that adapts to sequence length. Recommended for production use.

```python
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_CALIB

model = mtsa.sparsify(model, config=SKIP_SOFTMAX_CALIB)
```

## Prerequisites

### Local Installation

For Hugging Face models, install Model Optimizer with `hf` dependencies using `pip` from [PyPI](https://pypi.org/project/nvidia-modelopt/) and install the requirements for the example:

```bash
pip install nvidia-modelopt[hf]
```

### Download RULER Calibration Data (Required for Calibration)

If using `SKIP_SOFTMAX_CALIB`, you need to download the RULER calibration dataset first:

```bash
bash modelopt/torch/sparsity/attention_sparsity/calibration/download_ruler_data.sh
```

This downloads the Paul Graham essays dataset used for generating calibration samples.

## Run Sparse Attention on HuggingFace Models

### Basic Usage (Without Calibration)

Apply sparse attention with a fixed threshold:

```bash
python examples/llm_sparsity/attention_sparsity/hf_sa.py \
--pyt_ckpt_path Qwen/Qwen3-8B \
--sparse_attn skip_softmax
```

### With RULER Calibration

Apply sparse attention with calibrated thresholds for optimal sparsity:

```bash
python examples/llm_sparsity/attention_sparsity/hf_sa.py \
--pyt_ckpt_path Qwen/Qwen3-8B \
--sparse_attn skip_softmax_calib
```

The calibration process (a toy sketch of the threshold search follows this list):

1. Generates RULER calibration samples
2. Collects attention statistics during forward passes
3. Determines optimal threshold scale factor for target sparsity ratio
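
The threshold search in step 3 can be pictured with a toy example. This is only an illustration, not the library's calibration code; `measured_sparsity` here simply counts softmax scores below a candidate threshold:

```python
import torch


def measured_sparsity(attn_scores: torch.Tensor, threshold: float) -> float:
    """Fraction of softmax scores that a given threshold would skip."""
    probs = torch.softmax(attn_scores, dim=-1)
    return (probs < threshold).float().mean().item()


def calibrate_threshold(attn_scores, threshold_trials, target_sparse_ratio):
    """Pick the trial threshold whose measured sparsity is closest to the target."""
    return min(
        threshold_trials,
        key=lambda t: abs(measured_sparsity(attn_scores, t) - target_sparse_ratio),
    )


# Toy usage: random scores stand in for statistics collected on RULER samples.
scores = torch.randn(2, 8, 128, 128)  # (batch, heads, query_len, kv_len)
print(calibrate_threshold(scores, [1e-4, 1e-3, 1e-2, 1e-1], target_sparse_ratio=0.5))
```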

### Command Line Arguments

| Argument | Default | Description |
|----------|---------|-------------|
| `--pyt_ckpt_path` | Required | HuggingFace model path or name |
| `--sparse_attn` | `skip_softmax` | Configuration: `skip_softmax` or `skip_softmax_calib` |
| `--backend` | `pytorch` | Backend: `pytorch` (only supported backend) |
| `--seq_len` | `2048` | Maximum sequence length for input prompts |
| `--export_dir` | `None` | Directory to export the sparsified model |

## Output Comparison

The script automatically compares outputs before and after applying sparse attention (a minimal sketch of this flow follows the list):

1. Loads a test sample from the NarrativeQA dataset
2. Generates text before sparse attention is applied
3. Applies sparse attention (with optional calibration)
4. Generates text after sparse attention is applied
5. Compares and displays both outputs
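
A minimal sketch of that before/after flow, using the same `mtsa.sparsify` entry point as the Quick Example (the model name and prompt are placeholders; the actual script reads its sample from NarrativeQA):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.sparsity.attention_sparsity.config import SKIP_SOFTMAX_DEFAULT

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-8B", attn_implementation="eager", torch_dtype=torch.bfloat16
).to("cuda")

inputs = tokenizer("A long passage standing in for a NarrativeQA sample ...", return_tensors="pt").to("cuda")

# Steps 1-2: generate before sparse attention is applied.
before = tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True)

# Steps 3-4: apply sparse attention, then generate again with the same prompt.
model = mtsa.sparsify(model, config=SKIP_SOFTMAX_DEFAULT)
after = tokenizer.decode(model.generate(**inputs, max_new_tokens=64)[0], skip_special_tokens=True)

# Step 5: compare both outputs.
print("=== before ===\n", before)
print("=== after ===\n", after)
```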

## Export Model

Export the sparsified model to a HuggingFace checkpoint:

```bash
python examples/llm_sparsity/attention_sparsity/hf_sa.py \
--pyt_ckpt_path Qwen/Qwen3-8B \
--sparse_attn skip_softmax_calib \
--export_dir ./exported_sparse_model
```

The exported model can be loaded and used with standard HuggingFace APIs.
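
For example, a minimal sketch of loading the exported checkpoint with the standard HuggingFace API (the path matches the command above):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the checkpoint exported by hf_sa.py like any other HuggingFace model.
tokenizer = AutoTokenizer.from_pretrained("./exported_sparse_model")
model = AutoModelForCausalLM.from_pretrained(
    "./exported_sparse_model", torch_dtype=torch.bfloat16
).to("cuda")

inputs = tokenizer("The capital of France is", return_tensors="pt").to("cuda")
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=16)[0], skip_special_tokens=True))
```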

## Custom Configuration

You can create custom sparse attention configurations:

```python
custom_config = {
"sparse_cfg": {
"calibration": { # Optional: omit for fixed threshold
"target_sparse_ratio": {"prefill": 0.5, "decode": 0.5}, # Target 50% sparsity
"samples": 128, # Number of calibration samples
"max_seqlen": 8192, # Maximum sequence length
# Optional: customize threshold trials for calibration
"threshold_trials": [1e-4, 5e-4, 1e-3, 5e-3, 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 3e-1, 5e-1, 7e-1],
},
"*attn*": { # Pattern to match attention modules
"method": "flash_skip_softmax",
"threshold": {"prefill": 1e-3, "decode": 1e-4}, # Phase-specific thresholds (ignored if calibration is used)
"br": 128, # Flash Attention block rows
"bc": 128, # Flash Attention block columns
"backend": "pytorch",
"collect_stats": True,
"enable": True,
},
"default": {"enable": False},
},
}

model = mtsa.sparsify(model, config=custom_config)
```

## References

- [TensorRT Model Optimizer Documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/)
- [RULER: What's the Real Context Size of Your Long-Context Language Models?](https://github.com/NVIDIA/RULER)