74 changes: 66 additions & 8 deletions tests/_test_utils/deploy_utils.py
@@ -19,6 +19,48 @@
 import pytest
 import torch
 
+# Cache for available backends detection (computed once at import time)
+_AVAILABLE_BACKENDS = None
+
+
+def get_available_backends():
+    """Detect which backends are available in the current environment.
+
+    Returns:
+        set: A set of available backend names ('trtllm', 'vllm', 'sglang')
+    """
+    global _AVAILABLE_BACKENDS
+    if _AVAILABLE_BACKENDS is not None:
+        return _AVAILABLE_BACKENDS
+
+    available = set()
+
+    try:
+        import tensorrt_llm  # noqa: F401
+
+        available.add("trtllm")
+    except ImportError:
+        pass
+
+    try:
+        import vllm  # noqa: F401
+
+        available.add("vllm")
+    except ImportError:
+        pass
+
+    try:
+        import sglang  # noqa: F401
+
+        available.add("sglang")
+    except ImportError:
+        pass
+
+    _AVAILABLE_BACKENDS = available
+    print(f"[deploy_utils] Detected available backends: {available}")
+    return _AVAILABLE_BACKENDS
+
+
 # Common test prompts for all backends
 COMMON_PROMPTS = [
     "Hello, my name is",
@@ -93,15 +135,18 @@ def _deploy_trtllm(self):
         try:
             from tensorrt_llm import LLM, SamplingParams
             from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
-        except ImportError:
-            pytest.skip("tensorrt_llm package not available")
+        except ImportError as e:
+            raise ImportError("tensorrt_llm package not available. ") from e
 
         sampling_params = SamplingParams(max_tokens=32)
         spec_config = None
         llm = None
         kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
 
-        if self.model_id == "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8":
+        if self.model_id in (
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
+        ):
             llm = LLM(
                 model=self.model_id,
                 tensor_parallel_size=self.tensor_parallel_size,
@@ -175,8 +220,8 @@ def _deploy_vllm(self):
"""Deploy a model using vLLM."""
try:
from vllm import LLM, SamplingParams
except ImportError:
pytest.skip("vllm package not available")
except ImportError as e:
raise ImportError("vllm package not available.") from e

quantization_method = "modelopt"
if "fp4" in self.model_id.lower():
@@ -212,8 +257,8 @@ def _deploy_sglang(self):
"""Deploy a model using SGLang."""
try:
import sglang as sgl
except ImportError:
pytest.skip("sglang package not available")
except ImportError as e:
raise ImportError("sglang package not available.") from e
quantization_method = "modelopt"
if "fp4" in self.model_id.lower():
quantization_method = "modelopt_fp4"
@@ -230,7 +275,10 @@ def _deploy_sglang(self):
                 mem_fraction_static=0.7,
                 context_length=1024,
             )
-        elif self.model_id == "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8":
+        elif self.model_id in (
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
+        ):
             llm = sgl.Engine(
                 model_path=self.model_id,
                 quantization=quantization_method,
@@ -259,10 +307,20 @@ def __init__(self, **params):
             else:
                 self.params[key] = [value]
 
+        # Filter backends to only include available ones
+        if "backend" in self.params:
+            available = get_available_backends()
+            original_backends = self.params["backend"]
+            self.params["backend"] = [b for b in original_backends if b in available]
+
         # Pre-generate all deployers for pytest compatibility
         self._deployers = list(self._generate_deployers())
 
     def _generate_deployers(self):
+        # If no backends available after filtering, yield nothing
+        if "backend" in self.params and not self.params["backend"]:
+            return
+
         for values in itertools.product(*self.params.values()):
             deployer = ModelDeployer(**dict(zip(self.params.keys(), values)))
             # Set test case ID in format "model_id_backend"
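
With this filtering, itertools.product never sees a backend that is not installed, so the corresponding test cases are simply not generated rather than generated and then skipped. A small self-contained sketch of the same idea (the parameter values below are illustrative):

import itertools

params = {
    "model_id": ["nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8"],
    "backend": ["trtllm", "vllm", "sglang"],
}

available = {"vllm"}  # stand-in for what get_available_backends() returned
params["backend"] = [b for b in params["backend"] if b in available]

# Only combinations for installed backends remain.
cases = [dict(zip(params.keys(), values)) for values in itertools.product(*params.values())]
assert cases == [{"model_id": "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8", "backend": "vllm"}]
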
17 changes: 7 additions & 10 deletions tests/examples/gpt_oss/test_gpt_oss_qat.py
@@ -294,51 +294,48 @@ def deploy_gpt_oss_trtllm(self, tmp_path, model_path_override=None):
 )
 def test_gpt_oss_complete_pipeline(model_path, tmp_path):
     """Test the complete GPT-OSS optimization pipeline by executing all 3 steps in sequence."""
-    import pathlib
 
-    # Use current directory instead of tmp_path for checkpoints
-    current_dir = pathlib.Path.cwd()
     # Create GPTOSS instance with model path
     gpt_oss = GPTOSS(model_path)
 
     if model_path == "openai/gpt-oss-20b":
         # Step 1: SFT Training
-        sft_checkpoint = gpt_oss.gpt_oss_sft_training(current_dir)
+        sft_checkpoint = gpt_oss.gpt_oss_sft_training(tmp_path)
         if not sft_checkpoint or not sft_checkpoint.exists():
             print("Step 1 failed: SFT checkpoint not found, stopping pipeline.")
             return
         print(f"Step 1 completed: SFT checkpoint at {sft_checkpoint}")
 
         # Step 2: QAT Training (depends on Step 1)
-        qat_checkpoint = gpt_oss.gpt_oss_qat_training(current_dir, sft_dir=sft_checkpoint)
+        qat_checkpoint = gpt_oss.gpt_oss_qat_training(tmp_path, sft_dir=sft_checkpoint)
         if not qat_checkpoint or not qat_checkpoint.exists():
             print("Step 2 failed: QAT checkpoint not found, stopping pipeline.")
             return
         print(f"Step 2 completed: QAT checkpoint at {qat_checkpoint}")
 
         # Step 3: MXFP4 Conversion (depends on Step 2)
-        mxfp4_checkpoint = gpt_oss.gpt_oss_mxfp4_conversion(current_dir, qat_dir=qat_checkpoint)
+        mxfp4_checkpoint = gpt_oss.gpt_oss_mxfp4_conversion(tmp_path, qat_dir=qat_checkpoint)
         if not mxfp4_checkpoint or not mxfp4_checkpoint.exists():
             print("Step 3 failed: MXFP4 checkpoint not found, stopping pipeline.")
             return
         print(f"Step 3 completed: MXFP4 checkpoint at {mxfp4_checkpoint}")
 
         # Step 4: Deploy with TensorRT-LLM (depends on Step 3)
         print("Step 4: Running deployment with MXFP4 checkpoint...")
-        gpt_oss.deploy_gpt_oss_trtllm(current_dir, model_path_override=mxfp4_checkpoint)
+        gpt_oss.deploy_gpt_oss_trtllm(tmp_path, model_path_override=mxfp4_checkpoint)
         print("Step 4 completed: Deployment successful")
 
     elif model_path == "openai/gpt-oss-120b":
         # Step 1: QAT Training with LoRA
-        qat_lora_checkpoint = gpt_oss.gpt_oss_qat_training_lora(current_dir)
+        qat_lora_checkpoint = gpt_oss.gpt_oss_qat_training_lora(tmp_path)
         if not qat_lora_checkpoint or not qat_lora_checkpoint.exists():
             print("Step 1 failed: QAT-LoRA checkpoint not found, stopping pipeline.")
             return
         print(f"Step 1 completed: QAT-LoRA checkpoint at {qat_lora_checkpoint}")
 
         # Step 2: MXFP4 Conversion for LoRA model (depends on Step 1)
         mxfp4_checkpoint = gpt_oss.gpt_oss_mxfp4_conversion_lora(
-            current_dir, qat_lora_dir=qat_lora_checkpoint
+            tmp_path, qat_lora_dir=qat_lora_checkpoint
         )
         if not mxfp4_checkpoint or not mxfp4_checkpoint.exists():
             print("Step 2 failed: MXFP4 checkpoint not found, stopping pipeline.")
@@ -347,5 +344,5 @@ def test_gpt_oss_complete_pipeline(model_path, tmp_path):

         # Step 3: Deploy with TensorRT-LLM (depends on Step 2)
         print("Step 3: Running deployment with MXFP4 checkpoint...")
-        gpt_oss.deploy_gpt_oss_trtllm(current_dir, model_path_override=mxfp4_checkpoint)
+        gpt_oss.deploy_gpt_oss_trtllm(tmp_path, model_path_override=mxfp4_checkpoint)
         print("Step 3 completed: Deployment successful")