diff --git a/tests/_test_utils/deploy_utils.py b/tests/_test_utils/deploy_utils.py
index 805624b8f..cfe489d6a 100644
--- a/tests/_test_utils/deploy_utils.py
+++ b/tests/_test_utils/deploy_utils.py
@@ -19,6 +19,48 @@
 import pytest
 import torch
 
+# Cache for available backends detection (computed once at import time)
+_AVAILABLE_BACKENDS = None
+
+
+def get_available_backends():
+    """Detect which backends are available in the current environment.
+
+    Returns:
+        set: A set of available backend names ('trtllm', 'vllm', 'sglang')
+    """
+    global _AVAILABLE_BACKENDS
+    if _AVAILABLE_BACKENDS is not None:
+        return _AVAILABLE_BACKENDS
+
+    available = set()
+
+    try:
+        import tensorrt_llm  # noqa: F401
+
+        available.add("trtllm")
+    except ImportError:
+        pass
+
+    try:
+        import vllm  # noqa: F401
+
+        available.add("vllm")
+    except ImportError:
+        pass
+
+    try:
+        import sglang  # noqa: F401
+
+        available.add("sglang")
+    except ImportError:
+        pass
+
+    _AVAILABLE_BACKENDS = available
+    print(f"[deploy_utils] Detected available backends: {available}")
+    return _AVAILABLE_BACKENDS
+
+
 # Common test prompts for all backends
 COMMON_PROMPTS = [
     "Hello, my name is",
@@ -93,15 +135,18 @@ def _deploy_trtllm(self):
         try:
             from tensorrt_llm import LLM, SamplingParams
             from tensorrt_llm.llmapi import CudaGraphConfig, EagleDecodingConfig, KvCacheConfig
-        except ImportError:
-            pytest.skip("tensorrt_llm package not available")
+        except ImportError as e:
+            raise ImportError("tensorrt_llm package not available. ") from e
 
         sampling_params = SamplingParams(max_tokens=32)
         spec_config = None
         llm = None
         kv_cache_config = KvCacheConfig(enable_block_reuse=True, free_gpu_memory_fraction=0.8)
 
-        if self.model_id == "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8":
+        if self.model_id in (
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
+        ):
             llm = LLM(
                 model=self.model_id,
                 tensor_parallel_size=self.tensor_parallel_size,
@@ -175,8 +220,8 @@ def _deploy_vllm(self):
         """Deploy a model using vLLM."""
         try:
             from vllm import LLM, SamplingParams
-        except ImportError:
-            pytest.skip("vllm package not available")
+        except ImportError as e:
+            raise ImportError("vllm package not available.") from e
 
         quantization_method = "modelopt"
         if "fp4" in self.model_id.lower():
@@ -212,8 +257,8 @@ def _deploy_sglang(self):
         """Deploy a model using SGLang."""
         try:
             import sglang as sgl
-        except ImportError:
-            pytest.skip("sglang package not available")
+        except ImportError as e:
+            raise ImportError("sglang package not available.") from e
         quantization_method = "modelopt"
         if "fp4" in self.model_id.lower():
             quantization_method = "modelopt_fp4"
@@ -230,7 +275,10 @@ def _deploy_sglang(self):
                 mem_fraction_static=0.7,
                 context_length=1024,
             )
-        elif self.model_id == "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8":
+        elif self.model_id in (
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-FP8",
+            "nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
+        ):
             llm = sgl.Engine(
                 model_path=self.model_id,
                 quantization=quantization_method,
@@ -259,10 +307,20 @@ def __init__(self, **params):
             else:
                 self.params[key] = [value]
 
+        # Filter backends to only include available ones
+        if "backend" in self.params:
+            available = get_available_backends()
+            original_backends = self.params["backend"]
+            self.params["backend"] = [b for b in original_backends if b in available]
+
         # Pre-generate all deployers for pytest compatibility
         self._deployers = list(self._generate_deployers())
 
     def _generate_deployers(self):
+        # If no backends available after filtering, yield nothing
+        if "backend" in self.params and not self.params["backend"]:
+            return
+
         for values in itertools.product(*self.params.values()):
             deployer = ModelDeployer(**dict(zip(self.params.keys(), values)))
             # Set test case ID in format "model_id_backend"
diff --git a/tests/examples/gpt_oss/test_gpt_oss_qat.py b/tests/examples/gpt_oss/test_gpt_oss_qat.py
index e5f9b8ab9..43464110b 100644
--- a/tests/examples/gpt_oss/test_gpt_oss_qat.py
+++ b/tests/examples/gpt_oss/test_gpt_oss_qat.py
@@ -294,30 +294,27 @@ def deploy_gpt_oss_trtllm(self, tmp_path, model_path_override=None):
 )
 def test_gpt_oss_complete_pipeline(model_path, tmp_path):
     """Test the complete GPT-OSS optimization pipeline by executing all 3 steps in sequence."""
-    import pathlib
 
-    # Use current directory instead of tmp_path for checkpoints
-    current_dir = pathlib.Path.cwd()
     # Create GPTOSS instance with model path
     gpt_oss = GPTOSS(model_path)
 
     if model_path == "openai/gpt-oss-20b":
         # Step 1: SFT Training
-        sft_checkpoint = gpt_oss.gpt_oss_sft_training(current_dir)
+        sft_checkpoint = gpt_oss.gpt_oss_sft_training(tmp_path)
         if not sft_checkpoint or not sft_checkpoint.exists():
             print("Step 1 failed: SFT checkpoint not found, stopping pipeline.")
             return
         print(f"Step 1 completed: SFT checkpoint at {sft_checkpoint}")
 
         # Step 2: QAT Training (depends on Step 1)
-        qat_checkpoint = gpt_oss.gpt_oss_qat_training(current_dir, sft_dir=sft_checkpoint)
+        qat_checkpoint = gpt_oss.gpt_oss_qat_training(tmp_path, sft_dir=sft_checkpoint)
         if not qat_checkpoint or not qat_checkpoint.exists():
             print("Step 2 failed: QAT checkpoint not found, stopping pipeline.")
             return
         print(f"Step 2 completed: QAT checkpoint at {qat_checkpoint}")
 
         # Step 3: MXFP4 Conversion (depends on Step 2)
-        mxfp4_checkpoint = gpt_oss.gpt_oss_mxfp4_conversion(current_dir, qat_dir=qat_checkpoint)
+        mxfp4_checkpoint = gpt_oss.gpt_oss_mxfp4_conversion(tmp_path, qat_dir=qat_checkpoint)
         if not mxfp4_checkpoint or not mxfp4_checkpoint.exists():
             print("Step 3 failed: MXFP4 checkpoint not found, stopping pipeline.")
             return
@@ -325,12 +322,12 @@ def test_gpt_oss_complete_pipeline(model_path, tmp_path):
 
         # Step 4: Deploy with TensorRT-LLM (depends on Step 3)
         print("Step 4: Running deployment with MXFP4 checkpoint...")
-        gpt_oss.deploy_gpt_oss_trtllm(current_dir, model_path_override=mxfp4_checkpoint)
+        gpt_oss.deploy_gpt_oss_trtllm(tmp_path, model_path_override=mxfp4_checkpoint)
         print("Step 4 completed: Deployment successful")
 
     elif model_path == "openai/gpt-oss-120b":
         # Step 1: QAT Training with LoRA
-        qat_lora_checkpoint = gpt_oss.gpt_oss_qat_training_lora(current_dir)
+        qat_lora_checkpoint = gpt_oss.gpt_oss_qat_training_lora(tmp_path)
         if not qat_lora_checkpoint or not qat_lora_checkpoint.exists():
             print("Step 1 failed: QAT-LoRA checkpoint not found, stopping pipeline.")
             return
@@ -338,7 +335,7 @@ def test_gpt_oss_complete_pipeline(model_path, tmp_path):
         # Step 2: MXFP4 Conversion for LoRA model (depends on Step 1)
         mxfp4_checkpoint = gpt_oss.gpt_oss_mxfp4_conversion_lora(
-            current_dir, qat_lora_dir=qat_lora_checkpoint
+            tmp_path, qat_lora_dir=qat_lora_checkpoint
         )
         if not mxfp4_checkpoint or not mxfp4_checkpoint.exists():
             print("Step 2 failed: MXFP4 checkpoint not found, stopping pipeline.")
             return
@@ -347,5 +344,5 @@ def test_gpt_oss_complete_pipeline(model_path, tmp_path):
 
         # Step 3: Deploy with TensorRT-LLM (depends on Step 2)
         print("Step 3: Running deployment with MXFP4 checkpoint...")
-        gpt_oss.deploy_gpt_oss_trtllm(current_dir, model_path_override=mxfp4_checkpoint)
+        gpt_oss.deploy_gpt_oss_trtllm(tmp_path, model_path_override=mxfp4_checkpoint)
         print("Step 3 completed: Deployment successful")
diff --git a/tests/examples/llm_ptq/test_deploy.py b/tests/examples/llm_ptq/test_deploy.py
index 868304f48..4dd98ad9d 100644
--- a/tests/examples/llm_ptq/test_deploy.py
+++ b/tests/examples/llm_ptq/test_deploy.py
@@ -60,31 +60,43 @@ def cleanup_after_test():
     "command",
     [
         *ModelDeployerList(
-            model_id="nvidia/DeepSeek-R1-FP4",
+            model_id="nvidia/DeepSeek-R1-NVFP4",
             backend=("vllm", "trtllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
         ),
         *ModelDeployerList(
-            model_id="nvidia/DeepSeek-R1-FP4-v2",
+            model_id="nvidia/DeepSeek-R1-NVFP4-v2",
             backend=("vllm", "trtllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
         ),
         *ModelDeployerList(
-            model_id="nvidia/DeepSeek-R1-0528-FP4",
+            model_id="nvidia/DeepSeek-R1-0528-NVFP4",
             backend=("vllm", "trtllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
         ),
         *ModelDeployerList(
-            model_id="nvidia/DeepSeek-R1-0528-FP4-v2",
+            model_id="nvidia/DeepSeek-R1-0528-NVFP4-v2",
             backend=("vllm", "trtllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
         ),
         *ModelDeployerList(
-            model_id="nvidia/DeepSeek-V3-0324-FP4",
+            model_id="nvidia/DeepSeek-V3-0324-NVFP4",
+            backend=("vllm", "trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/DeepSeek-V3.1-NVFP4",
+            backend=("vllm", "trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/DeepSeek-V3.2-NVFP4",
             backend=("vllm", "trtllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
@@ -107,7 +119,7 @@ def test_deepseek(command):
             mini_sm=89,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Llama-3.1-8B-Instruct-FP4",
+            model_id="nvidia/Llama-3.1-8B-Instruct-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=1,
             mini_sm=100,
@@ -119,7 +131,7 @@ def test_deepseek(command):
             tensor_parallel_size=4,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Llama-3.3-70B-Instruct-FP4",
+            model_id="nvidia/Llama-3.3-70B-Instruct-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=4,
             mini_sm=100,
@@ -136,7 +148,7 @@ def test_deepseek(command):
             tensor_parallel_size=8,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Llama-3.1-405B-Instruct-FP4",
+            model_id="nvidia/Llama-3.1-405B-Instruct-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
@@ -148,7 +160,7 @@ def test_deepseek(command):
             tensor_parallel_size=8,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Llama-4-Maverick-17B-128E-Instruct-FP4",
+            model_id="nvidia/Llama-4-Maverick-17B-128E-Instruct-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
@@ -160,7 +172,7 @@ def test_deepseek(command):
             tensor_parallel_size=8,
             mini_sm=89,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Llama-4-Scout-17B-16E-Instruct-FP4",
+            model_id="nvidia/Llama-4-Scout-17B-16E-Instruct-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
@@ -176,7 +188,7 @@ def test_llama(command):
     "command",
     [
         *ModelDeployerList(
-            model_id="nvidia/Qwen3-8B-FP4",
+            model_id="nvidia/Qwen3-8B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=1,
             mini_sm=100,
@@ -188,7 +200,7 @@ def test_llama(command):
             mini_sm=89,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Qwen3-14B-FP4",
+            model_id="nvidia/Qwen3-14B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=1,
             mini_sm=100,
@@ -200,7 +212,7 @@ def test_llama(command):
             mini_sm=89,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Qwen3-235B-A22B-FP4",
+            model_id="nvidia/Qwen3-235B-A22B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=2,
             mini_sm=100,
@@ -212,16 +224,16 @@ def test_llama(command):
             mini_sm=89,
         ),
         *ModelDeployerList(
-            model_id="nvidia/QwQ-32B-FP4", backend=("trtllm", "vllm", "sglang"), mini_sm=100
+            model_id="nvidia/QwQ-32B-NVFP4", backend=("trtllm", "vllm", "sglang"), mini_sm=100
         ),
         *ModelDeployerList(
-            model_id="nvidia/Qwen3-32B-FP4",
+            model_id="nvidia/Qwen3-32B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=4,
             mini_sm=100,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Qwen2.5-VL-7B-Instruct-FP4",
+            model_id="nvidia/Qwen2.5-VL-7B-Instruct-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=4,
             mini_sm=100,
@@ -233,11 +245,23 @@ def test_llama(command):
             mini_sm=100,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Qwen3-30B-A3B-FP4",
+            model_id="nvidia/Qwen3-30B-A3B-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=4,
             mini_sm=100,
         ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-Next-80B-A3B-Instruct-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
+        *ModelDeployerList(
+            model_id="nvidia/Qwen3-Next-80B-A3B-Thinking-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=100,
+        ),
     ],
     ids=idfn,
 )
@@ -252,11 +276,10 @@ def test_qwen(command):
             model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-FP8", backend=("trtllm", "vllm", "sglang")
         ),
         *ModelDeployerList(
-            model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-FP4",
+            model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             mini_sm=100,
         ),
-        # ModelDeployer(model_id="nvidia/Mixtral-8x7B-Instruct-v0.1-FP8", backend="sglang"), unsupported
     ],
     ids=idfn,
 )
@@ -266,9 +289,9 @@ def test_mixtral(command):
 
 @pytest.mark.parametrize(
     "command",
-    [  # TRTLLM bug: https://nvbugs/5451286
+    [
         *ModelDeployerList(
-            model_id="nvidia/gemma-3-12b-it-FP4",
+            model_id="nvidia/gemma-3-12b-it-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=1,
             mini_sm=100,
@@ -282,7 +305,7 @@ def test_mixtral(command):
             attn_backend="FLASHINFER",
         ),
         *ModelDeployerList(
-            model_id="nvidia/gemma-3-27b-it-FP4",
+            model_id="nvidia/gemma-3-27b-it-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=1,
             mini_sm=100,
@@ -307,7 +330,7 @@ def test_gemma(command):
     "command",
     [
         *ModelDeployerList(
-            model_id="nvidia/Phi-4-multimodal-instruct-FP4",
+            model_id="nvidia/Phi-4-multimodal-instruct-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=1,
             mini_sm=100,
@@ -319,7 +342,7 @@ def test_gemma(command):
             mini_sm=89,
         ),
         *ModelDeployerList(
-            model_id="nvidia/Phi-4-reasoning-plus-FP4",
+            model_id="nvidia/Phi-4-reasoning-plus-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=1,
             mini_sm=100,
@@ -341,7 +364,7 @@ def test_phi(command):
     "command",
     [
         *ModelDeployerList(
-            model_id="nvidia/Kimi-K2-Instruct-FP4",
+            model_id="nvidia/Kimi-K2-Instruct-NVFP4",
             backend=("trtllm", "vllm", "sglang"),
             tensor_parallel_size=8,
             mini_sm=100,
@@ -374,12 +397,6 @@ def test_kimi(command):
             tensor_parallel_size=1,
             mini_sm=89,
         ),
-        *ModelDeployerList(
-            model_id="nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
-            backend=("trtllm", "vllm", "sglang"),
-            tensor_parallel_size=4,
-            mini_sm=89,
-        ),
         *ModelDeployerList(
             model_id="nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
             backend=("vllm",),
@@ -393,6 +410,13 @@ def test_kimi(command):
             mini_sm=89,
             attn_backend="FLASHINFER",
         ),
+        *ModelDeployerList(
+            model_id="nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-NVFP4",
+            backend=("trtllm", "vllm", "sglang"),
+            tensor_parallel_size=1,
+            mini_sm=89,
+            attn_backend="FLASHINFER",
+        ),
     ],
     ids=idfn,
 )
@@ -454,6 +478,14 @@ def test_medusa(command):
             mini_sm=89,
             eagle3_one_model=False,
         ),
+        *ModelDeployerList(
+            base_model="Qwen/Qwen3-235B-A22B-Thinking-2507",
+            model_id="nvidia/Qwen3-235B-A22B-Thinking-2507-FP4-Eagle3",
+            backend=("trtllm", "sglang"),
+            tensor_parallel_size=8,
+            mini_sm=89,
+            eagle3_one_model=False,
+        ),
         *ModelDeployerList(
             base_model="Qwen/Qwen3-30B-A3B",
             model_id="nvidia/Qwen3-30B-A3B-Eagle3",