diff --git a/ATTRIBUTIONS-Python.md b/ATTRIBUTIONS-Python.md
index e6917c6e25..dca9e142a7 100644
--- a/ATTRIBUTIONS-Python.md
+++ b/ATTRIBUTIONS-Python.md
@@ -441,7 +441,7 @@ License: `Apache`
 - `Homepage`: https://github.com/huggingface/accelerate

-## aiconfigurator (0.2.0)
+## aiconfigurator (0.4.0)

 ### Licenses

 License: `Apache-2.0`
diff --git a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
index 2c2784c561..966bf9319b 100644
--- a/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_aic_dgdr.yaml
@@ -19,7 +19,7 @@ spec:
       # AI Configurator mode (fast simulation-based profiling)
       use_ai_configurator: true
       aic_system: h200_sxm
-      aic_model_name: QWEN3_32B
+      aic_hf_id: Qwen/Qwen3-32B
       aic_backend_version: "0.20.0"

       # SLA targets for profiling
diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py
index 7e8fc16acf..e370fcaa43 100644
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -143,9 +143,6 @@ async def run_profile(args):
             assert args.backend in [
                 "sglang"
             ], "MoE model support is only available for SGLang"
-            assert (
-                not args.use_ai_configurator
-            ), "MoE model is not supported in ai-configurator"
         else:
             logger.info(
                 "Dense model profiling, sweeping TP size for prefill and decode"
@@ -204,26 +201,30 @@ async def run_profile(args):
             raise ValueError(
                 "Must provide --aic-system when using --use-ai-configurator."
             )
-        if not args.aic_model_name:
-            raise ValueError(
-                "Must provide --aic-model-name when using --use-ai-configurator."
-            )
-        if not args.aic_backend_version:
-            raise ValueError(
-                "Must provide --aic-backend-version when using --use-ai-configurator."
-            )
+
+        # Fall back to args.model if aic_hf_id is not provided
+        if not args.aic_hf_id:
+            if args.model:
+                logger.info(
+                    f"--aic-hf-id not provided, using --model ({args.model}) as HuggingFace ID for AI configurator"
+                )
+                args.aic_hf_id = args.model
+            else:
+                raise ValueError(
+                    "Must provide --aic-hf-id or --model when using --use-ai-configurator."
+                )

         logger.info("Using aiconfigurator to estimate performance...")
         ai_configurator_perf_estimator = AIConfiguratorPerfEstimator(
-            args.aic_model_name,
+            args.aic_hf_id,
             args.aic_system.lower(),
             args.aic_backend,
             args.aic_backend_version,
         )
     else:
-        if args.aic_system or args.aic_model_name or args.aic_backend_version:
+        if args.aic_system or args.aic_hf_id or args.aic_backend_version:
             logger.warning(
-                "Ignoring --aic-system, --aic-model-name, and/or --backend-version "
+                "Ignoring --aic-system, --aic-hf-id, and/or --backend-version "
                 "when not using --use-ai-configurator."
             )

diff --git a/benchmarks/profiler/utils/estimate_perf.py b/benchmarks/profiler/utils/estimate_perf.py
index a6abc0d096..be6b12cc50 100644
--- a/benchmarks/profiler/utils/estimate_perf.py
+++ b/benchmarks/profiler/utils/estimate_perf.py
@@ -36,7 +36,7 @@ class AIConfiguratorPerfEstimator:

     def __init__(
         self,
-        model_name: str,  # e.g. "QWEN3_32B"
+        hf_id: str,  # e.g. "Qwen/Qwen3-32B"
         system: str,  # e.g. "h200_sxm"
         backend: str,  # e.g. "trtllm"
         version: str,  # e.g. "0.20.0"
@@ -44,6 +44,11 @@ def __init__(
         aiconfigurator = _try_import_aiconfigurator()

         logger.info("Loading aiconfigurator database. This might take a few seconds...")
+        if not version:
+            version = aiconfigurator.sdk.perf_database.get_latest_database_version(
+                system,
+                backend,
+            )
         self.database = aiconfigurator.sdk.perf_database.get_database(
             system=system,
             backend=backend,
@@ -56,10 +61,7 @@ def __init__(
         logger.info("aiconfigurator database loaded.")

         self.backend = aiconfigurator.sdk.backends.factory.get_backend(backend)
-
-        # This is the aiconfigurator model name (such as QWEN3_32B or DEEPSEEK_V3)
-        # rather than the HF model name.
-        self.model_name = model_name
+        self.hf_id = hf_id

     def _get_model(self, **model_config_kwargs):
         aiconfigurator = _try_import_aiconfigurator()
@@ -67,7 +69,7 @@ def _get_model(self, **model_config_kwargs):

         # NOTE: MOE models error out unless moe_tp_size and moe_ep_size are provided.
         model_config = aiconfigurator.sdk.config.ModelConfig(**model_config_kwargs)
         model = aiconfigurator.sdk.models.get_model(
-            self.model_name, model_config, self.backend
+            self.hf_id, model_config, self.backend
         )
         return model
diff --git a/benchmarks/profiler/utils/profiler_argparse.py b/benchmarks/profiler/utils/profiler_argparse.py
index b42d96880a..4a35ef8387 100644
--- a/benchmarks/profiler/utils/profiler_argparse.py
+++ b/benchmarks/profiler/utils/profiler_argparse.py
@@ -80,7 +80,7 @@ def create_profiler_parser() -> argparse.Namespace:
         decode_interpolation_granularity: Int (how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length, default: 6)
         use_ai_configurator: Boolean (use ai-configurator to estimate benchmarking results instead of running actual deployment, default: False)
         aic_system: String (target system for use with aiconfigurator, default: None)
-        aic_model_name: String (aiconfigurator name of the target model, default: None)
+        aic_hf_id: String (HuggingFace ID of the target model for aiconfigurator, default: None)
         aic_backend: String (aiconfigurator backend of the target model, if not provided, will use args.backend, default: "")
         aic_backend_version: String (specify backend version when using aiconfigurator to estimate perf, default: None)
         dry_run: Boolean (dry run the profile job, default: False)
@@ -260,10 +260,10 @@ def create_profiler_parser() -> argparse.Namespace:
         help="Target system for use with aiconfigurator (e.g. h100_sxm, h200_sxm)",
     )
     parser.add_argument(
-        "--aic-model-name",
+        "--aic-hf-id",
         type=str,
-        default=config.get("sweep", {}).get("aic_model_name"),
-        help="aiconfigurator name of the target model (e.g. QWEN3_32B, DEEPSEEK_V3)",
+        default=config.get("sweep", {}).get("aic_hf_id"),
+        help="HuggingFace ID of the target model (e.g. Qwen/Qwen3-32B, meta-llama/Llama-3.1-405B)",
     )
     parser.add_argument(
         "--aic-backend",
diff --git a/benchmarks/pyproject.toml b/benchmarks/pyproject.toml
index d99b7c611c..9ee8804cd9 100644
--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -40,7 +40,7 @@ classifiers = [
 ]

 dependencies = [
-    "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@11b6d821f1fbb34300bb0ed4945f647e89fb411a",
+    "aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759",
     "networkx",
     "pandas",
     "pydantic>=2",
diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt
index cec76abeab..d69c6b659c 100644
--- a/container/deps/requirements.txt
+++ b/container/deps/requirements.txt
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0

 accelerate==1.6.0
-aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@11b6d821f1fbb34300bb0ed4945f647e89fb411a
+aiconfigurator @ git+https://github.com/ai-dynamo/aiconfigurator.git@5554d2eb8206738c66048bf2d72183e9bcd85759
 aiofiles
 aiperf @ git+https://github.com/ai-dynamo/aiperf.git@16dad7c02fcd959ba96823d7bfe7e681e5d5b41d
 av==15.0.0
diff --git a/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml b/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
index a7ffc60023..69af2dcb25 100644
--- a/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
+++ b/deploy/cloud/operator/config/samples/nvidia.com_v1alpha1_dynamographdeploymentrequest.yaml
@@ -53,7 +53,7 @@ spec:
       # AI Configurator mode (fast simulation-based profiling, 20-30 seconds)
       use_ai_configurator: false  # Set to false for online profiling (2-4 hours)
       aic_system: h200_sxm  # Target GPU system for AI Configurator
-      aic_model_name: QWEN3_0.6B  # Model name for AI Configurator
+      aic_hf_id: Qwen/Qwen3-0.6B  # HuggingFace model ID for AI Configurator
       aic_backend_version: "0.20.0"  # Backend version for AI Configurator

       # SLA targets for profiling
diff --git a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
index 9187ddec81..d34b50d288 100644
--- a/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
+++ b/deploy/cloud/operator/internal/controller/dynamographdeploymentrequest_controller_test.go
@@ -348,7 +348,7 @@ var _ = Describe("DynamoGraphDeploymentRequest Controller", func() {
 				"sweep": map[string]interface{}{
 					"use_ai_configurator": true,
 					"aic_system":          "h200_sxm",
-					"aic_model_name":      "QWEN3_32B",
+					"aic_hf_id":           "Qwen/Qwen3-32B",
 					"aic_backend_version": "0.20.0",
 				},
 			}),
@@ -1058,7 +1058,7 @@ var _ = Describe("DGDR Profiler Arguments", func() {
 				"sweep": map[string]interface{}{
 					"use_ai_configurator": true,
 					"aic_system":          "h200_sxm",
-					"aic_model_name":      "QWEN3_32B",
+					"aic_hf_id":           "Qwen/Qwen3-32B",
 					"aic_backend_version": "0.20.0",
 				},
 			}),
diff --git a/docs/benchmarks/sla_driven_profiling.md b/docs/benchmarks/sla_driven_profiling.md
index b6c3abb0cf..b6991f1800 100644
--- a/docs/benchmarks/sla_driven_profiling.md
+++ b/docs/benchmarks/sla_driven_profiling.md
@@ -303,17 +303,12 @@ profilingConfig:
   sweep:
     use_ai_configurator: true
     aic_system: h200_sxm  # GPU system: h100_sxm, h200_sxm, b200_sxm, gb200_sxm, a100_sxm
-    aic_model_name: QWEN3_32B  # AIC model identifier (see supported list)
-    aic_backend_version: "0.20.0"  # TensorRT-LLM version: 0.20.0, 1.0.0rc3, 1.0.0rc6
+    aic_hf_id: Qwen/Qwen3-32B  # HuggingFace model ID
+    aic_backend_version: "0.20.0"  # TensorRT-LLM version: 0.20.0, 1.0.0rc3
 ```

 **Supported configurations:** See [AI Configurator documentation](https://github.com/ai-dynamo/aiconfigurator#supported-features)

-**Model name mapping examples:**
-- `Qwen/Qwen3-32B` → `QWEN3_32B`
-- `meta-llama/Llama-3.1-70B` → `LLAMA3.1_70B`
-- `deepseek-ai/DeepSeek-V3` → `DEEPSEEK_V3`
-
 ### Planner Configuration (Optional)

 Pass arguments to the SLA planner:
diff --git a/docs/planner/sla_planner_quickstart.md b/docs/planner/sla_planner_quickstart.md
index 8c20ef5702..38151b6a13 100644
--- a/docs/planner/sla_planner_quickstart.md
+++ b/docs/planner/sla_planner_quickstart.md
@@ -229,7 +229,7 @@ sweep:
 sweep:
   use_ai_configurator: true
   aic_system: h200_sxm
-  aic_model_name: QWEN3_32B
+  aic_hf_id: Qwen/Qwen3-32B
   aic_backend_version: "0.20.0"
 ```

diff --git a/tests/profiler/test_profile_sla_aiconfigurator.py b/tests/profiler/test_profile_sla_aiconfigurator.py
index 14b624e272..72f1dde18e 100644
--- a/tests/profiler/test_profile_sla_aiconfigurator.py
+++ b/tests/profiler/test_profile_sla_aiconfigurator.py
@@ -60,9 +60,9 @@ def __init__(self):
                 self.dry_run = False
                 self.use_ai_configurator = True
                 self.aic_system = "h200_sxm"
-                self.aic_model_name = "QWEN3_32B"
+                self.aic_hf_id = "Qwen/Qwen3-32B"
                 self.aic_backend = ""
-                self.aic_backend_version = "0.20.0"
+                self.aic_backend_version = None
                 self.num_gpus_per_node = 8
                 self.deploy_after_profile = False
                 # Provide minimal model_info to avoid HF queries
@@ -77,11 +77,10 @@ def __init__(self):

     @pytest.mark.pre_merge
     @pytest.mark.asyncio
-    @pytest.mark.parametrize(
-        "missing_arg", ["aic_system", "aic_model_name", "aic_backend_version"]
-    )
+    @pytest.mark.parametrize("missing_arg", ["aic_system", "aic_hf_id"])
     async def test_aiconfigurator_missing_args(self, trtllm_args, missing_arg):
         # Check that a validation error is raised when a required arg is missing.
+        # Note: aic_backend_version is optional; when None, the latest version is auto-detected.
         setattr(trtllm_args, missing_arg, None)
         with pytest.raises(ValueError):
             await run_profile(trtllm_args)
@@ -113,16 +112,23 @@ async def test_trtllm_aiconfigurator_single_model(self, trtllm_args):
     @pytest.mark.parametrize(
         "backend, aic_backend_version",
         [
+            ("trtllm", None),
             ("trtllm", "0.20.0"),
             ("trtllm", "1.0.0rc3"),
         ],
     )
-    @pytest.mark.parametrize("model_name", ["QWEN3_32B", "LLAMA3.1_405B"])
+    @pytest.mark.parametrize(
+        "hf_model_id",
+        [
+            "Qwen/Qwen3-32B",
+            "meta-llama/Llama-3.1-405B",
+        ],
+    )
     async def test_trtllm_aiconfigurator_many(
-        self, trtllm_args, model_name, backend, aic_backend_version
+        self, trtllm_args, hf_model_id, backend, aic_backend_version
     ):
         # Test that profile_sla works with a variety of backend versions and model names.
-        trtllm_args.aic_model_name = model_name
+        trtllm_args.aic_hf_id = hf_model_id
         trtllm_args.backend = backend
         trtllm_args.aic_backend_version = aic_backend_version
         await run_profile(trtllm_args)
diff --git a/tests/profiler/test_profile_sla_dryrun.py b/tests/profiler/test_profile_sla_dryrun.py
index f0b77853c8..e02409ecf5 100644
--- a/tests/profiler/test_profile_sla_dryrun.py
+++ b/tests/profiler/test_profile_sla_dryrun.py
@@ -67,7 +67,7 @@ def __init__(self):
                 self.dry_run = True
                 self.use_ai_configurator = False
                 self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
                 self.num_gpus_per_node = 8
@@ -109,7 +109,7 @@ def __init__(self):
                 self.dry_run = True
                 self.use_ai_configurator = False
                 self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
                 self.num_gpus_per_node = 8
@@ -164,7 +164,7 @@ def __init__(self):
                 self.dry_run = True
                 self.use_ai_configurator = False
                 self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
                 self.num_gpus_per_node = 8
@@ -212,7 +212,7 @@ def __init__(self):
                 self.dry_run = True
                 self.use_ai_configurator = False
                 self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
                 self.num_gpus_per_node = 8
@@ -282,7 +282,7 @@ def __init__(self):
                 self.dry_run = True
                 self.use_ai_configurator = False
                 self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
                 # Set to 0 to trigger auto-generation path
@@ -345,7 +345,7 @@ def __init__(self):
                 self.dry_run = True
                 self.use_ai_configurator = False
                 self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
                 self.num_gpus_per_node = 0
@@ -407,7 +407,7 @@ def __init__(self):
                 self.dry_run = True
                 self.use_ai_configurator = False
                 self.aic_system = None
-                self.aic_model_name = None
+                self.aic_hf_id = None
                 self.aic_backend = ""
                 self.aic_backend_version = None
                 self.num_gpus_per_node = 0
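A minimal usage sketch of the renamed API (illustrative only, not part of the patch). It assumes `aiconfigurator` and its performance database for the chosen system/backend are installed; the constructor arguments mirror the `hf_id, system, backend, version` signature in `benchmarks/profiler/utils/estimate_perf.py` above, and a falsy `version` now resolves to the latest database version:

```python
# Illustrative sketch (not part of the diff): constructing the renamed estimator.
from benchmarks.profiler.utils.estimate_perf import AIConfiguratorPerfEstimator

estimator = AIConfiguratorPerfEstimator(
    "Qwen/Qwen3-32B",  # hf_id: a HuggingFace model ID, replacing AIC names like QWEN3_32B
    "h200_sxm",        # system
    "trtllm",          # backend
    None,              # version: falsy -> get_latest_database_version(system, backend)
)
```

On the CLI, the equivalent change is `--aic-model-name QWEN3_32B` becoming `--aic-hf-id Qwen/Qwen3-32B`; per `profile_sla.py`, `--aic-hf-id` falls back to `--model` when omitted, and `--aic-backend-version` is now optional.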