From bd97b448da1611f2a08c902f9535431504bc52a0 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 30 Sep 2025 12:28:16 +0000
Subject: [PATCH] Fix head_size handling in
 ModelConfig.get_bindings_model_config; unskip
 test_bielik_11b_v2_2_instruct_multi_lora, which failed because of the bad
 head_size handling, and apply minor configuration fixes to it

Signed-off-by: Ubuntu
---
 tensorrt_llm/_torch/model_config.py       |  3 ++-
 tests/unittest/llmapi/test_llm_pytorch.py | 11 +++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index f8c84781b3c..7f69abcf491 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -524,12 +524,13 @@ def get_bindings_model_config(self,
         )
 
         # For kv cache size calculation: set size_per_head
+        head_size = None
         head_dim_names = ["head_size", "head_dim"]
         for head_dim_name in head_dim_names:
             if head_dim_name in self.pretrained_config:
                 head_size = getattr(self.pretrained_config, head_dim_name)
                 break
-        else:
+        if head_size is None:
             logger.warning(
                 f"head_size/head_dim is not set, using default value {hidden_size // num_heads}"
             )
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index 62253df45a5..1d4e17ba060 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -557,7 +557,6 @@ def test_codellama_fp8_with_bf16_lora() -> None:
 
 
 @skip_gpu_memory_less_than_80gb
-@pytest.mark.skip(reason="https://nvbugs/5521949")
 def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
 
@@ -584,12 +583,16 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
         lora_model.save_pretrained(lora_path)
         lora_paths.append(lora_path)
 
-    trtllm_lora_config = LoraConfig(lora_dir=lora_paths,
-                                    lora_target_modules=target_modules,
+    trtllm_lora_config = LoraConfig(lora_target_modules=target_modules,
                                     max_lora_rank=8,
                                     max_loras=2,
                                     max_cpu_loras=2)
-    llm = LLM(model_dir, lora_config=trtllm_lora_config)
+    llm = LLM(
+        model_dir,
+        lora_config=trtllm_lora_config,
+        # Disable CUDA graph
+        # TODO: remove this once we have a proper fix for CUDA graph in LoRA
+        cuda_graph_config=None)
 
     prompts = [
         "Kim był Mikołaj Kopernik i z czego zasłynął?",