From bd97b448da1611f2a08c902f9535431504bc52a0 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Tue, 30 Sep 2025 12:28:16 +0000
Subject: [PATCH] Fix head_size handling in
 ModelConfig.get_bindings_model_config; unskip
 test_bielik_11b_v2_2_instruct_multi_lora, which failed because of the bad
 head_size handling, and apply minor configuration fixes to it

Signed-off-by: Ubuntu
---
 tensorrt_llm/_torch/model_config.py       |  3 ++-
 tests/unittest/llmapi/test_llm_pytorch.py | 11 +++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index f8c84781b3c..7f69abcf491 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -524,12 +524,13 @@ def get_bindings_model_config(self,
         )
 
         # For kv cache size calculation: set size_per_head
+        head_size = None
         head_dim_names = ["head_size", "head_dim"]
         for head_dim_name in head_dim_names:
             if head_dim_name in self.pretrained_config:
                 head_size = getattr(self.pretrained_config, head_dim_name)
                 break
-        else:
+        if head_size is None:
             logger.warning(
                 f"head_size/head_dim is not set, using default value {hidden_size // num_heads}"
             )
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index 62253df45a5..1d4e17ba060 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -557,7 +557,6 @@ def test_codellama_fp8_with_bf16_lora() -> None:
 
 
 @skip_gpu_memory_less_than_80gb
-@pytest.mark.skip(reason="https://nvbugs/5521949")
 def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"
 
@@ -584,12 +583,16 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
         lora_model.save_pretrained(lora_path)
         lora_paths.append(lora_path)
 
-    trtllm_lora_config = LoraConfig(lora_dir=lora_paths,
-                                    lora_target_modules=target_modules,
+    trtllm_lora_config = LoraConfig(lora_target_modules=target_modules,
                                     max_lora_rank=8,
                                     max_loras=2,
                                     max_cpu_loras=2)
-    llm = LLM(model_dir, lora_config=trtllm_lora_config)
+    llm = LLM(
+        model_dir,
+        lora_config=trtllm_lora_config,
+        # Disable CUDA graph
+        # TODO: remove this once we have a proper fix for CUDA graph in LoRA
+        cuda_graph_config=None)
 
     prompts = [
         "Kim był Mikołaj Kopernik i z czego zasłynął?",