3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/model_config.py
@@ -524,12 +524,13 @@ def get_bindings_model_config(self,
         )
 
         # For kv cache size calculation: set size_per_head
+        head_size = None
         head_dim_names = ["head_size", "head_dim"]
         for head_dim_name in head_dim_names:
             if head_dim_name in self.pretrained_config:
                 head_size = getattr(self.pretrained_config, head_dim_name)
                 break
-        else:
+        if head_size is None:
             logger.warning(
                 f"head_size/head_dim is not set, using default value {hidden_size // num_heads}"
             )
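
Note on the `model_config.py` hunk: a `for/else` clause runs its `else` only when the loop finishes without `break`, so the old code skipped the warning whenever a head-dim attribute was found, even if that attribute's value was `None`. The `head_size = None` sentinel also warns in that case. A standalone sketch of the difference (the `FakeConfig` class is illustrative, not from this PR):

```python
# Standalone sketch contrasting the old `for/else` fallback with the new
# `head_size = None` sentinel. Not TensorRT-LLM code; `FakeConfig` is invented.
class FakeConfig:
    head_dim = None  # attribute is present but unset

    def __contains__(self, name):
        return hasattr(self, name)


config = FakeConfig()
head_dim_names = ["head_size", "head_dim"]

# Old control flow: the loop breaks on "head_dim", so `else` never runs
# and head_size ends up None without any warning.
for name in head_dim_names:
    if name in config:
        head_size = getattr(config, name)
        break
else:
    print("warn: head_size/head_dim is not set")
print(head_size)  # -> None, silently

# New control flow: the sentinel check fires both when no attribute is
# found and when the attribute is found but holds None.
head_size = None
for name in head_dim_names:
    if name in config:
        head_size = getattr(config, name)
        break
if head_size is None:
    print("warn: head_size/head_dim is not set")  # -> reached
```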
11 changes: 7 additions & 4 deletions tests/unittest/llmapi/test_llm_pytorch.py
@@ -557,7 +557,6 @@ def test_codellama_fp8_with_bf16_lora() -> None:


 @skip_gpu_memory_less_than_80gb
-@pytest.mark.skip(reason="https://nvbugs/5521949")
 def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
     model_dir = f"{llm_models_root()}/Bielik-11B-v2.2-Instruct"

@@ -584,12 +583,16 @@ def test_bielik_11b_v2_2_instruct_multi_lora() -> None:
         lora_model.save_pretrained(lora_path)
         lora_paths.append(lora_path)
 
-    trtllm_lora_config = LoraConfig(lora_dir=lora_paths,
-                                    lora_target_modules=target_modules,
+    trtllm_lora_config = LoraConfig(lora_target_modules=target_modules,
                                     max_lora_rank=8,
                                     max_loras=2,
                                     max_cpu_loras=2)
-    llm = LLM(model_dir, lora_config=trtllm_lora_config)
+    llm = LLM(
+        model_dir,
+        lora_config=trtllm_lora_config,
+        # Disable CUDA graph
+        # TODO: remove this once we have a proper fix for CUDA graph in LoRA
+        cuda_graph_config=None)
 
     prompts = [
         "Kim był Mikołaj Kopernik i z czego zasłynął?",