diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py b/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
index a3ae698127c..8b54c979eea 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen2_5vl.py
@@ -2,6 +2,7 @@
 import unittest
 from copy import deepcopy
 from dataclasses import dataclass
+from pathlib import Path
 from typing import List
 
 import torch
@@ -106,20 +107,24 @@ def __repr__(self) -> str:
 class TestQwen2_5_VL(unittest.TestCase):
 
     def get_test_inputs(self, modality: str):
+
+        test_data_root = Path(
+            os.path.join(llm_models_root(), "multimodals", "test_data"))
+
         if modality == "image":
             return ["Describe the natural environment in the image."], \
-                ["https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png"]
+                [str(test_data_root / "seashore.png")]
         elif modality == "multiple_image":
             return ["Describe the difference between the two images."], \
-                ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
-                 "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg"]
+                [str(test_data_root / "inpaint.png"),
+                 str(test_data_root / "61.jpg")]
        elif modality == "video":
             return ["Tell me what you see in the video briefly."], \
-                ["https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4"]
+                [str(test_data_root / "OAI-sora-tokyo-walk.mp4")]
         elif modality == "mixture_text_image":
             return ["Describe the scene in the image briefly.",
                     "Who invented the internet?"], \
-                ["https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
+                [str(test_data_root / "inpaint.png"),
                  ""]
         elif modality == "text":
             return ["Who invented the internet?"], []
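
Reviewer note: the diff above replaces remote Hugging Face URLs with files under `<llm_models_root>/multimodals/test_data`, removing the network dependency from the test. Below is a minimal, hypothetical standalone sketch of that resolution pattern. The directory layout and file names come from the diff itself; the `llm_models_root()` stub and its `LLM_MODELS_ROOT` environment-variable fallback are assumptions for illustration only (the real helper lives in the test utilities and is already imported by the test module, as is `os`).

```python
# Hypothetical standalone sketch of the path-resolution pattern in the diff.
import os
from pathlib import Path


def llm_models_root() -> str:
    # Stand-in for the test-suite helper (assumption): resolve the shared
    # test-data root from an environment variable, with a placeholder default.
    return os.environ.get("LLM_MODELS_ROOT", "/path/to/llm-models")


test_data_root = Path(
    os.path.join(llm_models_root(), "multimodals", "test_data"))

# Each modality now maps to local assets instead of remote URLs, so the test
# no longer needs network access to huggingface.co.
media = {
    "image": [str(test_data_root / "seashore.png")],
    "multiple_image": [
        str(test_data_root / "inpaint.png"),
        str(test_data_root / "61.jpg"),
    ],
    "video": [str(test_data_root / "OAI-sora-tokyo-walk.mp4")],
}

# Verify the local assets exist before the test would consume them.
for modality, paths in media.items():
    missing = [p for p in paths if not Path(p).exists()]
    print(modality, "OK" if not missing else f"missing: {missing}")
```

Checking the assets up front like this makes a missing or incomplete data mirror fail fast with a clear message rather than surfacing later as an opaque media-loader error.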