From c7cf17c81741930ac078b35675499c5b2d03d63e Mon Sep 17 00:00:00 2001
From: root
Date: Fri, 8 Aug 2025 20:04:50 +0000
Subject: [PATCH 1/2] WIP vLLM support for Qwen 2.5 VL 7B

---
 examples/offline_inference_tt.py | 38 +++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/examples/offline_inference_tt.py b/examples/offline_inference_tt.py
index af97de193df7..8aa8c4ab7993 100644
--- a/examples/offline_inference_tt.py
+++ b/examples/offline_inference_tt.py
@@ -48,8 +48,10 @@ def register_tt_models():
 
     # Qwen2.5 - Text
     path_qwen_text = "models.tt_transformers.tt.generator_vllm:QwenForCausalLM"
+    path_qwen_vision = "models.tt_transformers.tt.generator_vllm:Qwen2_5_VLForConditionalGeneration"
     ModelRegistry.register_model("TTQwen2ForCausalLM", path_qwen_text)
     ModelRegistry.register_model("TTQwen3ForCausalLM", path_qwen_text)
+    ModelRegistry.register_model("TTQwen2_5_VLForConditionalGeneration", path_qwen_vision)
 
     # Mistral
     ModelRegistry.register_model(
@@ -88,6 +90,32 @@ def get_sample_multi_modal_llama_inputs():
             inputs.append({"prompt": question})
     return inputs
 
+def get_sample_multi_modal_hf_inputs():
+    '''
+    Prepare 4 sample multi-modal prompts for HF multimodals
+    '''
+    IMG_PATH = Path(resource_filename("llama_models", "scripts/resources/"))
+    relative_img_paths = [None, "pasta.jpeg", "ocr_image.jpeg", "clutter.jpeg"]
+    questions = [
+        "Write a haiku.", "What is for dinner?",
+        "What is the full text of this image? Do OCR",
+        "What objects are in this image?"
+    ]
+    inputs = []
+    for relative_img_path, question in zip(relative_img_paths, questions):
+        if relative_img_path is not None:
+            with open(IMG_PATH / relative_img_path, "rb") as f:
+                img = PIL_Image.open(f).convert("RGB")
+            prompt = f"{question}"
+            inputs.append({
+                "prompt": prompt,
+                "multi_modal_data": {
+                    "image": img
+                }
+            })
+        else:
+            inputs.append({"prompt": question})
+    return inputs
 
 def check_tt_model_supported(model):
     supported_models = [
@@ -118,6 +146,7 @@ def check_tt_model_supported(model):
         "Qwen/Qwen3-8B",
         "Qwen/Qwen3-14B",
         "Qwen/Qwen3-32B",
+        "Qwen/Qwen2.5-VL-7B-Instruct",
         "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
         "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
         "mistralai/Mistral-7B-Instruct-v0.3",
@@ -177,8 +206,8 @@ def run_inference(
     check_tt_model_supported(model)
 
     if multi_modal:
-        assert "Llama-3.2" in model, "The multi-modal inference test " + \
-            "currently only supports Llama-3.2 models"
+        assert "Llama-3.2" in model or "Qwen2.5-VL" in model, "The multi-modal inference test " + \
+            "currently only supports Llama-3.2 and Qwen2.5-VL models"
 
     # LLM args
     engine_kw_args = {
@@ -234,7 +263,10 @@ def run_inference(
                               list), "Prompts must be a list of strings"
         else:
             print("Ignoring prompts json for multi-modal inference")
-            prompts = get_sample_multi_modal_llama_inputs()
+            if "Qwen2.5-VL" in model:
+                prompts = get_sample_multi_modal_hf_inputs()
+            else:
+                prompts = get_sample_multi_modal_llama_inputs()
         if num_repeat_prompts is not None:
             prompts = prompts * num_repeat_prompts
         print("Number of prompts:", len(prompts))
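Note on the registration above: vLLM resolves "module:Class" registration strings lazily, so the ModelRegistry call succeeds on a stock vLLM install and models.tt_transformers is only imported once the TT architecture is actually requested. A minimal standalone sketch of that mechanism, illustrative rather than part of the patch, assuming a vLLM build recent enough to expose ModelRegistry.get_supported_archs():

# Sketch only: out-of-tree architecture registration as used by register_tt_models().
from vllm import ModelRegistry

ModelRegistry.register_model(
    "TTQwen2_5_VLForConditionalGeneration",
    "models.tt_transformers.tt.generator_vllm:Qwen2_5_VLForConditionalGeneration",
)

# The architecture name should now be listed alongside vLLM's built-in ones;
# the class itself is not imported until an engine asks for this architecture.
assert "TTQwen2_5_VLForConditionalGeneration" in ModelRegistry.get_supported_archs()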
From 8e5a98992435feea2c2dc41686d13d3aaa9a08eb Mon Sep 17 00:00:00 2001
From: mcw
Date: Tue, 12 Aug 2025 13:57:41 +0530
Subject: [PATCH 2/2] Qwen VL 7B vLLM support

---
 examples/offline_inference_tt.py | 74 +++++++++++++++++++++++---------
 1 file changed, 53 insertions(+), 21 deletions(-)

diff --git a/examples/offline_inference_tt.py b/examples/offline_inference_tt.py
index 8aa8c4ab7993..f8ee9815d982 100644
--- a/examples/offline_inference_tt.py
+++ b/examples/offline_inference_tt.py
@@ -10,7 +10,7 @@
 from PIL import Image as PIL_Image
 from pkg_resources import resource_filename
 from tqdm import tqdm
-from transformers import AutoTokenizer
+from transformers import AutoProcessor, AutoTokenizer
 
 from vllm import LLM, ModelRegistry, SamplingParams
 from vllm.engine.arg_utils import AsyncEngineArgs
@@ -90,31 +90,56 @@ def get_sample_multi_modal_llama_inputs():
             inputs.append({"prompt": question})
     return inputs
 
-def get_sample_multi_modal_hf_inputs():
-    '''
-    Prepare 4 sample multi-modal prompts for HF multimodals
-    '''
-    IMG_PATH = Path(resource_filename("llama_models", "scripts/resources/"))
-    relative_img_paths = [None, "pasta.jpeg", "ocr_image.jpeg", "clutter.jpeg"]
-    questions = [
-        "Write a haiku.", "What is for dinner?",
-        "What is the full text of this image? Do OCR",
-        "What objects are in this image?"
+def get_sample_multi_modal_qwen_inputs(model):
+    # Prepare a sample multi-modal prompt for Qwen2.5-VL
+    text_prompts = []
+    imgs = []
+    questions = ["Describe this image."]
+    img_refs = [
+        "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
     ]
+    prompts = [[{
+        "role":
+        "user",
+        "content": [{
+            "type": "image",
+            "image": img_ref,
+            "resized_height": 224,
+            "resized_width": 224,
+        }, {
+            "type": "text",
+            "text": question
+        }]
+    }] for img_ref, question in zip(img_refs, questions)]
+    tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
+    for prompt in prompts:
+        chat_prompt = tokenizer.apply_chat_template(prompt,
+                                                    tokenize=False,
+                                                    add_generation_prompt=True)
+        if any(ctnt["type"] == "image" for entry in prompt
+               for ctnt in entry['content']):
+            from qwen_vl_utils import (
+                process_vision_info)  # Import here to avoid it for other models
+            image_inputs, video_inputs = process_vision_info(prompt)
+            assert video_inputs is None, "Video inputs not supported yet"
+            assert len(
+                image_inputs) == 1, "Multi-image inputs not supported yet"
+            imgs.append(image_inputs[0])
+        else:
+            imgs.append(None)
+        text_prompts.append(chat_prompt)
+
     inputs = []
-    for relative_img_path, question in zip(relative_img_paths, questions):
-        if relative_img_path is not None:
-            with open(IMG_PATH / relative_img_path, "rb") as f:
-                img = PIL_Image.open(f).convert("RGB")
-            prompt = f"{question}"
+    for img, text_prompt in zip(imgs, text_prompts):
+        if img is not None:
             inputs.append({
-                "prompt": prompt,
+                "prompt": text_prompt,
                 "multi_modal_data": {
                     "image": img
                 }
             })
         else:
-            inputs.append({"prompt": question})
+            inputs.append({"prompt": text_prompt})
     return inputs
 
 def check_tt_model_supported(model):
@@ -264,7 +289,7 @@ def run_inference(
         else:
             print("Ignoring prompts json for multi-modal inference")
             if "Qwen2.5-VL" in model:
-                prompts = get_sample_multi_modal_hf_inputs()
+                prompts = get_sample_multi_modal_qwen_inputs(model)
             else:
                 prompts = get_sample_multi_modal_llama_inputs()
         if num_repeat_prompts is not None:
@@ -284,8 +309,15 @@ def run_inference(
                 "prompt_token_ids": prompt_token_ids_user
             } for _ in range(max_seqs_in_batch)]
         else:
-            MLLAMA_IMAGE_TOKEN_ID = 128256  # Specific to multi-modal llama
-            prompt_token_ids_user.insert(0, MLLAMA_IMAGE_TOKEN_ID)
+            if "Llama-3.2" in model:
+                IMAGE_TOKEN_ID = 128256  # Specific to multi-modal llama
+            elif "Qwen2.5-VL" in model:
+                IMAGE_TOKEN_ID = 151655  # Specific to multi-modal qwen
+            else:
+                raise ValueError(
+                    f"Unsupported model for multi-modal inference test in perf "
+                    f"mode: {model}")
+            prompt_token_ids_user.insert(0, IMAGE_TOKEN_ID)
             random_pixels = np.random.randint(0, 256,
                                               (512, 512, 3), dtype=np.uint8)
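Note on get_sample_multi_modal_qwen_inputs() and the 151655 constant: the helper follows the standard Qwen2.5-VL preprocessing path (chat template plus qwen_vl_utils). A condensed standalone sketch of that path, not part of the patch, assuming transformers and qwen_vl_utils are installed, the demo image URL is reachable, and that "<|image_pad|>" is the placeholder token behind the hard-coded perf-mode id:

# Sketch only: build one Qwen2.5-VL input in the shape vLLM's LLM.generate accepts.
from qwen_vl_utils import process_vision_info
from transformers import AutoTokenizer

MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"
messages = [{
    "role": "user",
    "content": [
        {
            "type": "image",
            "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            "resized_height": 224,
            "resized_width": 224,
        },
        {"type": "text", "text": "Describe this image."},
    ],
}]

tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

# Render the chat template to text so the prompt carries the image placeholder
# tokens the model expects alongside the question.
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

# Fetch and resize the image the same way the helper does; no videos are used.
image_inputs, video_inputs = process_vision_info(messages)
assert video_inputs is None and len(image_inputs) == 1

# Cross-check the token id that the perf-mode branch hard-codes as 151655.
print(tokenizer.convert_tokens_to_ids("<|image_pad|>"))

# Same input shape that get_sample_multi_modal_qwen_inputs() returns:
llm_input = {"prompt": prompt, "multi_modal_data": {"image": image_inputs[0]}}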