8 changes: 8 additions & 0 deletions QEfficient/transformers/models/pytorch_transforms.py
@@ -136,6 +136,9 @@
Qwen2Model,
Qwen2RMSNorm,
)
from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
Qwen2_5_VLForConditionalGeneration,
)
from transformers.models.starcoder2.modeling_starcoder2 import (
Starcoder2Attention,
Starcoder2DecoderLayer,
@@ -303,6 +306,9 @@
QEffQwen2ForCausalLM,
QEffQwen2Model,
)
from QEfficient.transformers.models.qwen_2_5_vl.modeling_qwen2_5_vl import (
QEffQwen_2_5_vl_ForConditionalGeneration,
)
from QEfficient.transformers.models.starcoder2.modeling_starcoder2 import (
QEffStarcoder2Attention,
QEFFStarcoder2DecoderLayer,
@@ -383,6 +389,8 @@ class KVCacheTransform(ModuleMappingTransform):
LlavaForConditionalGeneration: QEffLlavaForConditionalGeneration,
# Llava Next
LlavaNextForConditionalGeneration: QEffLlavaNextForConditionalGeneration,
# Qwen2.5 VL
Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration,
# Gemma
GemmaAttention: QEffGemmaAttention,
GemmaDecoderLayer: QEffGemmaDecoderLayer,
6 changes: 6 additions & 0 deletions QEfficient/transformers/models/qwen_2_5_vl/__init__.py
@@ -0,0 +1,6 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------
138 changes: 138 additions & 0 deletions QEfficient/transformers/models/qwen_2_5_vl/modeling_qwen2_5_vl.py
@@ -0,0 +1,138 @@
import torch
import torch.nn as nn
from transformers import Qwen2_5_VLForConditionalGeneration

from QEfficient.utils import constants


class QEffQwen_2_5_vl_EncoderWrapper(nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.model.vision_model = self.model.visual

def forward(self, pixel_values, image_grid_thw):
pixel_values = pixel_values.type(self.model.visual.dtype)
image_embeds = self.model.visual(pixel_values, grid_thw=image_grid_thw)
return image_embeds


class QEffQwen_2_5_vl_DecoderWrapper(nn.Module):
def __init__(self, model):
super().__init__()
self.model = model
self.config = self.model.config
self.language_model = self.model.model

def forward(self, input_ids, vision_embeds, position_ids, image_idx, past_key_values):
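# NOTE: stub for the language-model pass. One possible shape (an assumption, mirroring other
# QEfficient multimodal decoder wrappers, not the final design): embed input_ids, scatter
# vision_embeds at the positions tracked by image_idx, run self.language_model with
# position_ids and past_key_values, and return logits plus the retained KV cache.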
raise NotImplementedError("QEffQwen_2_5_vl_DecoderWrapper.forward is not implemented yet")


class QEffQwen_2_5_vl_ForConditionalGeneration(Qwen2_5_VLForConditionalGeneration):
def get_qeff_vision_encoder(self):
return QEffQwen_2_5_vl_EncoderWrapper(self)

def get_qeff_language_decoder(self):
return QEffQwen_2_5_vl_DecoderWrapper(self)

def get_dummy_inputs(self, kv_offload: bool = False, **kwargs):
num_layers = self.config.num_hidden_layers
num_key_value_heads = self.config.num_key_value_heads
head_dim = self.config.hidden_size // self.config.num_attention_heads
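# Placeholder dummy shapes: image_grid_thw = [[1, 108, 102]] describes 1 * 108 * 102 = 11016 patches,
# each flattened to 3 * 2 * 14 * 14 = 1176 values (in_channels * temporal_patch_size * patch_size**2)
# under the default Qwen2.5-VL vision config; adjust if the vision config differs.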
vision_inputs = {
"pixel_values": torch.zeros(
(11016, 1176),
dtype=torch.float32,
),
"image_grid_thw": torch.tensor([[1, 108, 102]]),
}

lang_inputs = {
"input_ids": torch.ones((1, 2779), dtype=torch.int64),
"attention_mask": torch.ones((1, 2779), dtype=torch.int64),
"vision_embeds": torch.ones(
(11016, 1176),
dtype=torch.float32,
),
"image_idx": torch.zeros((1, 1), dtype=torch.int64),
}
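# position_ids: placeholder derived from the attention mask via cumulative sum; replaced further
# below with a fixed value near the context limit.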
lang_inputs["position_ids"] = lang_inputs.pop("attention_mask").cumsum(1)
lang_inputs["past_key_values"] = []
for i in range(num_layers):
lang_inputs["past_key_values"].append(
(
torch.zeros(
1,
num_key_value_heads,
6000,
head_dim,
),
torch.zeros(
1,
num_key_value_heads,
6000,
head_dim,
),
)
)

lang_inputs["position_ids"] = torch.full(lang_inputs["position_ids"].shape, constants.GRANITEVISION_CTX_LEN - 1)
inputs = {}
if kv_offload:
inputs["vision"] = vision_inputs
inputs["lang"] = lang_inputs
return inputs

def get_specializations(
self,
batch_size: int,
prefill_seq_len: int,
ctx_len: int,
img_size: int,
kv_offload: bool = False,
**compiler_options,
):
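# TODO: build vision/lang specializations from batch_size, prefill_seq_len, ctx_len and img_size.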
pass

def get_onnx_dynamic_axes(self, kv_offload: bool = False):
# Define dynamic axes
num_layers = self.config.num_hidden_layers
vision_dynamic_axes = {
"pixel_values": {0: "batch_size", 1: "num_patches"},
"image_grid_thw": {0: "batch_size", 1: "batch_size"},
}
lang_dynamic_axes = {
"input_ids": {0: "batch_size", 1: "seq_len"},
"position_ids": {0: "batch_size", 1: "seq_len", 2: "seq_len"},
"vision_embeds": {0: "batch_size", 1: "vision_size"},
}
for i in range(num_layers):
lang_dynamic_axes[f"past_key.{i}"] = {0: "batch_size", 2: "ctx_len"}
lang_dynamic_axes[f"past_value.{i}"] = {0: "batch_size", 2: "ctx_len"}
dynamic_axes = {}
if kv_offload:
dynamic_axes["vision"] = vision_dynamic_axes
dynamic_axes["lang"] = lang_dynamic_axes
return dynamic_axes

def get_inputs_info(self):
pass

def get_output_names(self, kv_offload: bool = False):
vision_output_names = ["vision_embeds"]
lang_output_names = ["logits"]
for i in range(self.config.num_hidden_layers):
for kv in ["key", "value"]:
lang_output_names.append(f"past_{kv}.{i}_RetainedState")

output_names = {}
if kv_offload:
lang_output_names.insert(1, "vision_embeds_RetainedState")
lang_output_names.insert(2, "image_idx_output")
output_names["vision"] = vision_output_names
output_names["lang"] = lang_output_names

return output_names
107 changes: 107 additions & 0 deletions examples/qwenvl_example/qwen2_5_vl_inference.py
@@ -0,0 +1,107 @@
# -----------------------------------------------------------------------------
#
# Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
# SPDX-License-Identifier: BSD-3-Clause
#
# -----------------------------------------------------------------------------

import requests
from PIL import Image
from transformers import AutoProcessor, TextStreamer

from QEfficient import QEFFAutoModelForImageTextToText

# Add HuggingFace Token to access the model
HF_TOKEN = ""


def run_model(
model_name,
token,
query,
image_url,
kv_offload=False,
prefill_seq_len=5500,
ctx_len=6000,
generation_len=128,
img_size=384,
num_cores=16,
num_devices=1,
):
## STEP - 1 Load the Processor and Model

processor = AutoProcessor.from_pretrained(model_name, token=token)

# `kv_offload` compiles the model into two QPCs. Single-QPC mode is not currently supported,
# so the flag must always be set to True.
# The dual-QPC approach splits the model so that image encoding and output generation run in
# two separate QPCs; the vision encoder outputs are passed to the language model via the host.

model = QEFFAutoModelForImageTextToText.from_pretrained(model_name, token=token, kv_offload=kv_offload)

## STEP - 2 Export & Compile the Model

model.compile(
prefill_seq_len=prefill_seq_len,
ctx_len=ctx_len,
img_size=img_size,
num_cores=num_cores,
num_devices=num_devices,
mxfp6_matmul=False,
)

## STEP - 3 Load and process the inputs for Inference

image = Image.open(requests.get(image_url, stream=True).raw)
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": query}]}]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt")

## STEP - 4 Run Inference on the compiled model

streamer = TextStreamer(processor.tokenizer)
output = model.generate(inputs=inputs, streamer=streamer, generation_len=generation_len)
print(output)


if __name__ == "__main__":
# Model name and Input parameters
model_name = "Qwen/Qwen2.5-VL-32B-Instruct"

# Please add prompt here
query = "Describe the image"

# Please pass an image URL or a local image path. The image should be in JPG format.
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Compilation parameters for the model
kv_offload = True
prefill_seq_len = 5500
ctx_len = 6000
generation_len = 128
img_size = 384
num_cores = 16
num_devices = 4

run_model(
model_name=model_name,
token=HF_TOKEN,
query=query,
kv_offload=kv_offload,
image_url=image_url,
prefill_seq_len=prefill_seq_len,
ctx_len=ctx_len,
generation_len=generation_len,
img_size=img_size,
num_cores=num_cores,
num_devices=num_devices,
)


"""
Expected Response:



"""
59 changes: 59 additions & 0 deletions examples/qwenvl_example/test_hf.py
@@ -0,0 +1,59 @@
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
"Qwen/Qwen2.5-VL-32B-Instruct", torch_dtype="auto", device_map="auto"
)


# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
# "Qwen/Qwen2.5-VL-32B-Instruct",
# torch_dtype=torch.bfloat16,
# attn_implementation="flash_attention_2",
# device_map="auto",
# )

# default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-32B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
},
{"type": "text", "text": "Describe this image."},
],
}
]

# Preparation for inference
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
text=[text],
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
inputs = inputs.to("cuda")
# Inference: generate the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)