Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/offline_inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ messages = [
}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
images, videos = [], []
for message in messages:
content = message["content"]
Expand Down
2 changes: 1 addition & 1 deletion docs/zh/offline_inference.md
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ messages = [
}
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False)
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
images, videos = [], []
for message in messages:
content = message["content"]
Expand Down
77 changes: 40 additions & 37 deletions fastdeploy/model_executor/models/qwen2_5_vl/qwen2_5_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from paddleformers.utils.log import logger

from fastdeploy.config import FDConfig
from fastdeploy.model_executor.forward_meta import ForwardMeta
from fastdeploy.model_executor.graph_optimization.decorator import (
support_graph_optimization,
)
Expand All @@ -35,12 +36,6 @@
from fastdeploy.model_executor.layers.normalization import RMSNorm
from fastdeploy.model_executor.models.model_base import ModelForCasualLM
from fastdeploy.model_executor.models.qwen2 import Qwen2DecoderLayer
from fastdeploy.platforms import current_platform

if current_platform.is_cuda():
from fastdeploy.model_executor.ops.gpu import extract_text_token_output

from fastdeploy.model_executor.forward_meta import ForwardMeta


@support_graph_optimization
Expand Down Expand Up @@ -104,31 +99,17 @@ def load_state_dict(self, state_dict):
logger.info(f"Start load layer {i}")
self.layers[i].load_state_dict(state_dict)

def get_input_embeddings(self, ids_remove_padding: paddle.Tensor) -> paddle.Tensor:
return self.embed_tokens(ids_remove_padding=ids_remove_padding)

def forward(
self,
input_embeddings: paddle.Tensor,
ids_remove_padding: paddle.Tensor,
image_features: Optional[paddle.Tensor],
forward_meta: ForwardMeta,
):

hidden_states = self.embed_tokens(ids_remove_padding=ids_remove_padding)

# -----------------------
# Replace the image/video placeholder tokens in input_embeds with image_embeds
image_mask = ids_remove_padding == self.image_token_id
image_token_num = image_mask.sum()

video_mask = ids_remove_padding == self.video_token_id
video_token_num = video_mask.sum()

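# The framework only provides image_features, so mixing images and videos in one input is not supported yet
# TODO(wangyafeng) Consider supporting a separate video_features input later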
if image_token_num > 0:
hidden_states[image_mask] = image_features.cast(self._dtype)
if video_token_num > 0:
hidden_states[video_mask] = image_features.cast(self._dtype)

# -----------------------
hidden_states = input_embeddings

residual = None
for i in range(self.num_layers):
Expand All @@ -140,18 +121,6 @@ def forward(

hidden_states = hidden_states + residual

# -----------------------
max_seq_len, max_seq_len_index = paddle.topk(forward_meta.seq_lens_this_time, k=1)
hidden_states = extract_text_token_output(
max_seq_len,
max_seq_len_index.cast("int32"),
image_token_num.cast("int32"),
forward_meta.seq_lens_this_time,
forward_meta.cu_seqlens_q,
hidden_states.cast("float32"),
).cast(self._dtype)
# -----------------------

out = self.norm(hidden_states)

return out
Expand All @@ -173,6 +142,12 @@ def __init__(self, fd_config: FDConfig):
# ----------- language model -------------
self.model = Qwen2_5_VLModel(fd_config=fd_config)

# Persistent buffers for CUDA graphs.
self._input_embeddings = paddle.zeros(
[fd_config.parallel_config.max_model_len, fd_config.model_config.hidden_size],
dtype=fd_config.model_config.dtype,
)

self.ori_vocab_size = fd_config.model_config.ori_vocab_size

self.lm_head = ParallelLMHead(
Expand Down Expand Up @@ -236,14 +211,42 @@ def empty_input_forward(self):
self.ernie.layers[i].mlp.text_fused_moe(fake_hidden_states)
self.ernie.layers[i].mlp.image_fused_moe(fake_hidden_states)

def get_input_embeddings(
self,
ids_remove_padding: paddle.Tensor,
image_features: Optional[paddle.Tensor] = None,
) -> paddle.Tensor:

input_embeddings = self.model.get_input_embeddings(ids_remove_padding=ids_remove_padding)

image_mask = ids_remove_padding == self.model.image_token_id
image_token_num = image_mask.sum()

video_mask = ids_remove_padding == self.model.video_token_id
video_token_num = video_mask.sum()

# The framework only provides image_features, so mixing images and videos in one input is not supported yet
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

后面提交PR时,注释都转成英文

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

get

# TODO(wangyafeng) Consider supporting a separate video_features input later
if image_token_num > 0:
input_embeddings[image_mask] = image_features.cast(self.model._dtype)
if video_token_num > 0:
input_embeddings[video_mask] = image_features.cast(self.model._dtype)

return input_embeddings

def forward(
self,
ids_remove_padding: paddle.Tensor,
image_features: Optional[paddle.Tensor],
forward_meta: ForwardMeta,
):
input_embeddings = self.get_input_embeddings(
ids_remove_padding=ids_remove_padding, image_features=image_features
)
self._input_embeddings.copy_(input_embeddings, False)

hidden_states = self.model(
input_embeddings=self._input_embeddings,
ids_remove_padding=ids_remove_padding,
image_features=image_features,
forward_meta=forward_meta,
Expand Down
Loading
Loading