From 8e4d38fdafb6a1b2da88c9ddcd0b6f4874a7662e Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Wed, 11 Feb 2026 10:30:31 +0800 Subject: [PATCH 1/9] update qwen3vl text model support --- src/paddlefleet/models/gpt/gpt_embedding.py | 26 +- src/paddlefleet/models/gpt/gpt_layer_specs.py | 7 +- src/paddlefleet/models/qwen3_vl/embedding.py | 187 +++++++++++++++ .../models/qwen3_vl/qwen3_vl_model.py | 224 ++++++++++++++++++ 4 files changed, 417 insertions(+), 27 deletions(-) diff --git a/src/paddlefleet/models/gpt/gpt_embedding.py b/src/paddlefleet/models/gpt/gpt_embedding.py index 6464408af..de4e3638d 100644 --- a/src/paddlefleet/models/gpt/gpt_embedding.py +++ b/src/paddlefleet/models/gpt/gpt_embedding.py @@ -107,12 +107,6 @@ def forward( attn_mask_startend_row_indices = dict_args.get( "attn_mask_startend_row_indices", None ) - deepstack_image_embeds = dict_args.get("deepstack_image_embeds", None) - deepstack_video_embeds = dict_args.get("deepstack_video_embeds", None) - visual_pos_masks = None - # Deepstack - deepstack_visual_embeds = None - visual_pos_mask = None mtp_emb_res = None if decoder_input is None: decoder_input = self.embedding( @@ -182,8 +176,6 @@ def forward( decoder_input = decoder_input.masked_scatter( image_mask, image_embeds.astype(decoder_input.dtype) ) - visual_pos_masks = image_mask[..., 0] - deepstack_visual_embeds = deepstack_image_embeds if video_embeds is not None: _, video_mask = self.get_placeholder_mask( @@ -194,25 +186,11 @@ def forward( decoder_input = decoder_input.masked_scatter( video_mask, video_embeds.astype(decoder_input.dtype) ) - visual_pos_masks = video_mask[..., 0] - deepstack_visual_embeds = deepstack_video_embeds if image_embeds is not None and video_embeds is not None: image_mask = image_mask[..., 0] video_mask = video_mask[..., 0] - visual_pos_masks = image_mask | video_mask - deepstack_visual_embeds = [] - image_mask_joint = image_mask[visual_pos_masks] - video_mask_joint = video_mask[visual_pos_masks] - for img_embed, 
vid_embed in zip( - deepstack_image_embeds, deepstack_video_embeds - ): - embed_joint = img_embed.new_zeros( - visual_pos_masks.sum(), img_embed.shape[-1] - ).to(img_embed.device) - embed_joint[image_mask_joint, :] = img_embed - embed_joint[video_mask_joint, :] = vid_embed - deepstack_visual_embeds.append(embed_joint) + # Rotary positional embeddings (embedding is None for PP intermediate devices) rotary_pos_emb = None rotary_pos_cos = None @@ -257,8 +235,6 @@ def forward( "rotary_pos_cos": rotary_pos_cos, "rotary_pos_sin": rotary_pos_sin, "position_ids": position_ids, - "deepstack_visual_emb": deepstack_visual_embeds, - "visual_pos_masks": visual_pos_masks, } if mtp_emb_res is not None: assert ( diff --git a/src/paddlefleet/models/gpt/gpt_layer_specs.py b/src/paddlefleet/models/gpt/gpt_layer_specs.py index 2d4fde54f..34995b883 100644 --- a/src/paddlefleet/models/gpt/gpt_layer_specs.py +++ b/src/paddlefleet/models/gpt/gpt_layer_specs.py @@ -104,7 +104,9 @@ def get_gpt_layer_local_spec( num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm, ) - transformer_cls = getattr(config, "specific_layer", TransformerLayer) + transformer_cls = getattr( + config, "specific_transformer_layer", TransformerLayer + ) if paddle.distributed.is_initialized(): use_overlap = fleet.fleet._user_defined_strategy.hybrid_configs[ "pp_configs" @@ -360,6 +362,7 @@ def get_gpt_spec( language_embedding=language_embedding_spec, rope_embedding=rope_embedding_spec, ) + embedding_cls = getattr(config, "specific_embedding", GPTEmbedding) return LayerSpec( layer=GPTModel, @@ -369,7 +372,7 @@ def get_gpt_spec( }, sublayers_spec=GPTSublayersSpec( embedding=LayerSpec( - layer=GPTEmbedding, + layer=embedding_cls, sublayers_spec=embedding_spec, extra_kwargs=embedding_extra_kwargs, ), diff --git a/src/paddlefleet/models/qwen3_vl/embedding.py b/src/paddlefleet/models/qwen3_vl/embedding.py index 1a3357978..a060e6a20 100644 --- a/src/paddlefleet/models/qwen3_vl/embedding.py +++ 
b/src/paddlefleet/models/qwen3_vl/embedding.py @@ -15,12 +15,16 @@ import paddle from paddle import nn +from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + ScatterOp, +) from paddle.nn import functional as F from ...packed_seq_params import PackedSeqParams from ...spec_utils import LayerSpec, build_layer from ...transformer import TransformerConfig from ...transformer.layer import FleetLayer +from ..gpt.gpt_embedding import GPTEmbedding @dataclass @@ -240,3 +244,186 @@ def forward(self, dict_args: dict): } return preproc_output + + +class TextEmbedding(GPTEmbedding): + def forward( + self, + dict_args: dict, + decoder_input: paddle.Tensor = None, + packed_seq_params: PackedSeqParams = None, + ): + input_ids = dict_args["input_ids"] + position_ids = dict_args.get("position_ids", None) + position_ids = ( + position_ids.to("gpu") if position_ids is not None else None + ) + attention_mask = dict_args.get("attention_mask", None) + attn_mask_startend_row_indices = dict_args.get( + "attn_mask_startend_row_indices", None + ) + deepstack_image_embeds = dict_args.get("deepstack_image_embeds", None) + deepstack_video_embeds = dict_args.get("deepstack_video_embeds", None) + # Deepstack + deepstack_visual_embeds = None + visual_pos_masks = None + mtp_emb_res = None + if decoder_input is None: + decoder_input = self.embedding( + input_ids=input_ids, + position_ids=None + if self.multimodal_embedding + else position_ids, + ) + if ( + self.config.num_nextn_predict_layers is not None + and self.config.num_nextn_predict_layers > 0 + ): + assert not self.multimodal_embedding, ( + "MTP not support mm for now." 
+ ) + inputs_embeds_extra = decoder_input[ + :, -self.config.num_nextn_predict_layers :, : + ] # [B, S, H] + inputs_embeds = decoder_input[ + :, : -self.config.num_nextn_predict_layers, : + ] + inputs_embeds_ori = inputs_embeds + batch_size, seq_length, hidden_size = inputs_embeds.shape + + if self.sequence_parallel: + inputs_embeds = inputs_embeds.reshape( + [-1, inputs_embeds.shape[-1]] + ) + inputs_embeds = ScatterOp.apply(inputs_embeds) + inputs_embeds = ( + inputs_embeds.reshape([batch_size, -1, hidden_size]) + .permute(1, 0, 2) + .contiguous() + ) # change to [S, B, H] + mtp_emb_res = [inputs_embeds] + for depth in range(self.config.num_nextn_predict_layers): + inputs_embeds_mtp = paddle.concat( + [ + inputs_embeds_ori[:, (depth + 1) :, :], + inputs_embeds_extra[:, : (depth + 1), :], + ], + axis=1, + ) + if self.sequence_parallel: + inputs_embeds_mtp = inputs_embeds_mtp.reshape( + [-1, inputs_embeds_mtp.shape[-1]] + ) + inputs_embeds_mtp = ScatterOp.apply(inputs_embeds_mtp) + inputs_embeds_mtp = ( + inputs_embeds_mtp.reshape( + [batch_size, -1, hidden_size] + ) + .permute(1, 0, 2) + .contiguous() + ) # change to [S, B, H] + mtp_emb_res.append(inputs_embeds_mtp) + + if self.multimodal_embedding: + image_embeds = dict_args.get("image_embeds", None) + video_embeds = dict_args.get("video_embeds", None) + if image_embeds is not None: + image_mask, _ = self.get_placeholder_mask( + input_ids, + inputs_embeds=decoder_input, + image_features=image_embeds, + ) + decoder_input = decoder_input.masked_scatter( + image_mask, image_embeds.astype(decoder_input.dtype) + ) + visual_pos_masks = image_mask[..., 0] + deepstack_visual_embeds = deepstack_image_embeds + + if video_embeds is not None: + _, video_mask = self.get_placeholder_mask( + input_ids, + inputs_embeds=decoder_input, + video_features=video_embeds, + ) + decoder_input = decoder_input.masked_scatter( + video_mask, video_embeds.astype(decoder_input.dtype) + ) + visual_pos_masks = video_mask[..., 0] + 
deepstack_visual_embeds = deepstack_video_embeds + + if image_embeds is not None and video_embeds is not None: + image_mask = image_mask[..., 0] + video_mask = video_mask[..., 0] + visual_pos_masks = image_mask | video_mask + deepstack_visual_embeds = [] + image_mask_joint = image_mask[visual_pos_masks] + video_mask_joint = video_mask[visual_pos_masks] + for img_embed, vid_embed in zip( + deepstack_image_embeds, deepstack_video_embeds + ): + embed_joint = img_embed.new_zeros( + visual_pos_masks.sum(), img_embed.shape[-1] + ).to(img_embed.device) + embed_joint[image_mask_joint, :] = img_embed + embed_joint[video_mask_joint, :] = vid_embed + deepstack_visual_embeds.append(embed_joint) + # Rotary positional embeddings (embedding is None for PP intermediate devices) + rotary_pos_emb = None + rotary_pos_cos = None + rotary_pos_sin = None + + if ( + self.position_embedding_type == "rope" + and self.rotary_pos_emb is not None + ): + rope_base = decoder_input if mtp_emb_res is None else mtp_emb_res[0] + rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len( + rope_base, self.config, packed_seq_params + ) + rotary_pos_emb = self.rotary_pos_emb( + rotary_seq_len, + packed_seq=packed_seq_params is not None + and packed_seq_params.qkv_format == "thd", + position_ids=position_ids, + ) + elif ( + self.position_embedding_type == "mrope" + and self.rotary_pos_emb is not None + ): + rotary_pos_emb = self.rotary_pos_emb( + position_ids, self.mrope_section + ) + + if rotary_pos_emb is not None: + if self.config.apply_rope_fusion: + rotary_pos_cos = paddle.cos(rotary_pos_emb) + rotary_pos_sin = paddle.sin(rotary_pos_emb) + if self.config.sequence_parallel: + rotary_pos_emb = rotary_pos_emb.transpose( + [1, 0, 2, 3] + ).contiguous() + + preproc_output = { + "hidden_states": decoder_input, + "attention_mask": attention_mask, + "attn_mask_startend_row_indices": attn_mask_startend_row_indices, + "rotary_pos_emb": rotary_pos_emb, + "rotary_pos_cos": rotary_pos_cos, + "rotary_pos_sin": 
rotary_pos_sin, + "position_ids": position_ids, + "deepstack_visual_emb": deepstack_visual_embeds, + "visual_pos_masks": visual_pos_masks, + } + if mtp_emb_res is not None: + assert ( + self.config.num_nextn_predict_layers is not None + and self.config.num_nextn_predict_layers > 0 + ) + assert len(mtp_emb_res) == self.config.num_nextn_predict_layers + 1 + hidden_states_concat = paddle.concat(mtp_emb_res) + preproc_output["hidden_states"] = hidden_states_concat + + for key in list(preproc_output.keys()): + if preproc_output[key] is None: + preproc_output.pop(key) + return preproc_output diff --git a/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py b/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py index 6b869bd40..221bd9980 100644 --- a/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py +++ b/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py @@ -219,3 +219,227 @@ def _forward_impl( if context is not None: return hidden_states, context, deepstack_feature return hidden_states, deepstack_feature + + +class Qwen3VLTextTransformerLayer(TransformerLayer): + """Qwen3VL text model for adapt deepstack process""" + + def forward( + self, + dict_args: dict, + ): + """ + Perform a forward pass through the transformer layer. + + This method calls the core computation of a transformer layer, including + self-attention, cross-attention (if applicable), and feed-forward operations. 
+ """ + # Remove 'dynamic_inference_decode_only' from kwargs if present + # this is only used to uniquely identify decode and non-decode cuda graph + # runners in the cuda graph manager + dict_args.pop("dynamic_inference_decode_only", None) + dict_args.pop("position_ids", None) + if self.full_recompute: + hidden_states = dict_args["hidden_states"] + attention_mask = dict_args.get("attention_mask", None) + attn_mask_startend_row_indices = dict_args.get( + "attn_mask_startend_row_indices", None + ) + context = dict_args.get("context", None) + context_mask = dict_args.get("context_mask", None) + rotary_pos_emb = dict_args.get("rotary_pos_emb", None) + rotary_pos_cos = dict_args.get("rotary_pos_cos", None) + rotary_pos_sin = dict_args.get("rotary_pos_sin", None) + attention_bias = dict_args.get("attention_bias", None) + packed_seq_params = dict_args.get("packed_seq_params", None) + deepstack_visual_emb = dict_args.get("deepstack_visual_emb", None) + visual_pos_masks = dict_args.get("visual_pos_masks", None) + + assert (rotary_pos_sin is None) == (rotary_pos_cos is None) + + if rotary_pos_cos is not None and rotary_pos_sin is not None: + rotary_pos_cos = rotary_pos_cos.clone() + rotary_pos_sin = rotary_pos_sin.clone() + if self.config.apply_rope_fusion: + rotary_pos_cos = rotary_pos_cos[0, ...] + rotary_pos_sin = rotary_pos_sin[0, ...] + if rotary_pos_cos.ndim == 2: + rotary_pos_cos = rotary_pos_cos.reshape( + [ + 1, + rotary_pos_cos.shape[0], + 1, + rotary_pos_cos.shape[1], + ] + ) + rotary_pos_sin = rotary_pos_sin.reshape( + [ + 1, + rotary_pos_sin.shape[0], + 1, + rotary_pos_sin.shape[1], + ] + ) + + outputs = recompute( + self._forward_impl, + hidden_states=hidden_states, + attention_mask=attention_mask, + attn_mask_startend_row_indices=attn_mask_startend_row_indices.clone() # Clone is necessary! 
+ if attn_mask_startend_row_indices is not None + else None, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb.clone() + if rotary_pos_emb is not None + else None, # Clone is necessary! + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + attention_bias=attention_bias, + packed_seq_params=packed_seq_params, + deepstack_visual_emb=deepstack_visual_emb, + visual_pos_masks=visual_pos_masks, + ) + else: + outputs = self._forward_impl(**dict_args) + + if isinstance(outputs, tuple): + output, context = outputs[0], outputs[1] + else: + output, context = outputs, None + + rst = OrderedDict() + rst = {"hidden_states": output} + if context is not None: + rst["context"] = context + rst = {**dict_args, **rst} + return rst + + def _forward_impl( + self, + hidden_states: paddle.Tensor, + attention_mask: paddle.Tensor = None, + attn_mask_startend_row_indices: paddle.Tensor = None, + context: paddle.Tensor = None, + context_mask: paddle.Tensor = None, + rotary_pos_emb: paddle.Tensor = None, + rotary_pos_cos: paddle.Tensor = None, + rotary_pos_sin: paddle.Tensor = None, + attention_bias: paddle.Tensor = None, + packed_seq_params: PackedSeqParams = None, + deepstack_visual_emb: list[paddle.Tensor] | None = None, + visual_pos_masks: paddle.Tensor = None, + ): + hidden_states, context = self._forward_attention( + hidden_states=hidden_states, + attention_mask=attention_mask, + attn_mask_startend_row_indices=attn_mask_startend_row_indices, + context=context, + context_mask=context_mask, + rotary_pos_emb=rotary_pos_emb, + rotary_pos_cos=rotary_pos_cos, + rotary_pos_sin=rotary_pos_sin, + attention_bias=attention_bias, + packed_seq_params=packed_seq_params, + ) + hidden_states = self._forward_mlp(hidden_states) + if deepstack_visual_emb and self.layer_number in range( + len(deepstack_visual_emb) + ): + # print("process _deepstack_process ",hidden_states.shape,visual_pos_masks.shape,deepstack_visual_emb[self.layer_number].shape) + hidden_states = 
self._deepstack_process( + hidden_states=hidden_states, + visual_embeds=deepstack_visual_emb[self.layer_number], + visual_pos_masks=visual_pos_masks, + ) + if context is not None: + return hidden_states, context + return hidden_states + + def _deepstack_process( + self, + hidden_states: paddle.Tensor, + visual_pos_masks: paddle.Tensor, + visual_embeds: paddle.Tensor, + ): + # Store original shape and flatten hidden_states to 2D [B*S, D] + original_shape = hidden_states.shape + if hidden_states.ndim > 2: + hidden_states = hidden_states.flatten(start_axis=0, stop_axis=1) + + visual_embeds = visual_embeds.to( + hidden_states.device, hidden_states.dtype + ) + + # complicated logic for sequential parallelism + if visual_pos_masks.ndim > 1: + visual_pos_masks = visual_pos_masks.flatten() + + # This block handles Sequence Parallelism (Row Slicing) + if visual_pos_masks.shape[0] > hidden_states.shape[0]: + try: + from paddle.distributed.fleet import ( + get_hybrid_communicate_group, + ) + + hcg = get_hybrid_communicate_group() + mp_rank = hcg.get_model_parallel_rank() + mp_size = hcg.get_model_parallel_world_size() + except (ImportError, AttributeError): + mp_size = visual_pos_masks.shape[0] // hidden_states.shape[0] + mp_rank = paddle.distributed.get_rank() % mp_size + total_len = visual_pos_masks.shape[0] + chunk_size = total_len // mp_size + start_idx = mp_rank * chunk_size + end_idx = start_idx + chunk_size + if start_idx > 0: + pre_mask = visual_pos_masks[:start_idx] + visual_offset = paddle.sum( + paddle.cast(pre_mask, "int32") + ).item() + else: + visual_offset = 0 + local_mask = visual_pos_masks[start_idx:end_idx] + local_visual_count = paddle.sum( + paddle.cast(local_mask, "int32") + ).item() + + visual_embeds = visual_embeds[ + visual_offset : visual_offset + local_visual_count + ] + visual_pos_masks = local_mask + + # If TP is enabled, hidden_states has shape [..., Hidden_Dim / TP_Size], + # but visual_embeds usually has full [Hidden_Dim]. 
We need to slice visual_embeds column-wise. + if hidden_states.shape[-1] != visual_embeds.shape[-1]: + try: + from paddle.distributed.fleet import ( + get_hybrid_communicate_group, + ) + + hcg = get_hybrid_communicate_group() + tp_rank = hcg.get_model_parallel_rank() + tp_size = hcg.get_model_parallel_world_size() + except (ImportError, AttributeError): + # Fallback simple estimation + tp_size = visual_embeds.shape[-1] // hidden_states.shape[-1] + tp_rank = paddle.distributed.get_rank() % tp_size + + if tp_size > 1: + embed_dim = visual_embeds.shape[-1] + slice_width = embed_dim // tp_size + start_col = tp_rank * slice_width + end_col = start_col + slice_width + visual_embeds = visual_embeds[:, start_col:end_col] + + hidden_states = hidden_states.clone() + local_this = hidden_states[visual_pos_masks, :] + visual_embeds + hidden_states[visual_pos_masks, :] = ( + local_this # 这个操作可能会导致paddle转静态图或推理时出问题,建议使用 scatter + ) + + # [Supplement 3] Restore original shape [B*S, D] -> [B, S, D] if necessary + if len(original_shape) > 2: + hidden_states = hidden_states.reshape(original_shape) + + return hidden_states From 0e737cf819e017a5cb4cec702f2bb055bf3427a9 Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Fri, 13 Feb 2026 19:14:58 +0800 Subject: [PATCH 2/9] complete qwen3vl model --- .../common/embeddings/rotary_pos_embedding.py | 6 +- src/paddlefleet/models/gpt/gpt_layer_specs.py | 1 + src/paddlefleet/models/gpt/gpt_model.py | 406 +-------------- src/paddlefleet/models/qwen3_vl/__init__.py | 30 ++ src/paddlefleet/models/qwen3_vl/embedding.py | 47 +- .../models/qwen3_vl/layer_specs.py | 31 +- .../models/qwen3_vl/patch_merger.py | 51 +- .../models/qwen3_vl/qwen3_vl_builders.py | 2 - .../models/qwen3_vl/qwen3_vl_model.py | 476 ++++++++++++++++-- .../models/qwen3_vl/qwen3_vl_provider.py | 16 +- .../transformer/transformer_encoder.py | 8 +- 11 files changed, 593 insertions(+), 481 deletions(-) create mode 100644 src/paddlefleet/models/qwen3_vl/__init__.py diff --git 
a/src/paddlefleet/models/common/embeddings/rotary_pos_embedding.py b/src/paddlefleet/models/common/embeddings/rotary_pos_embedding.py index d7b6ccae6..f8c0858a5 100644 --- a/src/paddlefleet/models/common/embeddings/rotary_pos_embedding.py +++ b/src/paddlefleet/models/common/embeddings/rotary_pos_embedding.py @@ -155,9 +155,9 @@ def get_freqs_non_repeated( return freqs - def get_cos_sin( - self, max_seq_len: int, offset: int = 0 - ) -> (Tensor, Tensor): + def get_cos_sin(self, max_seq_len: int, offset: int = 0) -> tuple[ + Tensor, Tensor + ]: """Cosine and sine values for RoPE are precomputed for all positions up to the maximum sequence length""" freqs = self.get_freqs_non_repeated(max_seq_len, offset) diff --git a/src/paddlefleet/models/gpt/gpt_layer_specs.py b/src/paddlefleet/models/gpt/gpt_layer_specs.py index 34995b883..9abbe8b20 100644 --- a/src/paddlefleet/models/gpt/gpt_layer_specs.py +++ b/src/paddlefleet/models/gpt/gpt_layer_specs.py @@ -369,6 +369,7 @@ def get_gpt_spec( extra_kwargs={ "config": config, "tie_word_embeddings": tie_word_embeddings, + "modal": "language_model" if config.multimodal_embedding else None, }, sublayers_spec=GPTSublayersSpec( embedding=LayerSpec( diff --git a/src/paddlefleet/models/gpt/gpt_model.py b/src/paddlefleet/models/gpt/gpt_model.py index 584249bee..10980f3e0 100644 --- a/src/paddlefleet/models/gpt/gpt_model.py +++ b/src/paddlefleet/models/gpt/gpt_model.py @@ -19,12 +19,8 @@ from paddlefleet.pipeline_parallel import ( LayerDesc, - PipelineLayer, SharedLayerDesc, ) -from paddlefleet.pipeline_parallel.pp_utils.utils import ( - dict_to_tuple_helper, -) if TYPE_CHECKING: from paddlefleet.spec_utils import LayerSpec @@ -34,8 +30,8 @@ from paddlefleet.models.gpt.gpt_embedding import GPTEmbedding from paddlefleet.models.gpt.lm_head import GPTLMHead from paddlefleet.pipeline_parallel import ScheduleChunk +from paddlefleet.transformer.transformer_encoder import TransformerEncoder from paddlefleet.transformer.transformer_layer import (
- TransformerLayer, TransformerLayerNode, TransformerLayerOverlappedScheduleNode, ) @@ -139,7 +135,7 @@ class GPTSublayersSpec: lm_head: LayerSpec | None = None -class GPTModel(PipelineLayer): +class GPTModel(TransformerEncoder): """GPT Transformer language model. Args: @@ -152,6 +148,7 @@ def __init__( **kwargs, ) -> None: self.config = kwargs["config"] + self.modal = kwargs.pop("modal", None) tie_word_embeddings = ( kwargs["tie_word_embeddings"] and self.config.pipeline_model_parallel_size > 1 @@ -176,7 +173,7 @@ def __init__( else fleet.get_hybrid_communicate_group().topology() ) - super().__init__( + super(TransformerEncoder, self).__init__( layers=self.layers, topology=topology, num_virtual_pipeline_stages=self.config.virtual_pipeline_model_parallel_size, @@ -193,7 +190,7 @@ def __init__( def get_layer_desc_list(self, spec, tie_word_embeddings): layers = [] - if "qwen3_vl" in getattr(self.config, "model_type", ""): + if self.modal: name_prefix = "model.language_model" else: name_prefix = "model" @@ -257,396 +254,3 @@ def get_layer_desc_list(self, spec, tie_word_embeddings): ) return layers - - def overlapped_forward_backward( - self, - forward_chunk, - forward_inputs, - forward_loss_fn_node, - backward_chunk, - backward_loss_fn_node, - backward_input_grads, - scaler, - p2p_async_handle, - ): - if backward_loss_fn_node is not None: - if scaler: - backward_input_grads = backward_loss_fn_node.backward( - scaler=scaler - ) - else: - backward_input_grads = backward_loss_fn_node.backward() - - ( - forward_pre_node, - backward_pre_node, - overlap_node, - forward_post_node, - backward_post_node, - ) = build_overlapped_nodes(forward_chunk, backward_chunk) - - if len(overlap_node.nodes) > 0: - assert not any( - isinstance(node, TransformerLayerNode) - for node in overlap_node.nodes - ) - # origin assert, why ? 
- # assert not any( - # isinstance(node, TransformerLayerNode) - # for node in forward_post_node.nodes - # ) - # assert not any( - # isinstance(node, TransformerLayerNode) - # for node in backward_post_node.nodes - # ) - - if p2p_async_handle is not None: - p2p_async_handle.forward_handle_wait() - p2p_async_handle.backward_handle_wait() - - forward_inputs = forward_pre_node.forward(forward_inputs) - backward_input_grads = backward_pre_node.backward(backward_input_grads) - - for i, node in enumerate(overlap_node.nodes): - forward_inputs, backward_input_grads = node.forward_backward( - forward_inputs, - backward_input_grads, - # split_bw=(i == len(overlap_node.nodes) - 1), - ) - - forward_inputs = forward_post_node.forward(forward_inputs) - backward_input_grads = backward_post_node.backward(backward_input_grads) - - # forward_inputs = forward_chunk.forward(forward_inputs) - - if p2p_async_handle is not None: - forward_inputs = dict_to_tuple_helper(forward_inputs) - p2p_async_handle.forward_async_comm(forward_inputs) - p2p_async_handle.backward_async_comm(backward_input_grads) - - # backward_input_grads = backward_chunk.backward(backward_input_grads) - - # used for bw split - # if len(overlap_node.nodes) > 0: - # WeightGradStore.pop() - # assert WeightGradStore.funcs_queue.empty() - - if forward_loss_fn_node is not None: - forward_loss = forward_loss_fn_node.forward(forward_inputs) - else: - forward_loss = None - - return forward_inputs, forward_loss, backward_input_grads - - def get_hardware_flops(self): - return 989e3 - - def add_sequential_layer(self, layers, layer_desc, name_prefix=""): - """ - Add a sequential layer to the network with specified description and name prefix. - - Args: - layers (list): List to store layer descriptions. Each element should be a dict - with keys "layer" (LayerDesc) and "name_prefix" (str). - layer_desc (LayerDesc|SharedLayerDesc): Layer description object containing - layer self.configuration. 
- name_prefix (str, optional): Prefix for layer names in the pipeline. - Defaults to empty string. - - Returns: - None: The layer description is appended to the input layers list. - """ - layers.append({"layer": layer_desc, "name_prefix": name_prefix}) - - def get_sequential_layers(self): - """ - Get all layers in the sequential network. - - Returns: - List[paddle.nn.Layer]: List containing all layers. - """ - return [x["layer"] for x in self._sequential_layers] - - def get_sequential_name_prefixes(self): - """ - Retrieve name prefixes for all parallel layers in the sequential network. - - Returns: - Dict[str, str]: A dictionary mapping layer indices (as strings) to their - corresponding name prefixes. The indices represent the position of - each layer in the sequential order. - """ - return { - str(index): x["name_prefix"] - for index, x in enumerate(self._sequential_layers) - } - - def get_shardlayer_prefix(self, name_splited): - """_summary_ - This function retrieves the prefix of a shared layer. The process involves: - 1. Identifying all key names of shared layers, like 'shared_weight01', 'shared_weight02', etc. - 2. For instance, given name_splited = ['shared_layers', 'shared_weight01', 'weight'], - the 'shared_layer_key' would be name_splited[1], which is 'shared_weight01'. - 3. By traversing through all layers, the function checks if the specified - shared_layer is present in the current stage. If found, it returns the corresponding prefix. - - Note: For retrieving all SharedLayer instances in Paddle, you can refer to the following Paddle code. 
- https://github.com/PaddlePaddle/Paddle/blob/2cf724d055679a1a0e48766dfb1708b920273078/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py#L460-L513 - Args: - name_splited (_type_): _description_ - - Returns: - _type_: _description_ - """ - shared_layer_names = { - s.layer_name for s in self.layers if isinstance(s, SharedLayerDesc) - } - assert name_splited[1] in shared_layer_names, ( - f"The shared layer name {name_splited[1]} must be in prefixes!" - ) - shared_layer_key = name_splited[1] - for idx, layer in enumerate(self.layers): - if ( - isinstance(layer, SharedLayerDesc) - and layer.layer_name == shared_layer_key - ): - if self.get_stage_from_index(idx) == self._stage_id: - return self.get_sequential_name_prefixes()[str(idx)] - - # the prefix must be in the current stage, else raise error - raise ValueError( - f"The shared layer {shared_layer_key} must be in the current stage!" - ) - - def _set_pipeline_name_mapping(self, mappings=None): - """ - Set the name mapping for pipeline. - - Args: - mappings (dict, optional): Dictionary storing name mapping relationships. Default is None, meaning no mapping operation. - - Returns: - dict: Returns the updated or existing mapping relationship. 
- - """ - if mappings is not None: - self._pipeline_name_mapping = mappings - else: - single_to_pp_mapping = {} - pp_to_single_mapping = {} - - state_dict_keys = list(super().state_dict().keys()) - first_key = "" - for k in state_dict_keys: - if "shared_layers" not in k: - first_key = k - break - first_key = first_key.split(".") - # if use virtual pp_degree, the prefix is like 0.0.xxx - # else it will be like 0.xxx - use_virtual_pp_degree = ( - first_key[0].isdigit() and first_key[1].isdigit() - ) - - prefixes = self.get_sequential_name_prefixes() - for k in state_dict_keys: - name_splited = k.split(".") - if use_virtual_pp_degree: - if name_splited[0].isdigit(): - if name_splited[1].isdigit(): - idx = str( - int(name_splited[0]) + int(name_splited[1]) - ) - single_name = [prefixes[idx]] - single_name.extend(name_splited[2:]) - else: - single_name = [prefixes[str(len(prefixes) - 1)]] - single_name.extend(name_splited[2:]) - logger.warning( - f"Please check! we treat this key as last layer, get {k}, \ - set origin name as {'.'.join(single_name)}" - ) - elif name_splited[0] == "shared_layers": - single_name = [self.get_shardlayer_prefix(name_splited)] - single_name.extend(name_splited[2:]) - else: - single_to_pp_mapping[k] = k - pp_to_single_mapping[k] = k - continue - else: - idx = name_splited[0] - # for normal pp layer - if idx.isdigit(): - # allow empty prefix - single_name = ( - [] if prefixes[idx] == "" else [prefixes[idx]] - ) - single_name.extend(name_splited[1:]) - elif idx == "shared_layers": - single_name = [self.get_shardlayer_prefix(name_splited)] - single_name.extend(name_splited[2:]) - else: - single_to_pp_mapping[k] = k - pp_to_single_mapping[k] = k - continue - - single_to_pp_mapping[".".join(single_name)] = k - pp_to_single_mapping[k] = ".".join(single_name) - - self._pipeline_name_mapping = single_to_pp_mapping - self._pp_to_single_mapping = pp_to_single_mapping - - return self._pipeline_name_mapping - - def state_dict(self, *args, **kwargs): - """ 
- Return a dictionary with Pipeline Stage mapping. - Args: - *args (tuple): Variable argument list passed to parent method. - **kwargs (dict): Optional keyword arguments passed to parent method. - Returns: - dict: Dictionary containing Pipeline Stage mapping. - """ - state_dict = super().state_dict(*args, **kwargs) - - if "qwen3_vl" in getattr(self.config, "model_type", ""): - name_prefix = "model.language_model." - else: - name_prefix = "" - if self._pipeline_name_mapping is None: - self._set_pipeline_name_mapping() - # assert len(self._pipeline_name_mapping) > 0, "The pipeline stage must have parameters!" - for k in list(state_dict.keys()): - v = state_dict.pop(k) - if name_prefix and k.startswith(name_prefix): - k = k[len(name_prefix) :] - if k not in self._pp_to_single_mapping: - state_dict[k] = v - continue - v.key = self._pp_to_single_mapping[k] - state_dict[self._pp_to_single_mapping[k]] = v - return state_dict - - def set_state_dict(self, state_dict, *args, **kwargs): - if self._pipeline_name_mapping is None: - self._set_pipeline_name_mapping() - assert len(self._pipeline_name_mapping) > 0, ( - "The pipeline stage must have parameters!" 
- ) - - for k in list(state_dict.keys()): - v = state_dict.pop(k) - if k not in self._pipeline_name_mapping: - continue - state_dict[self._pipeline_name_mapping[k]] = v - - ret = super().set_state_dict(state_dict, *args, **kwargs) - return ret - - def _check_shared_model_state(self): - if self._pipeline_name_mapping is None: - self._set_pipeline_name_mapping() - - super_state_dict = super().state_dict() - structure_name_to_tensor = {} - for k, v in super_state_dict.items(): - k = self._pp_to_single_mapping[k] - if k not in structure_name_to_tensor: - structure_name_to_tensor[k] = v - else: - old_v = structure_name_to_tensor[k] - assert old_v is v, ( - f"Shared tensor with different structure name: {k}" - ) - - missing_shared_keys = {} - for k, v in self._pp_to_single_mapping.items(): - mapped_k = self._pipeline_name_mapping[v] - if k != mapped_k: - missing_shared_keys[k] = mapped_k - return missing_shared_keys - - def sharded_state_dict(self, *args, **kwargs): - """ - sharded_state_dict method for PipelinePretrainedModel. - - Remaps parameter keys according to the pipeline stage mapping, and converts expert indices from local to global. - """ - sharded_state_dict = super().sharded_state_dict(*args, **kwargs) - if self._pipeline_name_mapping is None: - self._set_pipeline_name_mapping() - - if "qwen3_vl" in getattr(self.config, "model_type", ""): - name_prefix = "model.language_model." 
- else: - name_prefix = "" - - for k in list(sharded_state_dict.keys()): - v = sharded_state_dict.pop(k) - # remove name_prefix - if name_prefix and k.startswith(name_prefix): - k = k[len(name_prefix) :] - if k not in self._pp_to_single_mapping: - sharded_state_dict[k] = v - continue - v.key = self._pp_to_single_mapping[k] - sharded_state_dict[self._pp_to_single_mapping[k]] = v - - def increment_expert_number(s, increment): - import re - - def replace(match): - original_number = int(match.group(0)) - new_number = original_number + increment - return str(new_number) - - return re.sub(r"(?<=experts\.)\d+", replace, s) - - renamed_sharded_state_dict = {} - for k, v in sharded_state_dict.items(): - global_expert_id_offset = getattr( - v, "global_expert_id_offset", None - ) - layer_cnt = getattr(v, "layer_cnt", None) - if global_expert_id_offset is not None: - new_key = increment_expert_number(k, global_expert_id_offset) - v.key = new_key - delattr(v, "global_expert_id_offset") - renamed_sharded_state_dict[new_key] = v - elif layer_cnt is not None: - new_key = k + "_layer_" + str(layer_cnt) - v.key = new_key - delattr(v, "layer_cnt") - renamed_sharded_state_dict[new_key] = v - else: - renamed_sharded_state_dict[k] = v - - return renamed_sharded_state_dict - - def fp8_quant_weight(self, batch_mode=False, quant_transpose=True): - if self._num_virtual_pipeline_stages > 1: - for idx, chunk in enumerate(self._model_chunks): - for idx, layer in enumerate(chunk): - if isinstance(layer, TransformerLayer): - layer.fp8_quant_weight( - batch_mode=batch_mode, - quant_transpose=quant_transpose, - ) - else: - for idx, layer in enumerate(self.run_function): - if isinstance(layer, TransformerLayer): - layer.fp8_quant_weight( - batch_mode=batch_mode, quant_transpose=quant_transpose - ) - - def use_fp8(self): - if self._num_virtual_pipeline_stages > 1: - for idx, chunk in enumerate(self._model_chunks): - for idx, layer in enumerate(chunk): - if isinstance(layer, TransformerLayer) and 
layer.use_fp8(): - return True - else: - for idx, layer in enumerate(self.run_function): - if isinstance(layer, TransformerLayer) and layer.use_fp8(): - return True - return False diff --git a/src/paddlefleet/models/qwen3_vl/__init__.py b/src/paddlefleet/models/qwen3_vl/__init__.py new file mode 100644 index 000000000..291619e4e --- /dev/null +++ b/src/paddlefleet/models/qwen3_vl/__init__.py @@ -0,0 +1,30 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from .embedding import Qwen3VLTextEmbedding +from .qwen3_vl_model import ( + Qwen3VLModelDist, + Qwen3VLTextTransformerLayer, + Qwen3VLVisionModel, + Qwen3VLVisionTransformerLayer, +) +from .qwen3_vl_provider import Qwen3VLVisionProvider + +__all__ = [ + "Qwen3VLTextEmbedding", + "Qwen3VLModelDist", + "Qwen3VLTextTransformerLayer", + "Qwen3VLVisionModel", + "Qwen3VLVisionTransformerLayer", + "Qwen3VLVisionProvider", +] diff --git a/src/paddlefleet/models/qwen3_vl/embedding.py b/src/paddlefleet/models/qwen3_vl/embedding.py index a060e6a20..14c87020b 100644 --- a/src/paddlefleet/models/qwen3_vl/embedding.py +++ b/src/paddlefleet/models/qwen3_vl/embedding.py @@ -32,6 +32,22 @@ class VisionEmbeddingSpec: rope_embedding: LayerSpec = None +class VisionRotaryEmbedding(nn.Layer): + inv_freq: paddle.Tensor + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + inv_freq = 1.0 / ( + theta ** (paddle.arange(0, dim, 2, dtype=paddle.float32) / dim) + ) + self.register_buffer("inv_freq", inv_freq, persistable=False) + + def forward(self, seqlen: int) -> paddle.Tensor: + seq = paddle.arange(seqlen, dtype=self.inv_freq.dtype) + freqs = paddle.outer(seq, self.inv_freq) + return freqs + + class VisionEmbedding(FleetLayer): def __init__( self, @@ -219,7 +235,20 @@ def forward(self, dict_args: dict): pixel_values = dict_args["pixel_values"] grid_thw = dict_args["grid_thw"] - hidden_states = self.patch_embed(pixel_values).view() + target_dtype = self.patch_embed.weight.dtype + hidden_states = pixel_values.view( + -1, + self.in_channels, + self.temporal_patch_size, + self.patch_size, + self.patch_size, + ) + hidden_states = ( + self.patch_embed(hidden_states) + .to(dtype=target_dtype) + .view(-1, self.embed_dim) + ) + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) hidden_states = hidden_states + pos_embeds @@ -227,11 +256,14 @@ def forward(self, dict_args: dict): hidden_states = hidden_states.reshape([seq_len, -1]) hidden_states = 
hidden_states.unsqueeze(0) - rotary_pos_emb = self.rotary_pos_emb(grid_thw) - rotary_pos_cos, rotary_pos_sin = self.rotary_pos_emb.get_cos_sin( - grid_thw - ) + rotary_pos_emb = self.rot_pos_emb(grid_thw) + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + rotary_pos_emb = paddle.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + rotary_pos_cos = rotary_pos_emb.cos() + rotary_pos_sin = rotary_pos_emb.sin() + rotary_pos_emb = rotary_pos_emb[:, None, None, :] + rotary_pos_emb = rotary_pos_emb.transpose([1, 0]) packed_seq_params = self.get_packed_seq_params(grid_thw) preproc_output = { @@ -246,7 +278,7 @@ def forward(self, dict_args: dict): return preproc_output -class TextEmbedding(GPTEmbedding): +class Qwen3VLTextEmbedding(GPTEmbedding): def forward( self, dict_args: dict, @@ -427,3 +459,6 @@ def forward( if preproc_output[key] is None: preproc_output.pop(key) return preproc_output + + +__all__ = ["Qwen3VLTextEmbedding"] diff --git a/src/paddlefleet/models/qwen3_vl/layer_specs.py b/src/paddlefleet/models/qwen3_vl/layer_specs.py index 362b0d32f..7c3b60eea 100644 --- a/src/paddlefleet/models/qwen3_vl/layer_specs.py +++ b/src/paddlefleet/models/qwen3_vl/layer_specs.py @@ -16,12 +16,16 @@ from ...fusions.fused_bias_dropout import get_bias_dropout_add from ...spec_utils import LayerSpec from ...transformer.attention import SelfAttention, SelfAttentionSublayersSpec +from ...transformer.enums import AttnMaskType from ...transformer.identity_op import IdentityOp from ...transformer.transformer_config import TransformerConfig from ..backends import LocalSpecProvider -from ..common.embeddings.rotary_pos_embedding import RotaryEmbedding from ..gpt.gpt_layer_specs import get_mlp_layer_spec_for_backend -from .embedding import VisionEmbedding, VisionEmbeddingSpec +from .embedding import ( + VisionEmbedding, + VisionEmbeddingSpec, + VisionRotaryEmbedding, +) from .patch_merger import Qwen3VLVisionPatchMergerSpec, Qwen3VLVisionPathMerger from .qwen3_vl_model import ( 
Qwen3VLVisionModel, @@ -38,8 +42,8 @@ def get_qwen3_vl_vision_layer_local_spec( append_deepstack: bool = False, ) -> LayerSpec: backend = LocalSpecProvider() - layer_norm = backend.layer_norm(rms_norm=False, for_qk=False) - qk_norm = backend.layer_norm(rms_norm=False, for_qk=True) + layer_norm = backend.layer_norm(rms_norm=False, for_qk=False, fused=False) + qk_norm = backend.layer_norm(rms_norm=False, for_qk=True, fused=False) mlp = get_mlp_layer_spec_for_backend( backend=backend, ) @@ -48,7 +52,9 @@ def get_qwen3_vl_vision_layer_local_spec( layer=Qwen3VLVisionPathMerger, sublayers_spec=Qwen3VLVisionPatchMergerSpec( backend.layer_norm( - rms_norm=(config.normalization == "RMSNorm"), for_qk=False + rms_norm=(config.normalization == "RMSNorm"), + for_qk=False, + fused=False, ) ), extra_kwargs={"config": config, "use_postshuffle_norm": True}, @@ -66,6 +72,7 @@ def get_qwen3_vl_vision_layer_local_spec( q_norm=qk_norm if use_qk_norm else IdentityOp, k_norm=qk_norm if use_qk_norm else IdentityOp, ), + extra_kwargs={"attn_mask_type": AttnMaskType.no_mask}, ), self_attn_bda=get_bias_dropout_add, post_attention_layernorm=layer_norm, @@ -97,8 +104,8 @@ def get_qwen3vl_vision_encoder_layers_spec( use_qk_norm=config.use_qk_norm, ) layer_specs = [] - append_deepstack = False for layer_number in range(config.num_hidden_layers): + append_deepstack = False real_layer_number = layer_number + config.num_empty_layers_add_in_head if layer_number in config.deepstack_visual_indexes: append_deepstack = True @@ -117,21 +124,17 @@ def get_qwen3_vl_vision_spec( transformer_layers_spec: list[LayerSpec], head_empty_layers_spec: list[LayerSpec] | None = None, tail_empty_layer_spec: list[LayerSpec] | None = None, - rotary_percent: float = 1.0, - rotary_base: int = 10000, - rope_scaling: bool = False, + rotary_base: int = 10000.0, ): backend = LocalSpecProvider() embedding_extra_kwargs = {"config": config} rotary_emb_extra_kwargs = { - "head_dim": config.head_dim // 2, - "rotary_base": 
rotary_base, - "rope_scaling": rope_scaling, - "rotary_percent": rotary_percent, + "dim": config.head_dim // 2, + "theta": rotary_base, } embedding_spec = VisionEmbeddingSpec( rope_embedding=LayerSpec( - layer=RotaryEmbedding, + layer=VisionRotaryEmbedding, extra_kwargs=rotary_emb_extra_kwargs, ) ) diff --git a/src/paddlefleet/models/qwen3_vl/patch_merger.py b/src/paddlefleet/models/qwen3_vl/patch_merger.py index 5a85f4ca7..200c478ea 100644 --- a/src/paddlefleet/models/qwen3_vl/patch_merger.py +++ b/src/paddlefleet/models/qwen3_vl/patch_merger.py @@ -13,14 +13,11 @@ # limitations under the License. from dataclasses import dataclass -import paddle from paddle import nn -from paddle.nn import functional as F from ...spec_utils import LayerSpec, build_layer from ...tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from ...transformer.identity_op import IdentityOp -from ...transformer.mlp import MLP, MLPSublayersSpec @dataclass @@ -49,24 +46,33 @@ def __init__( sublayers_spec.norm, config=config, hidden_size=norm_dim ) self.use_postshuffle_norm = use_postshuffle_norm - self.mlp = build_layer( - LayerSpec( - layer=MLP, - sublayers_spec=MLPSublayersSpec( - up_gate_proj=ColumnParallelLinear, - down_proj=RowParallelLinear, - hidden_act=F.gelu, - ), - extra_kwargs={ - "config": config, - "input_size": self.hidden_size, - "intermediate_size": self.hidden_size, - "hidden_size": dim, - }, - ) + + self.linear_fc1 = build_layer( + ColumnParallelLinear, + config=config, + input_size=self.hidden_size, + output_size=self.hidden_size, + init_method=config.init_method, + bias=config.use_bias, + skip_bias_add=False, + gather_output=False, + is_expert=False, + ) + self.act_fn = nn.GELU() + self.linear_fc2 = build_layer( + RowParallelLinear, + config=config, + input_size=self.hidden_size, + output_size=dim, + init_method=config.output_layer_init_method, + input_is_parallel=True, + bias=config.use_bias, + skip_bias_add=False, + is_expert=False, ) - def forward(self, 
x: paddle.Tensor): + def forward(self, dict_args): + x = dict_args["hidden_states"] if self.use_postshuffle_norm: x = self.norm(x.reshape([-1, self.hidden_size])) x = x.reshape([-1, self.hidden_size]) @@ -74,5 +80,8 @@ def forward(self, x: paddle.Tensor): x = self.norm(x) x = x.reshape([-1, self.hidden_size]) - x = self.mlp(x) - return x + x, _ = self.linear_fc1(x) + x, _ = self.linear_fc2(self.act_fn(x)) + rst = {"hidden_states": x} + rst = {**dict_args, **rst} + return rst diff --git a/src/paddlefleet/models/qwen3_vl/qwen3_vl_builders.py b/src/paddlefleet/models/qwen3_vl/qwen3_vl_builders.py index 562e88ce1..efb32eedf 100644 --- a/src/paddlefleet/models/qwen3_vl/qwen3_vl_builders.py +++ b/src/paddlefleet/models/qwen3_vl/qwen3_vl_builders.py @@ -42,8 +42,6 @@ def qwen3_vl_vision_builder(config, **kwargs): transformer_layers_spec=transformer_layer_specs, tail_empty_layer_spec=tail_empty_layers_spec, rotary_base=config.rope_theta, - rotary_percent=config.rotary_percent, - rope_scaling=config.rope_scaling, ) return build_layer(res_spec, **kwargs) diff --git a/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py b/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py index 221bd9980..9bfd26f1f 100644 --- a/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py +++ b/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py @@ -21,6 +21,8 @@ from ...pipeline_parallel import LayerDesc from ...process_groups_config import ProcessGroupCollection from ...spec_utils import LayerSpec, build_layer +from ...transformer.enums import ModelType +from ...transformer.layer import FleetLayer from ...transformer.transformer_config import TransformerConfig from ...transformer.transformer_encoder import TransformerEncoder from ...transformer.transformer_layer import ( @@ -29,6 +31,15 @@ ) +def get_image_sequence_length( + img_h, img_w, patch_dim, add_class_token, class_token_len +): + num_patches_per_dim_h = img_h // patch_dim + num_patches_per_dim_w = img_w // patch_dim + num_patches = 
num_patches_per_dim_h * num_patches_per_dim_w + return num_patches + (class_token_len if add_class_token else 0) + + @dataclass class Qwen3VLVisionSublayersSpec: """ @@ -167,21 +178,21 @@ def forward( else: outputs = self._forward_impl(**dict_args) - if len(outputs) == 3: - output, context = outputs[0], outputs[1] - else: - output, context = outputs, None - - deepstack_feature = outputs[-1] + context, deepstack_feature = None, None + hidden_states = outputs[0] + if len(outputs) > 1: + deepstack_feature = outputs[-1] + if len(outputs) == 3: + context = outputs[1] rst = OrderedDict() - rst = {"hidden_states": output} + rst = {"hidden_states": hidden_states} if context is not None: rst["context"] = context - if "deepstack_feature_lists" not in rst: - rst["deepstack_feature_lists"] = [] + if "deepstack_feature_lists" not in dict_args: + dict_args["deepstack_feature_lists"] = [] if deepstack_feature is not None: - rst["deepstack_feature_lists"].append(deepstack_feature) + dict_args["deepstack_feature_lists"].append(deepstack_feature) rst = {**dict_args, **rst} return rst @@ -210,15 +221,21 @@ def _forward_impl( attention_bias=attention_bias, packed_seq_params=packed_seq_params, ) + hidden_states = self._forward_mlp(hidden_states) deepstack_feature = None if self.deepstack_merger is not None: - deepstack_feature = self.deepstack_merger(hidden_states) + deepstack_feature = self.deepstack_merger( + {"hidden_states": hidden_states} + )["hidden_states"] + res = (hidden_states,) if context is not None: - return hidden_states, context, deepstack_feature - return hidden_states, deepstack_feature + res += (context,) + if deepstack_feature is not None: + res += (deepstack_feature,) + return res class Qwen3VLTextTransformerLayer(TransformerLayer): @@ -239,6 +256,9 @@ def forward( # runners in the cuda graph manager dict_args.pop("dynamic_inference_decode_only", None) dict_args.pop("position_ids", None) + deepstack_visual_emb = dict_args.get("deepstack_visual_emb", None) + 
visual_pos_masks = dict_args.get("visual_pos_masks", None) + if self.full_recompute: hidden_states = dict_args["hidden_states"] attention_mask = dict_args.get("attention_mask", None) @@ -252,8 +272,6 @@ def forward( rotary_pos_sin = dict_args.get("rotary_pos_sin", None) attention_bias = dict_args.get("attention_bias", None) packed_seq_params = dict_args.get("packed_seq_params", None) - deepstack_visual_emb = dict_args.get("deepstack_visual_emb", None) - visual_pos_masks = dict_args.get("visual_pos_masks", None) assert (rotary_pos_sin is None) == (rotary_pos_cos is None) @@ -297,8 +315,6 @@ def forward( rotary_pos_sin=rotary_pos_sin, attention_bias=attention_bias, packed_seq_params=packed_seq_params, - deepstack_visual_emb=deepstack_visual_emb, - visual_pos_masks=visual_pos_masks, ) else: outputs = self._forward_impl(**dict_args) @@ -308,6 +324,17 @@ def forward( else: output, context = outputs, None + # Apply deepstack visual embedding outside of recompute to avoid issues + # with recompute not properly handling list-of-tensors (deepstack_visual_emb) + if deepstack_visual_emb and self.layer_number in range( + len(deepstack_visual_emb) + ): + output = self._deepstack_process( + hidden_states=output, + visual_embeds=deepstack_visual_emb[self.layer_number], + visual_pos_masks=visual_pos_masks, + ) + rst = OrderedDict() rst = {"hidden_states": output} if context is not None: @@ -327,8 +354,7 @@ def _forward_impl( rotary_pos_sin: paddle.Tensor = None, attention_bias: paddle.Tensor = None, packed_seq_params: PackedSeqParams = None, - deepstack_visual_emb: list[paddle.Tensor] | None = None, - visual_pos_masks: paddle.Tensor = None, + **kwargs, ): hidden_states, context = self._forward_attention( hidden_states=hidden_states, @@ -343,15 +369,6 @@ def _forward_impl( packed_seq_params=packed_seq_params, ) hidden_states = self._forward_mlp(hidden_states) - if deepstack_visual_emb and self.layer_number in range( - len(deepstack_visual_emb) - ): - # print("process 
_deepstack_process ",hidden_states.shape,visual_pos_masks.shape,deepstack_visual_emb[self.layer_number].shape) - hidden_states = self._deepstack_process( - hidden_states=hidden_states, - visual_embeds=deepstack_visual_emb[self.layer_number], - visual_pos_masks=visual_pos_masks, - ) if context is not None: return hidden_states, context return hidden_states @@ -443,3 +460,406 @@ def _deepstack_process( hidden_states = hidden_states.reshape(original_shape) return hidden_states + + +class Qwen3VLModelDist(FleetLayer): + """Qwen3VL Model Base Model Class.""" + + def __init__( + self, + config: TransformerConfig, + tokenizer=None, + pre_process: bool = True, + post_process: bool = True, + add_encoder: bool = True, + add_decoder: bool = True, + drop_vision_class_token: bool = False, + vp_stage: int | None = None, + model_version: str | None = None, + criterion=False, + ) -> None: + super().__init__(config=config) + + language_transformer_config = config.text_config + vision_transformer_config = config.vision_config + self.model_version = ( + vision_transformer_config.model_version + if model_version is None + else model_version + ) + self._language_max_sequence_length = ( + language_transformer_config.max_sequence_length + ) + assert self.model_version is not None + + self.config = config + self.pre_process = pre_process + self.post_process = post_process + self.add_encoder = add_encoder + self.add_decoder = add_decoder + self.vp_stage = vp_stage + + self.encoder_hidden_state = None + self.vision_model = None + self.language_model = None + self.image_token_index = config.image_token_id + self.video_token_index = config.video_token_id + + self.sequence_parallel_lm = ( + language_transformer_config.sequence_parallel + ) + self.tp_comm_overlap_lm = language_transformer_config.tp_comm_overlap + self.context_parallel_lm = ( + language_transformer_config.context_parallel_size + ) + assert not ( + self.sequence_parallel_lm or self.context_parallel_lm > 1 + ), ( + f"qwenvl donnot 
support sequence parallel {self.sequence_parallel_lm} " + f"or context parallel {self.context_parallel_lm}" + ) + self.share_embeddings_and_output_weights = False + self.rope_deltas = None + + if self.add_decoder: + self.language_model = language_transformer_config.provide( + pre_process=pre_process, + post_process=post_process, + vp_stage=vp_stage, + ) + self._language_is_pipeline_parallel = ( + language_transformer_config.pipeline_model_parallel_size > 1 + ) + + if self.add_encoder: + self.vision_model = vision_transformer_config.provide() + self._drop_vision_class_token = drop_vision_class_token + + self.model_type = ModelType.encoder_or_decoder + + self._img_seq_len = get_image_sequence_length( + img_h=vision_transformer_config.img_h, + img_w=vision_transformer_config.img_w, + patch_dim=vision_transformer_config.patch_size, + add_class_token=not drop_vision_class_token, + class_token_len=vision_transformer_config.class_token_len, + ) + self.criterion = criterion + + def get_rope_index( + self, + input_ids: paddle.LongTensor | None = None, + image_grid_thw: paddle.LongTensor | None = None, + video_grid_thw: paddle.LongTensor | None = None, + attention_mask: paddle.Tensor | None = None, + ) -> tuple[paddle.Tensor, paddle.Tensor]: + if video_grid_thw is not None: + video_grid_thw = paddle.repeat_interleave( + video_grid_thw, video_grid_thw[:, 0], dim=0 + ) + video_grid_thw[:, 0] = 1 + + spatial_merge_size = self.config.vision_config.spatial_merge_size + # TODO when implemented data file. 
+ image_token_id = self.image_token_index + video_token_id = self.video_token_index + vision_start_token_id = 151652 + mrope_position_deltas = [] + if input_ids is not None and ( + image_grid_thw is not None or video_grid_thw is not None + ): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = paddle.ones_like(total_input_ids) + position_ids = paddle.ones( + [3, input_ids.shape[0], input_ids.shape[1]], + dtype=input_ids.dtype, + ) + image_index, video_index = 0, 0 + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + image_nums, video_nums = 0, 0 + vision_start_indices = paddle.argwhere( + input_ids == vision_start_token_id + ).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = ( + llm_pos_ids_list[-1].max() + 1 + if llm_pos_ids_list + else 0 + ) + 
llm_pos_ids_list.append( + paddle.arange(text_len).view(1, -1).expand(3, -1) + + st_idx + ) + + t_index = ( + paddle.arange(llm_grid_t) + .view(-1, 1) + .expand(-1, llm_grid_h * llm_grid_w) + .flatten() + ) + h_index = ( + paddle.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_index = ( + paddle.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + paddle.stack([t_index, h_index, w_index]) + + text_len + + st_idx + ) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = ( + llm_pos_ids_list[-1].max() + 1 + if len(llm_pos_ids_list) > 0 + else 0 + ) + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + paddle.arange(text_len).view(1, -1).expand(3, -1) + + st_idx + ) + + llm_positions = paddle.cat(llm_pos_ids_list, dim=1).reshape( + 3, -1 + ) + position_ids[..., i, attention_mask[i] == 1] = llm_positions + mrope_position_deltas.append( + llm_positions.max() + 1 - len(total_input_ids[i]) + ) + mrope_position_deltas = paddle.to_tensor( + mrope_position_deltas + ).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = ( + position_ids.unsqueeze(0) + .expand(3, -1, -1) + .to(attention_mask.device) + ) + max_position_ids = position_ids.max(0, keepdim=False)[0].max( + -1, keepdim=True + )[0] + mrope_position_deltas = ( + max_position_ids + 1 - attention_mask.shape[-1] + ) + else: + position_ids = ( + paddle.arange(input_ids.shape[1]) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = paddle.zeros( + [input_ids.shape[0], 1], + dtype=input_ids.dtype, + ) + return position_ids, mrope_position_deltas + + def get_video_features( + self, + pixel_values_videos: paddle.FloatTensor, + video_grid_thw: paddle.LongTensor | None 
= None, + ): + return self.get_image_features(pixel_values_videos, video_grid_thw) + + def get_image_features( + self, + pixel_values: paddle.FloatTensor, + image_grid_thw: paddle.LongTensor | None = None, + ): + dict_args = { + "pixel_values": pixel_values, + "grid_thw": image_grid_thw, + } + vision_output = self.vision_model(dict_args) + image_embeds, deepstack_image_embeds = ( + vision_output["hidden_states"], + vision_output["deepstack_feature_lists"], + ) + split_sizes = ( + image_grid_thw.prod(-1) + // self.config.vision_config.spatial_merge_size**2 + ).tolist() + image_embeds = paddle.split(image_embeds, split_sizes) + return image_embeds, deepstack_image_embeds + + def forward( + self, + input_ids: paddle.LongTensor = None, + attention_mask: paddle.Tensor | None = None, + position_ids: paddle.LongTensor | None = None, + loss_mask: paddle.Tensor | None = None, + labels: paddle.Tensor | None = None, + inference_params=None, + pixel_values: paddle.Tensor | None = None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + runtime_gather_output: bool | None = None, + cache_position: paddle.Tensor | None = None, + attn_mask_startend_row_indices: paddle.Tensor | None = None, + **kwargs, + ) -> paddle.Tensor: + assert loss_mask is None, "loss_mask is not supported yet" + ( + image_embeds, + video_embeds, + deepstack_image_embeds, + deepstack_video_embeds, + ) = (None for _ in range(4)) + if self.add_encoder and pixel_values is not None: + pixel_values = pixel_values.to( + self.vision_model.parameters()[0].dtype + ) + image_embeds, deepstack_image_embeds = self.get_image_features( + pixel_values, image_grid_thw + ) + image_embeds = paddle.cat(image_embeds, dim=0) + + if self.add_encoder and pixel_values_videos is not None: + pixel_values_videos = pixel_values_videos.to( + self.vision_model.parameters()[0].dtype + ) + video_embeds, deepstack_video_embeds = self.get_video_features( + pixel_values_videos, video_grid_thw + ) + video_embeds = 
paddle.cat(video_embeds, axis=0) + + if position_ids is None: + if ( + self.rope_deltas is None + or cache_position is None + or cache_position[0] == 0 + ): + position_ids, rope_deltas = self.get_rope_index( + input_ids, + image_grid_thw, + video_grid_thw, + attention_mask=attention_mask, + ) + self.rope_deltas = rope_deltas + else: + batch_size, seq_length = input_ids.shape + position_ids = paddle.arange(seq_length) + position_ids = position_ids.view(1, 1, -1).expand( + 3, batch_size, -1 + ) + if cache_position is not None: + delta = cache_position[0] + self.rope_deltas + else: + delta = paddle.zeros((batch_size, seq_length)) + delta = delta.repeat_interleave( + batch_size // delta.shape[0], axis=1 + ) + position_ids = position_ids + delta + else: + if position_ids.shape == input_ids.shape: + position_ids = position_ids.expand(3, position_ids.shape[0], -1) + + input_dict = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": None, + "attn_mask_startend_row_indices": attn_mask_startend_row_indices, + "decoder_input": None, + "image_embeds": image_embeds, + "video_embeds": video_embeds, + "labels": labels, + "deepstack_image_embeds": deepstack_image_embeds, + "deepstack_video_embeds": deepstack_video_embeds, + "runtime_gather_output": runtime_gather_output, + } + output = self.language_model(input_dict) + + # print("qwenvl criterion ",self.criterion) + if labels is None: + return output + elif self.criterion is not None: + # print("qwenvl output loss ",self.criterion(output, labels)) + return self.criterion(output, labels) + else: + return output + + def set_input_tensor(self, input_tensor) -> None: + """Set model chunk input tensor.""" + # This is usually handled in schedules.py but some inference code still + # gives us non-lists or None + if not isinstance(input_tensor, list): + input_tensor = [input_tensor] + assert len(input_tensor) == 1, ( + "input_tensor should only be length 1 for llava" + ) + + if self.add_encoder and 
self.add_decoder: + self.vision_model.set_input_tensor(input_tensor[0]) + elif self.add_encoder: + self.vision_model.set_input_tensor(input_tensor[0]) + elif self.pre_process: + self.encoder_hidden_state = input_tensor[0] + else: + self.language_model.set_input_tensor(input_tensor[0]) + + # def get_input_embeddings(self): + # return self.language_model.get_input_embeddings() + + +__all__ = [ + "Qwen3VLTextTransformerLayer", + "Qwen3VLVisionModel", + "Qwen3VLVisionTransformerLayer", + "Qwen3VLModelDist", +] diff --git a/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py b/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py index 38113f65d..7400fc804 100644 --- a/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py +++ b/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py @@ -85,9 +85,21 @@ def provide(self) -> "Qwen3VLVisionModel": model_init_device_context = partial(paddle.device, device="meta") with model_init_device_context(): - res_model = qwen3_vl_vision_builder( + fleet_model = qwen3_vl_vision_builder( self, seg_method="layer:TransformerLayer|EmptyLayer", num_stages=pp_size, ) - return res_model + model = Qwen3VLVisionModel.__new__(Qwen3VLVisionModel) + + for attr_name in dir(fleet_model): + if not attr_name.startswith("__"): + try: + attr_value = getattr(fleet_model, attr_name) + setattr(model, attr_name, attr_value) + except: + pass + return model + + +__all__ = ["Qwen3VLVisionProvider"] diff --git a/src/paddlefleet/transformer/transformer_encoder.py b/src/paddlefleet/transformer/transformer_encoder.py index 39b4c26b2..bb7210be2 100644 --- a/src/paddlefleet/transformer/transformer_encoder.py +++ b/src/paddlefleet/transformer/transformer_encoder.py @@ -436,8 +436,8 @@ def state_dict(self, *args, **kwargs): """ state_dict = super().state_dict(*args, **kwargs) - if "qwen3_vl" in getattr(self.config, "model_type", ""): - name_prefix = "model.language_model." + if self.modal is not None: + name_prefix = f"model.{self.modal}." 
else: name_prefix = "" if self._pipeline_name_mapping is None: @@ -503,8 +503,8 @@ def sharded_state_dict(self, *args, **kwargs): if self._pipeline_name_mapping is None: self._set_pipeline_name_mapping() - if "qwen3_vl" in getattr(self.config, "model_type", ""): - name_prefix = "model.language_model." + if self.modal is not None: + name_prefix = f"model.{self.modal}." else: name_prefix = "" From ec39f333bd82fc6410c4e4be825db54f51cea269 Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Wed, 4 Mar 2026 18:46:23 +0800 Subject: [PATCH 3/9] finish qwen3_vl model --- src/paddlefleet/models/backends.py | 15 ++++++++-- src/paddlefleet/models/gpt/gpt_layer_specs.py | 16 +++++++--- .../models/qwen3_vl/layer_specs.py | 14 +++++++-- .../models/qwen3_vl/patch_merger.py | 30 +++---------------- .../models/qwen3_vl/qwen3_vl_model.py | 11 +++---- 5 files changed, 46 insertions(+), 40 deletions(-) diff --git a/src/paddlefleet/models/backends.py b/src/paddlefleet/models/backends.py index f06d7fdbd..be05a7ec6 100644 --- a/src/paddlefleet/models/backends.py +++ b/src/paddlefleet/models/backends.py @@ -32,6 +32,8 @@ ) from paddlefleet.transformer.mlp import MLPSublayersSpec +from ..spec_utils import LayerSpec + # from paddlefleet.transformer.moe.experts import GroupedMLP, SequentialMLP # HACK(Guoxia Wang): need remove later @@ -113,14 +115,23 @@ def column_parallel_layer_norm_linear(self) -> type | None: """Which layer for sequential layernorm and linear""" return None - def layer_norm(self, rms_norm: bool = False, for_qk: bool = False) -> type: + def layer_norm( + self, + rms_norm: bool = False, + for_qk: bool = False, + fused: bool = True, + eps: float = 1e-5, + ) -> type: """Which module to use for layer norm""" if rms_norm: # Matching get_gpt_layer_local_spec. # Why does the global need to be updated? 
global LNImpl LNImpl = WrappedPaddleNorm - return LNImpl + return LayerSpec( + layer=LNImpl, + extra_kwargs={"eps": eps}, + ) def core_attention(self) -> type: """Which layer to use for attention""" diff --git a/src/paddlefleet/models/gpt/gpt_layer_specs.py b/src/paddlefleet/models/gpt/gpt_layer_specs.py index 9abbe8b20..015b4ed58 100644 --- a/src/paddlefleet/models/gpt/gpt_layer_specs.py +++ b/src/paddlefleet/models/gpt/gpt_layer_specs.py @@ -93,11 +93,19 @@ def get_gpt_layer_local_spec( backend = LocalSpecProvider() # Adjust for RMS norm. if normalization == "RMSNorm": - layer_norm = backend.layer_norm(rms_norm=True, for_qk=False) - qk_norm = backend.layer_norm(rms_norm=True, for_qk=True) + layer_norm = backend.layer_norm( + rms_norm=True, for_qk=False, eps=config.rms_norm_eps + ) + qk_norm = backend.layer_norm( + rms_norm=True, for_qk=True, eps=config.rms_norm_eps + ) else: - layer_norm = backend.layer_norm(rms_norm=False, for_qk=False) - qk_norm = backend.layer_norm(rms_norm=False, for_qk=True) + layer_norm = backend.layer_norm( + rms_norm=False, for_qk=False, eps=config.rms_norm_eps + ) + qk_norm = backend.layer_norm( + rms_norm=False, for_qk=True, eps=config.rms_norm_eps + ) mlp = get_mlp_layer_spec_for_backend( backend=backend, diff --git a/src/paddlefleet/models/qwen3_vl/layer_specs.py b/src/paddlefleet/models/qwen3_vl/layer_specs.py index 7c3b60eea..2beb4d34d 100644 --- a/src/paddlefleet/models/qwen3_vl/layer_specs.py +++ b/src/paddlefleet/models/qwen3_vl/layer_specs.py @@ -42,8 +42,12 @@ def get_qwen3_vl_vision_layer_local_spec( append_deepstack: bool = False, ) -> LayerSpec: backend = LocalSpecProvider() - layer_norm = backend.layer_norm(rms_norm=False, for_qk=False, fused=False) - qk_norm = backend.layer_norm(rms_norm=False, for_qk=True, fused=False) + layer_norm = backend.layer_norm( + rms_norm=False, for_qk=False, fused=False, eps=config.rms_norm_eps + ) + qk_norm = backend.layer_norm( + rms_norm=False, for_qk=True, fused=False, 
eps=config.rms_norm_eps + ) mlp = get_mlp_layer_spec_for_backend( backend=backend, ) @@ -55,6 +59,7 @@ def get_qwen3_vl_vision_layer_local_spec( rms_norm=(config.normalization == "RMSNorm"), for_qk=False, fused=False, + eps=1e-6, ) ), extra_kwargs={"config": config, "use_postshuffle_norm": True}, @@ -139,7 +144,10 @@ def get_qwen3_vl_vision_spec( ) ) merger_norm = backend.layer_norm( - rms_norm=(config.normalization == "RMSNorm"), for_qk=False + rms_norm=(config.normalization == "RMSNorm"), + for_qk=False, + fused=False, + eps=1e-6, ) merger_spec = LayerSpec( layer=Qwen3VLVisionPathMerger, diff --git a/src/paddlefleet/models/qwen3_vl/patch_merger.py b/src/paddlefleet/models/qwen3_vl/patch_merger.py index 200c478ea..82e8e67e2 100644 --- a/src/paddlefleet/models/qwen3_vl/patch_merger.py +++ b/src/paddlefleet/models/qwen3_vl/patch_merger.py @@ -16,7 +16,6 @@ from paddle import nn from ...spec_utils import LayerSpec, build_layer -from ...tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from ...transformer.identity_op import IdentityOp @@ -47,32 +46,12 @@ def __init__( ) self.use_postshuffle_norm = use_postshuffle_norm - self.linear_fc1 = build_layer( - ColumnParallelLinear, - config=config, - input_size=self.hidden_size, - output_size=self.hidden_size, - init_method=config.init_method, - bias=config.use_bias, - skip_bias_add=False, - gather_output=False, - is_expert=False, - ) + self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size) self.act_fn = nn.GELU() - self.linear_fc2 = build_layer( - RowParallelLinear, - config=config, - input_size=self.hidden_size, - output_size=dim, - init_method=config.output_layer_init_method, - input_is_parallel=True, - bias=config.use_bias, - skip_bias_add=False, - is_expert=False, - ) + self.linear_fc2 = nn.Linear(self.hidden_size, dim) def forward(self, dict_args): - x = dict_args["hidden_states"] + x = dict_args.pop("hidden_states") if self.use_postshuffle_norm: x = self.norm(x.reshape([-1, 
self.hidden_size])) x = x.reshape([-1, self.hidden_size]) @@ -80,8 +59,7 @@ def forward(self, dict_args): x = self.norm(x) x = x.reshape([-1, self.hidden_size]) - x, _ = self.linear_fc1(x) - x, _ = self.linear_fc2(self.act_fn(x)) + x = self.linear_fc2(self.act_fn(self.linear_fc1(x))) rst = {"hidden_states": x} rst = {**dict_args, **rst} return rst diff --git a/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py b/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py index 9bfd26f1f..21c06079a 100644 --- a/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py +++ b/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py @@ -118,6 +118,7 @@ def forward( # runners in the cuda graph manager dict_args.pop("dynamic_inference_decode_only", None) dict_args.pop("position_ids", None) + deepstack_features_list = dict_args.pop("deepstack_features_list", None) if self.full_recompute: hidden_states = dict_args["hidden_states"] attention_mask = dict_args.get("attention_mask", None) @@ -189,10 +190,11 @@ def forward( rst = {"hidden_states": hidden_states} if context is not None: rst["context"] = context - if "deepstack_feature_lists" not in dict_args: - dict_args["deepstack_feature_lists"] = [] + if deepstack_features_list is None: + deepstack_features_list = [] if deepstack_feature is not None: - dict_args["deepstack_feature_lists"].append(deepstack_feature) + deepstack_features_list.append(deepstack_feature) + rst["deepstack_features_list"] = deepstack_features_list rst = {**dict_args, **rst} return rst @@ -468,7 +470,6 @@ class Qwen3VLModelDist(FleetLayer): def __init__( self, config: TransformerConfig, - tokenizer=None, pre_process: bool = True, post_process: bool = True, add_encoder: bool = True, @@ -728,7 +729,7 @@ def get_image_features( vision_output = self.vision_model(dict_args) image_embeds, deepstack_image_embeds = ( vision_output["hidden_states"], - vision_output["deepstack_feature_lists"], + vision_output["deepstack_features_list"], ) split_sizes = ( image_grid_thw.prod(-1) From 
b3e9fa772a922560a60532b584f744a852675644 Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Wed, 4 Mar 2026 19:49:07 +0800 Subject: [PATCH 4/9] fix CI case --- src/paddlefleet/models/gpt/gpt_layer_specs.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/paddlefleet/models/gpt/gpt_layer_specs.py b/src/paddlefleet/models/gpt/gpt_layer_specs.py index 015b4ed58..a51ec0323 100644 --- a/src/paddlefleet/models/gpt/gpt_layer_specs.py +++ b/src/paddlefleet/models/gpt/gpt_layer_specs.py @@ -92,20 +92,17 @@ def get_gpt_layer_local_spec( backend = LocalSpecProvider() # Adjust for RMS norm. + norm_eps = config.rms_norm_eps if config is not None else 1e-5 if normalization == "RMSNorm": layer_norm = backend.layer_norm( - rms_norm=True, for_qk=False, eps=config.rms_norm_eps - ) - qk_norm = backend.layer_norm( - rms_norm=True, for_qk=True, eps=config.rms_norm_eps + rms_norm=True, for_qk=False, eps=norm_eps ) + qk_norm = backend.layer_norm(rms_norm=True, for_qk=True, eps=norm_eps) else: layer_norm = backend.layer_norm( - rms_norm=False, for_qk=False, eps=config.rms_norm_eps - ) - qk_norm = backend.layer_norm( - rms_norm=False, for_qk=True, eps=config.rms_norm_eps + rms_norm=False, for_qk=False, eps=norm_eps ) + qk_norm = backend.layer_norm(rms_norm=False, for_qk=True, eps=norm_eps) mlp = get_mlp_layer_spec_for_backend( backend=backend, From 415c0dd9fe85d1bcac3b47500ea82dade186268c Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Wed, 4 Mar 2026 22:20:02 +0800 Subject: [PATCH 5/9] move something into fleet --- src/paddlefleet/models/qwen3_vl/__init__.py | 8 +- .../models/qwen3_vl/layer_specs.py | 20 +- .../models/qwen3_vl/patch_merger.py | 14 +- .../models/qwen3_vl/qwen3_vl_provider.py | 180 +++++++++++++++++- 4 files changed, 187 insertions(+), 35 deletions(-) diff --git a/src/paddlefleet/models/qwen3_vl/__init__.py b/src/paddlefleet/models/qwen3_vl/__init__.py index 291619e4e..b3569f6c5 100644 --- a/src/paddlefleet/models/qwen3_vl/__init__.py +++ 
b/src/paddlefleet/models/qwen3_vl/__init__.py @@ -18,7 +18,11 @@ Qwen3VLVisionModel, Qwen3VLVisionTransformerLayer, ) -from .qwen3_vl_provider import Qwen3VLVisionProvider +from .qwen3_vl_provider import ( + Qwen3VLProvider, + Qwen3VLTextProvider, + Qwen3VLVisionProvider, +) __all__ = [ "Qwen3VLTextEmbedding", @@ -27,4 +31,6 @@ "Qwen3VLVisionModel", "Qwen3VLVisionTransformerLayer", "Qwen3VLVisionProvider", + "Qwen3VLProvider", + "Qwen3VLTextProvider", ] diff --git a/src/paddlefleet/models/qwen3_vl/layer_specs.py b/src/paddlefleet/models/qwen3_vl/layer_specs.py index 2beb4d34d..fbd10090a 100644 --- a/src/paddlefleet/models/qwen3_vl/layer_specs.py +++ b/src/paddlefleet/models/qwen3_vl/layer_specs.py @@ -26,7 +26,7 @@ VisionEmbeddingSpec, VisionRotaryEmbedding, ) -from .patch_merger import Qwen3VLVisionPatchMergerSpec, Qwen3VLVisionPathMerger +from .patch_merger import Qwen3VLVisionPathMerger from .qwen3_vl_model import ( Qwen3VLVisionModel, Qwen3VLVisionSublayersSpec, @@ -54,14 +54,6 @@ def get_qwen3_vl_vision_layer_local_spec( transformer_cls = Qwen3VLVisionTransformerLayer merger_spec = LayerSpec( layer=Qwen3VLVisionPathMerger, - sublayers_spec=Qwen3VLVisionPatchMergerSpec( - backend.layer_norm( - rms_norm=(config.normalization == "RMSNorm"), - for_qk=False, - fused=False, - eps=1e-6, - ) - ), extra_kwargs={"config": config, "use_postshuffle_norm": True}, ) return LayerSpec( @@ -131,7 +123,6 @@ def get_qwen3_vl_vision_spec( tail_empty_layer_spec: list[LayerSpec] | None = None, rotary_base: int = 10000.0, ): - backend = LocalSpecProvider() embedding_extra_kwargs = {"config": config} rotary_emb_extra_kwargs = { "dim": config.head_dim // 2, @@ -143,17 +134,8 @@ def get_qwen3_vl_vision_spec( extra_kwargs=rotary_emb_extra_kwargs, ) ) - merger_norm = backend.layer_norm( - rms_norm=(config.normalization == "RMSNorm"), - for_qk=False, - fused=False, - eps=1e-6, - ) merger_spec = LayerSpec( layer=Qwen3VLVisionPathMerger, - sublayers_spec=Qwen3VLVisionPatchMergerSpec( - 
norm=merger_norm, - ), extra_kwargs={ "config": config, "dim": config.out_hidden_size, diff --git a/src/paddlefleet/models/qwen3_vl/patch_merger.py b/src/paddlefleet/models/qwen3_vl/patch_merger.py index 82e8e67e2..27c3fe9fa 100644 --- a/src/paddlefleet/models/qwen3_vl/patch_merger.py +++ b/src/paddlefleet/models/qwen3_vl/patch_merger.py @@ -11,24 +11,14 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass from paddle import nn -from ...spec_utils import LayerSpec, build_layer -from ...transformer.identity_op import IdentityOp - - -@dataclass -class Qwen3VLVisionPatchMergerSpec: - norm: LayerSpec = IdentityOp - class Qwen3VLVisionPathMerger(nn.Module): def __init__( self, config, - sublayers_spec: Qwen3VLVisionPatchMergerSpec, dim: int | None = None, context_dim: int | None = None, use_postshuffle_norm: bool = False, @@ -41,9 +31,7 @@ def __init__( self.hidden_size = context_dim * (config.spatial_merge_size**2) norm_dim = self.hidden_size if use_postshuffle_norm else context_dim - self.norm = build_layer( - sublayers_spec.norm, config=config, hidden_size=norm_dim - ) + self.norm = nn.LayerNorm(norm_dim, epsilon=1e-6) self.use_postshuffle_norm = use_postshuffle_norm self.linear_fc1 = nn.Linear(self.hidden_size, self.hidden_size) diff --git a/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py b/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py index 7400fc804..8d453d738 100644 --- a/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py +++ b/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py @@ -18,11 +18,20 @@ import paddle from paddle.nn import functional as F +from paddleformers.transformers.gpt_provider import GPTModelProvider + +from paddlefleet import parallel_state from ...spec_utils import LayerSpec from ...transformer import TransformerConfig +from .embedding import Qwen3VLTextEmbedding from 
.qwen3_vl_builders import qwen3_vl_vision_builder -from .qwen3_vl_model import Qwen3VLVisionModel, Qwen3VLVisionTransformerLayer +from .qwen3_vl_model import ( + Qwen3VLModelDist, + Qwen3VLTextTransformerLayer, + Qwen3VLVisionModel, + Qwen3VLVisionTransformerLayer, +) @dataclass @@ -102,4 +111,171 @@ def provide(self) -> "Qwen3VLVisionModel": return model -__all__ = ["Qwen3VLVisionProvider"] +@dataclass +class Qwen3VLTextProvider(GPTModelProvider): + """ + Base config for Qwen3 Models. + """ + + normalization: str = "RMSNorm" + activation_func: Callable = F.silu + gated_linear_unit: bool = True + use_bias: bool = False + add_qkv_bias: bool = True + seq_length: int = 4096 + init_method_std: int = 0.02 + hidden_dropout_prob: float = 0.0 + attention_dropout: float = 0.0 + vocab_size: int = 151936 + share_embeddings_and_output_weights: bool | None = False + rms_norm_eps: float = 1e-6 + rotary_base: float = 1000000.0 + position_embedding_type: str = "rope" + use_qk_norm: bool = True + specific_embedding: type = Qwen3VLTextEmbedding + specific_transformer_layer: type = Qwen3VLTextTransformerLayer + max_sequence_length: int = 262144 + multimodal_embedding: bool = False + _save_to_hf: bool = False + use_fused_linear_cross_entropy: bool = True + high_precision_rope: bool = True + moe_grouped_gemm: bool = True + + n_shared_experts: int = 0 + transform_rules = { + "dtype": "params_dtype", + "num_heads": "num_attention_heads", + "depth": "num_hidden_layers", + "initializer_range": "init_method_std", + "num_experts": "n_routed_experts", + } + + def __post_init__(self): + super().__post_init__() + self.mrope_section = self.rope_scaling.get( + "mrope_section", [24, 20, 20] + ) + + +@dataclass +class Qwen3VLProvider(TransformerConfig): + text_config: Qwen3VLTextProvider | None = None + vision_config: Qwen3VLVisionProvider | None = None + + drop_vision_class_token: bool = False + vision_feature_layer: int = -2 + + encoder_pipeline_model_parallel_size: int = 0 + 
encoder_tensor_model_parallel_size: int = 1 + + seq_length: int = 1024 + + language_model_from_pretrained: str | None = None + vision_model_from_pretrained: str | None = None + + def provide( + self, tokenizer=None, vp_stage: int | None = None + ) -> "Qwen3VLModelDist": + self.text_config.scatter_embedding_sequence_parallel = False + self.text_config.tensor_model_parallel_size = ( + self.tensor_model_parallel_size + ) + self.text_config.sequence_parallel = self.sequence_parallel + self.text_config.context_parallel_size = self.context_parallel_size + self.vision_config.tensor_model_parallel_size = ( + self.tensor_model_parallel_size + ) + # self.vision_projection_config.tensor_model_parallel_size = self.tensor_model_parallel_size + self.text_config.pipeline_model_parallel_size = ( + self.pipeline_model_parallel_size + ) + + if self.encoder_pipeline_model_parallel_size > 0: + assert self.encoder_pipeline_model_parallel_size == 1, ( + "ViT can only live on 1 pipeline stage." + ) + self.vision_config.pipeline_model_parallel_size = ( + self.encoder_pipeline_model_parallel_size + ) + # self.vision_projection_config.pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size + self.text_config.encoder_pipeline_model_parallel_size = ( + self.encoder_pipeline_model_parallel_size + ) + if self.encoder_tensor_model_parallel_size > 0: + self.vision_config.tensor_model_parallel_size = ( + self.encoder_tensor_model_parallel_size + ) + # self.vision_projection_config.tensor_model_parallel_size = self.encoder_tensor_model_parallel_size + + config_attrs = [ + "cross_entropy_loss_fusion", + "gradient_accumulation_fusion", + "bias_activation_fusion", + "bias_dropout_fusion", + "masked_softmax_fusion", + "attention_softmax_in_fp32", + "apply_rope_fusion", + "overlap_p2p_comm", + "batch_p2p_comm", + ] + + for config in [ + self.text_config, + self.vision_config, + # self.vision_projection_config, + ]: + for attr in config_attrs: + setattr(config, attr, getattr(self, attr)) 
+ + self.text_config.tp_comm_overlap = self.tp_comm_overlap + self.vision_config.tp_comm_overlap = False + # self.vision_projection_config.tp_comm_overlap = False + + vp_stage = vp_stage or 0 + + model = Qwen3VLModelDist( + config=self, + tokenizer=tokenizer, + pre_process=parallel_state.is_pipeline_first_stage( + ignore_virtual=False, vp_stage=vp_stage + ) + or parallel_state.get_pipeline_model_parallel_rank() + == self.encoder_pipeline_model_parallel_size, + post_process=parallel_state.is_pipeline_last_stage( + ignore_virtual=False, vp_stage=vp_stage + ), + add_encoder=parallel_state.is_pipeline_first_stage( + ignore_virtual=False, vp_stage=vp_stage + ), + add_decoder=parallel_state.is_pipeline_last_stage( + ignore_virtual=False, vp_stage=vp_stage + ) + or parallel_state.get_pipeline_model_parallel_rank() + >= self.encoder_pipeline_model_parallel_size, + drop_vision_class_token=self.drop_vision_class_token, + vp_stage=vp_stage, + ) + + return model + + @classmethod + def from_config(cls, config): + res = super().from_config(config) + res.vision_config = Qwen3VLVisionProvider.from_config( + config.vision_config + ) + res.text_config = Qwen3VLTextProvider.from_config(config.text_config) + res.vision_config.normalization = "LayerNorm" + res.vision_config.gated_linear_unit = False + res.text_config.multimodal_embedding = True + res.text_config.position_embedding_type = "mrope" + res.text_config.image_token_id = config.image_token_id + res.text_config.video_token_id = config.video_token_id + return res + + +__all__ = [ + "Qwen3VLVisionProvider", + "Qwen3VLTextProvider", + "Qwen3VLProvider", +] From 06dd54de6817b5278b2c9688f49da9820773b629 Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Mon, 9 Mar 2026 17:41:44 +0800 Subject: [PATCH 6/9] move provider back to paddleformers --- src/paddlefleet/models/qwen3_vl/__init__.py | 8 - .../models/qwen3_vl/qwen3_vl_provider.py | 281 ------------------ 2 files changed, 289 deletions(-) delete mode 100644 
src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py diff --git a/src/paddlefleet/models/qwen3_vl/__init__.py b/src/paddlefleet/models/qwen3_vl/__init__.py index b3569f6c5..736407af7 100644 --- a/src/paddlefleet/models/qwen3_vl/__init__.py +++ b/src/paddlefleet/models/qwen3_vl/__init__.py @@ -18,11 +18,6 @@ Qwen3VLVisionModel, Qwen3VLVisionTransformerLayer, ) -from .qwen3_vl_provider import ( - Qwen3VLProvider, - Qwen3VLTextProvider, - Qwen3VLVisionProvider, -) __all__ = [ "Qwen3VLTextEmbedding", @@ -30,7 +25,4 @@ "Qwen3VLTextTransformerLayer", "Qwen3VLVisionModel", "Qwen3VLVisionTransformerLayer", - "Qwen3VLVisionProvider", - "Qwen3VLProvider", - "Qwen3VLTextProvider", ] diff --git a/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py b/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py deleted file mode 100644 index 8d453d738..000000000 --- a/src/paddlefleet/models/qwen3_vl/qwen3_vl_provider.py +++ /dev/null @@ -1,281 +0,0 @@ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import contextlib -from collections.abc import Callable -from dataclasses import dataclass -from functools import partial - -import paddle -from paddle.nn import functional as F -from paddleformers.transformers.gpt_provider import GPTModelProvider - -from paddlefleet import parallel_state - -from ...spec_utils import LayerSpec -from ...transformer import TransformerConfig -from .embedding import Qwen3VLTextEmbedding -from .qwen3_vl_builders import qwen3_vl_vision_builder -from .qwen3_vl_model import ( - Qwen3VLModelDist, - Qwen3VLTextTransformerLayer, - Qwen3VLVisionModel, - Qwen3VLVisionTransformerLayer, -) - - -@dataclass -class Qwen3VLVisionProvider(TransformerConfig): - patch_size: int = 16 - use_bias: bool = True - add_qkv_bias: bool = True - num_position_embeddings: int = 2304 - embed_dim: int = (1152,) - hidden_size: int = 1152 - out_hidden_size: int = 4096 - in_channels: int = 3 - spatial_merge_size: int = 2 - spatial_patch_size: int = 16 - temporal_patch_size: int = 2 - hidden_dropout_prob: float = 0.0 - attention_dropout: float = 0.0 - intermediate_size: int = 4304 - initializer_range: float = 0.02 - gated_linear_unit: bool = False - activation_func: Callable = F.gelu - layernorm_zero_centered_gamma: bool = False - apply_query_key_layer_scaling: bool = False - persist_layer_norm: bool = True - bias_activation_fusion: bool = False - bias_dropout_fusion: bool = False - attention_softmax_in_fp32: bool = True - normalization: str = "LayerNorm" - apply_rope_fusion: bool = True - rms_norm_eps: float = 1e-6 - transformer_layer_spec: LayerSpec = Qwen3VLVisionTransformerLayer - model_version: str = "qwen3_vl" - img_h: int = 336 - img_w: int = 336 - add_class_token: bool = False - class_token_len: int = 1 - high_precision_rope: bool = True - rotary_percent: float = 1.0 - transform_rules = { - "dtype": "params_dtype", - "num_heads": "num_attention_heads", - "depth": "num_hidden_layers", - "initializer_range": "init_method_std", - } - - def provide(self) -> 
"Qwen3VLVisionModel": - pp_size = self.pipeline_model_parallel_size - - is_pipeline_asymmetric = getattr( - self, "account_for_embedding_in_pipeline_split", False - ) or getattr(self, "account_for_loss_in_pipeline_split", False) - is_pipeline_asymmetric |= ( - getattr(self, "num_empty_layers_add_in_head", None) - or getattr(self, "num_empty_layers_add_in_tail", None) - ) is not None - - # Initialize model as meta data instead of allocating data on a device - model_init_device_context = contextlib.nullcontext - if self.init_model_with_meta_device: - model_init_device_context = partial(paddle.device, device="meta") - - with model_init_device_context(): - fleet_model = qwen3_vl_vision_builder( - self, - seg_method="layer:TransformerLayer|EmptyLayer", - num_stages=pp_size, - ) - model = Qwen3VLVisionModel.__new__(Qwen3VLVisionModel) - - for attr_name in dir(fleet_model): - if not attr_name.startswith("__"): - try: - attr_value = getattr(fleet_model, attr_name) - setattr(model, attr_name, attr_value) - except: - pass - return model - - -@dataclass -class Qwen3VLTextProvider(GPTModelProvider): - """ - Base config for Qwen3 Models. 
- """ - - normalization: str = "RMSNorm" - activation_func: Callable = F.silu - gated_linear_unit: bool = True - use_bias: bool = False - add_qkv_bias: bool = True - seq_length: int = 4096 - init_method_std: int = 0.02 - hidden_dropout_prob: float = 0.0 - attention_dropout: float = 0.0 - vocab_size: int = 151936 - share_embeddings_and_output_weights: bool | None = False - rms_norm_eps: float = 1e-6 - rotary_base: float = 1000000.0 - position_embedding_type: str = "rope" - use_qk_norm: bool = True - specific_embedding: type = Qwen3VLTextEmbedding - specific_transformer_layer: type = Qwen3VLTextTransformerLayer - max_sequence_length: int = 262144 - multimodal_embedding: bool = False - _save_to_hf: bool = False - use_fused_linear_cross_entropy: bool = True - high_precision_rope: bool = True - moe_grouped_gemm: bool = True - - n_shared_experts: int = 0 - transform_rules = { - "dtype": "params_dtype", - "num_heads": "num_attention_heads", - "depth": "num_hidden_layers", - "initializer_range": "init_method_std", - "num_experts": "n_routed_experts", - } - - def __post_init__(self): - super().__post_init__() - self.mrope_section = self.rope_scaling.get( - "mrope_section", [24, 20, 20] - ) - - -@dataclass -class Qwen3VLProvider(TransformerConfig): - text_config: Qwen3VLTextProvider | None = None - vision_config: Qwen3VLVisionProvider | None = None - - drop_vision_class_token: bool = False - vision_feature_layer: int = -2 - - encoder_pipeline_model_parallel_size: int = 0 - encoder_tensor_model_parallel_size: int = 1 - - seq_length: int = 1024 - - language_model_from_pretrained: str | None = None - vision_model_from_pretrained: str | None = None - - def provide( - self, tokenizer=None, vp_stage: int | None = None - ) -> "Qwen3VLModelDist": - self.text_config.scatter_embedding_sequence_parallel = False - self.text_config.tensor_model_parallel_size = ( - self.tensor_model_parallel_size - ) - self.text_config.sequence_parallel = self.sequence_parallel - 
self.text_config.context_parallel_size = self.context_parallel_size - self.vision_config.tensor_model_parallel_size = ( - self.tensor_model_parallel_size - ) - # self.vision_projection_config.tensor_model_parallel_size = self.tensor_model_parallel_size - self.text_config.pipeline_model_parallel_size = ( - self.pipeline_model_parallel_size - ) - - if self.encoder_pipeline_model_parallel_size > 0: - assert self.encoder_pipeline_model_parallel_size == 1, ( - "ViT can only live on 1 pipeline stage." - ) - self.vision_config.pipeline_model_parallel_size = ( - self.encoder_pipeline_model_parallel_size - ) - # self.vision_projection_config.pipeline_model_parallel_size = self.encoder_pipeline_model_parallel_size - self.text_config.encoder_pipeline_model_parallel_size = ( - self.encoder_pipeline_model_parallel_size - ) - if self.encoder_tensor_model_parallel_size > 0: - self.vision_config.tensor_model_parallel_size = ( - self.encoder_tensor_model_parallel_size - ) - # self.vision_projection_config.tensor_model_parallel_size = self.encoder_tensor_model_parallel_size - - config_attrs = [ - "cross_entropy_loss_fusion", - "gradient_accumulation_fusion", - "bias_activation_fusion", - "bias_dropout_fusion", - "masked_softmax_fusion", - "attention_softmax_in_fp32", - "apply_rope_fusion", - "overlap_p2p_comm", - "batch_p2p_comm", - ] - - for config in [ - self.text_config, - self.vision_config, - # self.vision_projection_config, - ]: - for attr in config_attrs: - setattr(config, attr, getattr(self, attr)) - - self.text_config.tp_comm_overlap = self.tp_comm_overlap - self.vision_config.tp_comm_overlap = False - # self.vision_projection_config.tp_comm_overlap = False - - vp_stage = vp_stage or 0 - - model = Qwen3VLModelDist( - config=self, - tokenizer=tokenizer, - pre_process=parallel_state.is_pipeline_first_stage( - ignore_virtual=False, vp_stage=vp_stage - ) - or parallel_state.get_pipeline_model_parallel_rank() - == self.encoder_pipeline_model_parallel_size, - 
post_process=parallel_state.is_pipeline_last_stage( - ignore_virtual=False, vp_stage=vp_stage - ), - add_encoder=parallel_state.is_pipeline_first_stage( - ignore_virtual=False, vp_stage=vp_stage - ), - add_decoder=parallel_state.is_pipeline_last_stage( - ignore_virtual=False, vp_stage=vp_stage - ) - or parallel_state.get_pipeline_model_parallel_rank() - >= self.encoder_pipeline_model_parallel_size, - drop_vision_class_token=self.drop_vision_class_token, - vp_stage=vp_stage, - ) - - return model - - @classmethod - def from_config(cls, config): - res = super().from_config(config) - res.vision_config = Qwen3VLVisionProvider.from_config( - config.vision_config - ) - res.text_config = Qwen3VLTextProvider.from_config(config.text_config) - res.vision_config.normalization = "LayerNorm" - res.vision_config.gated_linear_unit = False - res.text_config.multimodal_embedding = True - res.text_config.position_embedding_type = "mrope" - res.text_config.image_token_id = config.image_token_id - res.text_config.video_token_id = config.video_token_id - return res - - -__all__ = [ - "Qwen3VLVisionProvider", - "Qwen3VLTextProvider", - "Qwen3VLProvider", -] From 4aba63c564cf4470db19322549fa66bcf8c06d95 Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Wed, 11 Mar 2026 10:44:35 +0800 Subject: [PATCH 7/9] sync main develop embeddings --- src/paddlefleet/models/gpt/gpt_embedding.py | 43 ---------- src/paddlefleet/models/qwen3_vl/embedding.py | 84 +++++++++++++++++--- 2 files changed, 73 insertions(+), 54 deletions(-) diff --git a/src/paddlefleet/models/gpt/gpt_embedding.py b/src/paddlefleet/models/gpt/gpt_embedding.py index 377cc96d0..4e8f4196d 100644 --- a/src/paddlefleet/models/gpt/gpt_embedding.py +++ b/src/paddlefleet/models/gpt/gpt_embedding.py @@ -205,8 +205,6 @@ def forward( image_embeds.astype(decoder_input.dtype).reshape([-1]), ) # scatter bwd is a simple gather — no sparse atomics decoder_input = image_src_flat.reshape(decoder_input.shape) - visual_pos_masks = image_mask[..., 0] - 
deepstack_visual_embeds = deepstack_image_embeds if video_embeds is not None: _, video_mask = self.get_placeholder_mask( @@ -227,48 +225,7 @@ def forward( video_embeds.astype(decoder_input.dtype).reshape([-1]), ) decoder_input = video_src_flat.reshape(decoder_input.shape) - visual_pos_masks = video_mask[..., 0] - deepstack_visual_embeds = deepstack_video_embeds - if image_embeds is not None and video_embeds is not None: - image_mask = image_mask[..., 0] # [B, S] bool - video_mask = video_mask[..., 0] # [B, S] bool - visual_pos_masks = image_mask | video_mask - deepstack_visual_embeds = [] - for img_embed, vid_embed in zip( - deepstack_image_embeds, deepstack_video_embeds - ): - # Build embed_joint [N_visual, H] without boolean-index - # scatter. Use dense mask arithmetic instead. - # img_embed : [N_img, H] - # vid_embed : [N_vid, H] - # visual_pos_masks: [B, S] bool, N_visual True entries - # img_mask_in_visual[i] = True iff visual position i is image - # Computed as: image_mask flattened, keep only visual positions, - # expressed as a dense [N_visual] float mask — no indexing. 
- h = img_embed.shape[-1] - n_visual = int(visual_pos_masks.sum()) - # visual_pos_flat: [B*S] bool - visual_pos_flat = visual_pos_masks.reshape([-1]) - image_mask_flat = image_mask.reshape([-1]) # [B*S] bool - video_mask_flat = video_mask.reshape([-1]) # [B*S] bool - # Dense [B*S] float masks, then compress to [N_visual] via - # paddle.masked_select (forward: gather, backward: scatter_add - # — but scalar backward is efficient, no sparse atomics) - img_mask_in_vis_f = paddle.masked_select( - image_mask_flat.astype(img_embed.dtype), - visual_pos_flat, - ).unsqueeze(-1) # [N_visual, 1] - vid_mask_in_vis_f = paddle.masked_select( - video_mask_flat.astype(vid_embed.dtype), - visual_pos_flat, - ).unsqueeze(-1) # [N_visual, 1] - embed_joint = ( - img_embed.reshape([n_visual, h]) * img_mask_in_vis_f - + vid_embed.reshape([n_visual, h]) - * vid_mask_in_vis_f - ) - deepstack_visual_embeds.append(embed_joint) # Rotary positional embeddings (embedding is None for PP intermediate devices) rotary_pos_emb = None rotary_pos_cos = None diff --git a/src/paddlefleet/models/qwen3_vl/embedding.py b/src/paddlefleet/models/qwen3_vl/embedding.py index 14c87020b..5e1bfae44 100644 --- a/src/paddlefleet/models/qwen3_vl/embedding.py +++ b/src/paddlefleet/models/qwen3_vl/embedding.py @@ -365,9 +365,38 @@ def forward( inputs_embeds=decoder_input, image_features=image_embeds, ) - decoder_input = decoder_input.masked_scatter( - image_mask, image_embeds.astype(decoder_input.dtype) + # Replace masked_scatter with arithmetic blend to avoid + # IndexingBackwardKernel (sparse scatter) in the backward pass. + # image_mask : [B, S, H] bool + # image_embeds: [N_img, H] (N_img = number of image tokens) + # Expand image_embeds into the full [B, S, H] space by: + # 1. flatten decoder_input and image_mask to 1-D + # 2. use paddle.scatter (dense backward = gather) to place + # image_embeds values at the True positions + # 3. 
blend with original decoder_input via mask arithmetic + # + # Optimization: reuse decoder_input's flattened buffer as the + # scatter base (scaled by (1-mask)) to avoid a separate + # paddle.zeros([n_total]) allocation (~192 MB bf16 tensor). + image_mask_f = image_mask.astype( + decoder_input.dtype + ) # [B,S,H] float + flat_indices = paddle.nonzero( + image_mask.reshape([-1]) + ).squeeze( + -1 + ) # [N_img*H] int64 — dense nonzero, no scatter bwd + # Scale the base tensor by (1 - mask) in-place before scatter + # so that visual positions are zero — no extra zeros allocation. + base_flat = (decoder_input * (1.0 - image_mask_f)).reshape( + [-1] ) + image_src_flat = paddle.scatter( + base_flat, + flat_indices, + image_embeds.astype(decoder_input.dtype).reshape([-1]), + ) # scatter bwd is a simple gather — no sparse atomics + decoder_input = image_src_flat.reshape(decoder_input.shape) visual_pos_masks = image_mask[..., 0] deepstack_visual_embeds = deepstack_image_embeds @@ -377,9 +406,19 @@ def forward( inputs_embeds=decoder_input, video_features=video_embeds, ) - decoder_input = decoder_input.masked_scatter( - video_mask, video_embeds.astype(decoder_input.dtype) + video_mask_f = video_mask.astype(decoder_input.dtype) + flat_indices = paddle.nonzero( + video_mask.reshape([-1]) + ).squeeze(-1) + base_flat = (decoder_input * (1.0 - video_mask_f)).reshape( + [-1] ) + video_src_flat = paddle.scatter( + base_flat, + flat_indices, + video_embeds.astype(decoder_input.dtype).reshape([-1]), + ) + decoder_input = video_src_flat.reshape(decoder_input.shape) visual_pos_masks = video_mask[..., 0] deepstack_visual_embeds = deepstack_video_embeds @@ -388,16 +427,39 @@ def forward( video_mask = video_mask[..., 0] visual_pos_masks = image_mask | video_mask deepstack_visual_embeds = [] - image_mask_joint = image_mask[visual_pos_masks] - video_mask_joint = video_mask[visual_pos_masks] for img_embed, vid_embed in zip( deepstack_image_embeds, deepstack_video_embeds ): - embed_joint = 
img_embed.new_zeros( - visual_pos_masks.sum(), img_embed.shape[-1] - ).to(img_embed.device) - embed_joint[image_mask_joint, :] = img_embed - embed_joint[video_mask_joint, :] = vid_embed + # Build embed_joint [N_visual, H] without boolean-index + # scatter. Use dense mask arithmetic instead. + # img_embed : [N_img, H] + # vid_embed : [N_vid, H] + # visual_pos_masks: [B, S] bool, N_visual True entries + # img_mask_in_visual[i] = True iff visual position i is image + # Computed as: image_mask flattened, keep only visual positions, + # expressed as a dense [N_visual] float mask — no indexing. + h = img_embed.shape[-1] + n_visual = int(visual_pos_masks.sum()) + # visual_pos_flat: [B*S] bool + visual_pos_flat = visual_pos_masks.reshape([-1]) + image_mask_flat = image_mask.reshape([-1]) # [B*S] bool + video_mask_flat = video_mask.reshape([-1]) # [B*S] bool + # Dense [B*S] float masks, then compress to [N_visual] via + # paddle.masked_select (forward: gather, backward: scatter_add + # — but scalar backward is efficient, no sparse atomics) + img_mask_in_vis_f = paddle.masked_select( + image_mask_flat.astype(img_embed.dtype), + visual_pos_flat, + ).unsqueeze(-1) # [N_visual, 1] + vid_mask_in_vis_f = paddle.masked_select( + video_mask_flat.astype(vid_embed.dtype), + visual_pos_flat, + ).unsqueeze(-1) # [N_visual, 1] + embed_joint = ( + img_embed.reshape([n_visual, h]) * img_mask_in_vis_f + + vid_embed.reshape([n_visual, h]) + * vid_mask_in_vis_f + ) deepstack_visual_embeds.append(embed_joint) # Rotary positional embeddings (embedding is None for PP intermediate devices) rotary_pos_emb = None From 3ccac61a1da6de7b0e719135ada0c7b18f9f7d0f Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Thu, 2 Apr 2026 19:02:59 +0800 Subject: [PATCH 8/9] fix some bug caused by conflict --- src/paddlefleet/models/qwen3_vl/embedding.py | 9 ++++++++- src/paddlefleet/models/qwen3_vl/layer_specs.py | 13 ++++++++++++- src/paddlefleet/models/qwen3_vl/patch_merger.py | 11 +++++++---- 3 files changed, 
27 insertions(+), 6 deletions(-) diff --git a/src/paddlefleet/models/qwen3_vl/embedding.py b/src/paddlefleet/models/qwen3_vl/embedding.py index faa488033..ad8074cfc 100644 --- a/src/paddlefleet/models/qwen3_vl/embedding.py +++ b/src/paddlefleet/models/qwen3_vl/embedding.py @@ -267,7 +267,14 @@ def forward(self, dict_args: dict) -> paddle.Tensor: grid_thw = dict_args["grid_thw"] # Pathed embedding - hidden_states = self.patch_embed(pixel_values).view(-1, self.embed_dim) + hidden_states = pixel_values.reshape( + -1, + self.in_channels, + self.temporal_patch_size, + self.patch_size, + self.patch_size, + ) + hidden_states = self.patch_embed(hidden_states).view(-1, self.embed_dim) # Share token-to-image mapping to avoid redundant computation image_id, frame_local_idx, total_tokens, max_hw = ( diff --git a/src/paddlefleet/models/qwen3_vl/layer_specs.py b/src/paddlefleet/models/qwen3_vl/layer_specs.py index fbd10090a..dfd210da4 100644 --- a/src/paddlefleet/models/qwen3_vl/layer_specs.py +++ b/src/paddlefleet/models/qwen3_vl/layer_specs.py @@ -26,7 +26,7 @@ VisionEmbeddingSpec, VisionRotaryEmbedding, ) -from .patch_merger import Qwen3VLVisionPathMerger +from .patch_merger import Qwen3VLVisionPatchMergerSpec, Qwen3VLVisionPathMerger from .qwen3_vl_model import ( Qwen3VLVisionModel, Qwen3VLVisionSublayersSpec, @@ -54,6 +54,11 @@ def get_qwen3_vl_vision_layer_local_spec( transformer_cls = Qwen3VLVisionTransformerLayer merger_spec = LayerSpec( layer=Qwen3VLVisionPathMerger, + sublayers_spec=Qwen3VLVisionPatchMergerSpec( + norm=backend.layer_norm( + rms_norm=False, for_qk=False, fused=False, eps=1e-6 + ) + ), extra_kwargs={"config": config, "use_postshuffle_norm": True}, ) return LayerSpec( @@ -123,6 +128,7 @@ def get_qwen3_vl_vision_spec( tail_empty_layer_spec: list[LayerSpec] | None = None, rotary_base: int = 10000.0, ): + backend = LocalSpecProvider() embedding_extra_kwargs = {"config": config} rotary_emb_extra_kwargs = { "dim": config.head_dim // 2, @@ -136,6 +142,11 @@ def 
get_qwen3_vl_vision_spec( ) merger_spec = LayerSpec( layer=Qwen3VLVisionPathMerger, + sublayers_spec=Qwen3VLVisionPatchMergerSpec( + norm=backend.layer_norm( + rms_norm=False, for_qk=False, fused=False, eps=1e-6 + ) + ), extra_kwargs={ "config": config, "dim": config.out_hidden_size, diff --git a/src/paddlefleet/models/qwen3_vl/patch_merger.py b/src/paddlefleet/models/qwen3_vl/patch_merger.py index d3dea3922..93b442210 100644 --- a/src/paddlefleet/models/qwen3_vl/patch_merger.py +++ b/src/paddlefleet/models/qwen3_vl/patch_merger.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from collections import OrderedDict from dataclasses import dataclass from paddle import nn @@ -65,9 +66,8 @@ def __init__( ) ) - def forward(self, x): - if isinstance(x, dict): - x = x["hidden_states"].squeeze(0) + def forward(self, dict_args): + x = dict_args["hidden_states"].squeeze(0) if self.use_postshuffle_norm: x = self.norm(x.reshape([-1, self.hidden_size])) x = x.reshape([-1, self.hidden_size]) @@ -78,4 +78,7 @@ def forward(self, x): x, output_bias = self.mlp(x) if output_bias is not None: x += output_bias - return x, None + rst = OrderedDict() + rst = {"hidden_states": x} + rst = {**dict_args, **rst} + return rst From 0a6204225ed46db88addb4e24fb56ae5c14c606c Mon Sep 17 00:00:00 2001 From: Severus Qin Date: Fri, 3 Apr 2026 17:00:23 +0800 Subject: [PATCH 9/9] merge SP in text model. 
--- .../models/qwen3_vl/qwen3_vl_model.py | 124 +++++++++++++----- 1 file changed, 89 insertions(+), 35 deletions(-) diff --git a/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py b/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py index 63b68f30e..df2ca841e 100644 --- a/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py +++ b/src/paddlefleet/models/qwen3_vl/qwen3_vl_model.py @@ -381,7 +381,19 @@ def _deepstack_process( visual_pos_masks: paddle.Tensor, visual_embeds: paddle.Tensor, ): - # Store original shape and flatten hidden_states to 2D [B*S, D] + # SP layout is [S/tp, B, H] (seq-first); transpose to [B, S/tp, H] so that + # flatten(0,1) produces batch-first [B*S/tp, H], consistent with visual_pos_masks [B, S]. + _sp_transposed = False + if ( + getattr(self.config, "sequence_parallel", False) + and hidden_states.ndim == 3 + ): + hidden_states = hidden_states.transpose( + [1, 0, 2] + ) # [S/tp,B,H] -> [B,S/tp,H] + _sp_transposed = True + # Save original_shape AFTER the SP transpose so that reshape restores the + # batch-first [B, S/tp, H] form (needed for the final back-transpose). original_shape = hidden_states.shape if hidden_states.ndim > 2: hidden_states = hidden_states.flatten(start_axis=0, stop_axis=1) @@ -390,12 +402,15 @@ def _deepstack_process( hidden_states.device, hidden_states.dtype ) - # complicated logic for sequential parallelism - if visual_pos_masks.ndim > 1: - visual_pos_masks = visual_pos_masks.flatten() - - # This block handles Sequence Parallelism (Row Slicing) - if visual_pos_masks.shape[0] > hidden_states.shape[0]: + # Sequence Parallelism (SP) row slicing. + # visual_pos_masks is [B, S] (full sequence), hidden_states is [B*S/tp, H] + # (batch-major after transpose+flatten). We must slice along the S dimension + # (dim=1) to match the batch-major layout, NOT flatten-then-chunk which + # breaks when B > 1. 
+ if visual_pos_masks.ndim > 1 and visual_pos_masks.shape[ + 1 + ] > hidden_states.shape[0] // max(visual_pos_masks.shape[0], 1): + # visual_pos_masks: [B, S], hidden_states: [B*S/tp, H] try: from paddle.distributed.fleet import ( get_hybrid_communicate_group, @@ -405,28 +420,60 @@ def _deepstack_process( mp_rank = hcg.get_model_parallel_rank() mp_size = hcg.get_model_parallel_world_size() except (ImportError, AttributeError): - mp_size = visual_pos_masks.shape[0] // hidden_states.shape[0] + batch_size = visual_pos_masks.shape[0] + full_seq_len = visual_pos_masks.shape[1] + mp_size = (batch_size * full_seq_len) // hidden_states.shape[0] mp_rank = paddle.distributed.get_rank() % mp_size - total_len = visual_pos_masks.shape[0] - chunk_size = total_len // mp_size - start_idx = mp_rank * chunk_size - end_idx = start_idx + chunk_size - if start_idx > 0: - pre_mask = visual_pos_masks[:start_idx] - visual_offset = paddle.sum( - paddle.cast(pre_mask, "int32") - ).item() + + full_seq_len = visual_pos_masks.shape[1] + chunk_s = full_seq_len // mp_size + start_s = mp_rank * chunk_s + + # Slice along S dimension: [B, S] -> [B, S/tp] + local_mask = visual_pos_masks[:, start_s : start_s + chunk_s] + batch_size = visual_pos_masks.shape[0] + + # Gather per-sample visual_embeds. + # visual_embeds is ordered as [sample0_all_vis, sample1_all_vis, ...]. + # Each rank only needs the visual tokens that fall within its local + # sequence chunk [start_s, start_s+chunk_s) for each sample. 
+ per_sample_total = paddle.cast(visual_pos_masks, "int32").sum( + axis=1 + ) # [B] + per_sample_pre = ( + paddle.cast(visual_pos_masks[:, :start_s], "int32").sum(axis=1) + if start_s > 0 + else paddle.zeros([batch_size], dtype="int32") + ) # [B] + per_sample_local = paddle.cast(local_mask, "int32").sum( + axis=1 + ) # [B] + + gather_indices = [] + cumulative_total = 0 + for i in range(batch_size): + total_i = int(per_sample_total[i].item()) + pre_i = int(per_sample_pre[i].item()) + count_i = int(per_sample_local[i].item()) + if count_i > 0: + gather_indices.append( + paddle.arange( + cumulative_total + pre_i, + cumulative_total + pre_i + count_i, + ) + ) + cumulative_total += total_i + + if gather_indices: + gather_indices = paddle.concat(gather_indices) + visual_embeds = visual_embeds[gather_indices] else: - visual_offset = 0 - local_mask = visual_pos_masks[start_idx:end_idx] - local_visual_count = paddle.sum( - paddle.cast(local_mask, "int32") - ).item() + visual_embeds = visual_embeds[:0] # empty - visual_embeds = visual_embeds[ - visual_offset : visual_offset + local_visual_count - ] - visual_pos_masks = local_mask + # Flatten local mask to [B*S/tp] matching hidden_states batch-major layout + visual_pos_masks = local_mask.flatten() + elif visual_pos_masks.ndim > 1: + visual_pos_masks = visual_pos_masks.flatten() # If TP is enabled, hidden_states has shape [..., Hidden_Dim / TP_Size], # but visual_embeds usually has full [Hidden_Dim]. We need to slice visual_embeds column-wise. @@ -452,14 +499,24 @@ def _deepstack_process( visual_embeds = visual_embeds[:, start_col:end_col] hidden_states = hidden_states.clone() - local_this = hidden_states[visual_pos_masks, :] + visual_embeds - hidden_states[visual_pos_masks, :] = ( - local_this # 这个操作可能会导致paddle转静态图或推理时出问题,建议使用 scatter - ) + update_indices = paddle.nonzero(visual_pos_masks) + # Under SP, visual tokens are unevenly distributed across ranks. 
After row-slicing + # visual_pos_masks and visual_embeds to the local sequence chunk, some ranks may + # have zero visual tokens (local_visual_count == 0), producing visual_embeds with + # shape [0, H]. Guard against passing an empty updates tensor to scatter_nd_add, + # whose behavior is undefined / backend-dependent in that case. + if visual_embeds.shape[0] > 0: + hidden_states = paddle.scatter_nd_add( + hidden_states, update_indices, visual_embeds + ) # [Supplement 3] Restore original shape [B*S, D] -> [B, S, D] if necessary if len(original_shape) > 2: hidden_states = hidden_states.reshape(original_shape) + if _sp_transposed: + hidden_states = hidden_states.transpose( + [1, 0, 2] + ) # [B,S/tp,H] -> [S/tp,B,H] return hidden_states @@ -513,11 +570,8 @@ def __init__( self.context_parallel_lm = ( language_transformer_config.context_parallel_size ) - assert not ( - self.sequence_parallel_lm or self.context_parallel_lm > 1 - ), ( - f"qwenvl donnot support sequence parallel {self.sequence_parallel_lm} " - f"or context parallel {self.context_parallel_lm}" + assert not (self.context_parallel_lm > 1), ( + f"qwenvl donnot support context parallel {self.context_parallel_lm}" ) self.share_embeddings_and_output_weights = False self.rope_deltas = None