73 changes: 73 additions & 0 deletions src/paddlefleet/models/gpt/gpt_embedding.py
@@ -344,6 +344,77 @@ def forward(
[1, 0, 2, 3]
).contiguous()

# Precompute visual scatter indices for _deepstack_process.
# These indices are layer-independent and would otherwise be
# redundantly computed in every deepstack layer.
visual_update_indices = None
visual_gather_indices = None
if visual_pos_masks is not None:
Comment on lines +347 to +352 (Copilot AI, Apr 1, 2026)

The new visual_update_indices / visual_gather_indices logic (in particular the per-rank slicing and gather-index construction in the SP branch) has no unit-test coverage yet. Consider adding assertions to the existing tests/single_card_tests/transformer/test_qwen_vl_sp.py: in the non-SP case, check that the indices line up with the mask; in the SP case, check that the expected local indices / gather indices are produced under mocked mp_rank/mp_size; and cover the empty-visual edge case.

Copilot generated this review using guidance from repository custom instructions.
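A minimal sketch of the non-SP assertion the comment asks for, with NumPy standing in for Paddle tensors (the helper name and shapes are illustrative, not from the PR):

```python
import numpy as np

def expected_update_indices(visual_pos_masks):
    """Reference oracle: flattened positions where the visual mask is set.

    Mirrors paddle.nonzero(mask.flatten()).squeeze(-1) in pure NumPy.
    """
    return np.flatnonzero(visual_pos_masks.reshape(-1))

# Non-SP case: indices must align with the mask itself.
mask = np.array([[0, 1, 1, 0],
                 [1, 0, 0, 1]], dtype=bool)
idx = expected_update_indices(mask)
assert np.array_equal(idx, np.array([1, 2, 4, 7]))

# Empty-visual edge case: no indices at all.
empty = np.zeros((2, 4), dtype=bool)
assert expected_update_indices(empty).size == 0
```

A real test would compare this oracle against the indices the forward pass actually produces.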
if self.sequence_parallel:
try:
from paddle.distributed.fleet import (
get_hybrid_communicate_group,
)

hcg = get_hybrid_communicate_group()
mp_rank = hcg.get_model_parallel_rank()
mp_size = hcg.get_model_parallel_world_size()
except (ImportError, AttributeError):
batch_size = visual_pos_masks.shape[0]
full_seq_len = visual_pos_masks.shape[1]
# decoder_input is already [S/tp, B, H] after SP scatter
local_seq = decoder_input.shape[0]
mp_size = (batch_size * full_seq_len) // (
local_seq * batch_size
)
mp_rank = paddle.distributed.get_rank() % mp_size

Comment on lines +363 to +371 (Copilot AI, Apr 1, 2026)

In the except fallback branch, inferring the model-parallel rank with mp_rank = dist.get_rank() % mp_size is generally incorrect once DP/PP/other topologies are present (the global rank modulo mp_size is not the MP rank), so the local slice / gather indices come out wrong. Consider reading rank/world_size directly from the known TP/MP group (e.g. self.embedding.tp_group.rank/nranks, or paddlefleet.parallel_state's get_tensor_model_parallel_rank/world_size) instead of deriving them from the global rank.

Suggested change
batch_size = visual_pos_masks.shape[0]
full_seq_len = visual_pos_masks.shape[1]
# decoder_input is already [S/tp, B, H] after SP scatter
local_seq = decoder_input.shape[0]
mp_size = (batch_size * full_seq_len) // (
local_seq * batch_size
)
mp_rank = paddle.distributed.get_rank() % mp_size
# Fallback: try to get tensor model parallel rank/world_size
try:
from paddlefleet import parallel_state
mp_rank = parallel_state.get_tensor_model_parallel_rank()
mp_size = (
parallel_state.get_tensor_model_parallel_world_size()
)
except (ImportError, AttributeError):
# Final fallback: infer mp_size from shapes and assume rank 0
batch_size = visual_pos_masks.shape[0]
full_seq_len = visual_pos_masks.shape[1]
# decoder_input is already [S/tp, B, H] after SP scatter
local_seq = decoder_input.shape[0]
mp_size = (batch_size * full_seq_len) // (
local_seq * batch_size
)
mp_rank = 0

full_seq_len = visual_pos_masks.shape[1]
chunk_s = full_seq_len // mp_size
start_s = mp_rank * chunk_s
Comment on lines +373 to +374 (Copilot AI, Apr 1, 2026)

In the SP branch, chunk_s = full_seq_len // mp_size silently drops trailing tokens whenever full_seq_len is not divisible by mp_size, so local_mask and the derived indices diverge from the real SP split. Consider asserting divisibility explicitly here, or implementing a split that carries the remainder so the non-divisible case is covered.

Suggested change
chunk_s = full_seq_len // mp_size
start_s = mp_rank * chunk_s
# Support non-divisible sequence length by distributing the remainder
base_chunk = full_seq_len // mp_size
remainder = full_seq_len % mp_size
if mp_rank < remainder:
# Ranks before `remainder` get one extra token
chunk_s = base_chunk + 1
start_s = mp_rank * (base_chunk + 1)
else:
# Remaining ranks get the base_chunk size
chunk_s = base_chunk
start_s = remainder * (base_chunk + 1) + (mp_rank - remainder) * base_chunk

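The remainder-distributing split in the suggestion can be checked in isolation. This pure-Python sketch (hypothetical helper name) verifies that the per-rank chunks tile the full sequence contiguously with no gaps:

```python
def sp_chunk(full_seq_len, mp_size, mp_rank):
    """Return (start, size) of this rank's slice, remainder spread over early ranks."""
    base, rem = divmod(full_seq_len, mp_size)
    if mp_rank < rem:
        # Ranks below `rem` get one extra token, as in the suggestion above.
        return mp_rank * (base + 1), base + 1
    return rem * (base + 1) + (mp_rank - rem) * base, base

# 10 tokens over 4 ranks: sizes 3,3,2,2, covering positions 0..9 exactly once.
chunks = [sp_chunk(10, 4, r) for r in range(4)]
assert [size for _, size in chunks] == [3, 3, 2, 2]
covered = sorted(i for start, size in chunks for i in range(start, start + size))
assert covered == list(range(10))
```

The divisible case degenerates to the original chunk_s = full_seq_len // mp_size behavior.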

local_mask = visual_pos_masks[:, start_s : start_s + chunk_s]
batch_size = visual_pos_masks.shape[0]

per_sample_total = paddle.cast(visual_pos_masks, "int32").sum(
axis=1
)
per_sample_pre = (
paddle.cast(visual_pos_masks[:, :start_s], "int32").sum(
axis=1
)
if start_s > 0
else paddle.zeros([batch_size], dtype="int32")
)

gather_indices_list = []
cumulative_total = 0
for i in range(batch_size):
total_i = int(per_sample_total[i].item())
pre_i = int(per_sample_pre[i].item())
count_i = int(
paddle.cast(local_mask[i], "int32").sum().item()
)
if count_i > 0:
gather_indices_list.append(
paddle.arange(
cumulative_total + pre_i,
cumulative_total + pre_i + count_i,
)
)
cumulative_total += total_i

if gather_indices_list:
visual_gather_indices = paddle.concat(gather_indices_list)
else:
visual_gather_indices = paddle.zeros([0], dtype="int64")
Comment on lines +379 to +410 (Copilot AI, Apr 1, 2026)

The per-sample for-loop here calls .item() repeatedly, pulling tensors back to Python scalars; that forces CPU/GPU synchronization, breaks static-graph/export, and adds noticeable per-step overhead. Consider building the gather indices with pure tensor ops (e.g. a vectorized cumsum/offsets computation over local_mask) so the Python loop and .item() calls go away.

Suggested change
per_sample_total = paddle.cast(visual_pos_masks, "int32").sum(
axis=1
)
per_sample_pre = (
paddle.cast(visual_pos_masks[:, :start_s], "int32").sum(
axis=1
)
if start_s > 0
else paddle.zeros([batch_size], dtype="int32")
)
gather_indices_list = []
cumulative_total = 0
for i in range(batch_size):
total_i = int(per_sample_total[i].item())
pre_i = int(per_sample_pre[i].item())
count_i = int(
paddle.cast(local_mask[i], "int32").sum().item()
)
if count_i > 0:
gather_indices_list.append(
paddle.arange(
cumulative_total + pre_i,
cumulative_total + pre_i + count_i,
)
)
cumulative_total += total_i
if gather_indices_list:
visual_gather_indices = paddle.concat(gather_indices_list)
else:
visual_gather_indices = paddle.zeros([0], dtype="int64")
visual_mask_int = paddle.cast(visual_pos_masks, "int32")
per_sample_total = visual_mask_int.sum(axis=1)
# Compute each sample's start offset in the globally flattened visual sequence:
# sample_offsets[i] = sum_{j < i} per_sample_total[j]
sample_offsets = paddle.concat(
[
paddle.zeros([1], dtype="int32"),
paddle.cumsum(per_sample_total[:-1]),
],
axis=0,
)
# prefix[b, s] = number of visual tokens in sample b at positions <= s, minus 1 (0-based index)
prefix = paddle.cumsum(visual_mask_int, axis=1) - 1
global_idx = prefix + sample_offsets.unsqueeze(1)
# Keep only the slice owned by this rank, then pick the global indices where local_mask is set
local_global_idx = global_idx[:, start_s : start_s + chunk_s]
visual_gather_indices = paddle.masked_select(
local_global_idx,
paddle.cast(local_mask, "bool"),
).astype("int64")

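The cumsum-based construction in the suggestion can be cross-checked against the original loop. A NumPy model of both (NumPy stands in for Paddle here; shapes follow the diff):

```python
import numpy as np

def gather_indices_loop(mask, start_s, chunk_s):
    # Original per-sample Python loop, modeled in NumPy.
    out, cumulative = [], 0
    for row in mask:
        pre = int(row[:start_s].sum())
        count = int(row[start_s:start_s + chunk_s].sum())
        out.extend(range(cumulative + pre, cumulative + pre + count))
        cumulative += int(row.sum())
    return np.array(out, dtype=np.int64)

def gather_indices_vectorized(mask, start_s, chunk_s):
    # Vectorized version: per-sample offsets plus per-position prefix sums.
    m = mask.astype(np.int64)
    offsets = np.concatenate([[0], np.cumsum(m.sum(axis=1))[:-1]])
    prefix = np.cumsum(m, axis=1) - 1           # 0-based index within each sample
    global_idx = prefix + offsets[:, None]
    local = global_idx[:, start_s:start_s + chunk_s]
    return local[mask[:, start_s:start_s + chunk_s]]

mask = np.array([[0, 1, 1, 0, 1, 0],
                 [1, 1, 0, 0, 0, 1]], dtype=bool)
a = gather_indices_loop(mask, start_s=2, chunk_s=3)
b = gather_indices_vectorized(mask, start_s=2, chunk_s=3)
assert np.array_equal(a, b)
```

The vectorized form has no data-dependent Python control flow, so the same structure ported to Paddle ops should stay export-friendly.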

visual_update_indices = paddle.nonzero(local_mask.flatten())
else:
visual_update_indices = paddle.nonzero(
visual_pos_masks.flatten()
)
Comment on lines +412 to +416 (Copilot AI, Apr 1, 2026)

paddle.nonzero(local_mask.flatten()) / paddle.nonzero(visual_pos_masks.flatten()) returns an index tensor of shape [N, 1] for 1D input; without a squeeze(-1) it is easy to hit shape mismatches downstream when the result is used as a 1D index (e.g. in scatter/gather). Consider applying an explicit squeeze(-1) and documenting the expected index shape in the comment/variable name.

Suggested change
visual_update_indices = paddle.nonzero(local_mask.flatten())
else:
visual_update_indices = paddle.nonzero(
visual_pos_masks.flatten()
)
# nonzero on 1D input returns indices of shape [N, 1]; squeeze to get 1D [N] index
visual_update_indices = paddle.nonzero(
local_mask.flatten()
).squeeze(-1)
else:
# nonzero on 1D input returns indices of shape [N, 1]; squeeze to get 1D [N] index
visual_update_indices = paddle.nonzero(
visual_pos_masks.flatten()
).squeeze(-1)

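NumPy's `argwhere` exhibits the same [N, 1] shape pitfall the comment describes for `paddle.nonzero` on 1D input (NumPy is used here purely for illustration):

```python
import numpy as np

flat = np.array([0, 1, 0, 1, 1], dtype=bool)

# argwhere on a 1D array returns shape [N, 1], like paddle.nonzero.
idx_2d = np.argwhere(flat)
assert idx_2d.shape == (3, 1)

# Squeezing the trailing axis gives the 1D [N] index downstream gathers expect.
idx_1d = idx_2d.squeeze(-1)
assert idx_1d.shape == (3,)
assert np.array_equal(flat[idx_1d], np.array([True, True, True]))
```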

preproc_output = {
"hidden_states": decoder_input,
"attention_mask": attention_mask,
@@ -354,6 +425,8 @@ def forward(
"position_ids": position_ids,
"deepstack_visual_emb": deepstack_visual_embeds,
"visual_pos_masks": visual_pos_masks,
"visual_update_indices": visual_update_indices,
"visual_gather_indices": visual_gather_indices,
}
if mtp_emb_res is not None:
assert (