PaddlePaddle · zhupengyang · Sep 15, 2025 · Sep 4, 2025
diff --git a/custom_ops/xpu_ops/src/ops/moe_topk_select.cc b/custom_ops/xpu_ops/src/ops/moe_topk_select.cc
@@ -43,18 +43,20 @@ std::vector<paddle::Tensor> MoeTopkSelect(
   int32_t* block_statistic = nullptr;
   const float* bias_data =
       bias.get_ptr() != nullptr ? bias.get_ptr()->data<float>() : nullptr;
-  int ret = infer_ops::moe_softmax_topk_norm_fusion(
-      xpu_ctx->x_context(),
-      gating_logits.data<float>(),
-      topk_weights.mutable_data<float>(),
-      topk_ids.mutable_data<int>(),
-      block_statistic,
-      token_num,
-      expert_num,
-      moe_topk,
-      0,
-      bias_data);
-  PD_CHECK(ret == 0);
+  if (token_num > 0) {
+    int ret = infer_ops::moe_softmax_topk_norm_fusion(
+        xpu_ctx->x_context(),
+        gating_logits.data<float>(),
+        topk_weights.mutable_data<float>(),
+        topk_ids.mutable_data<int>(),
+        block_statistic,
+        token_num,
+        expert_num,
+        moe_topk,
+        0,
+        bias_data);
+    PD_CHECK(ret == 0);
+  }
 
   return {topk_ids, topk_weights};
 }

diff --git a/custom_ops/xpu_ops/src/ops/pybind/pybind.cc b/custom_ops/xpu_ops/src/ops/pybind/pybind.cc
@@ -416,7 +416,7 @@ PYBIND11_MODULE(fastdeploy_ops, m) {
         py::arg("bias"),
         py::arg("weight_dtype"),
         py::arg("arch"),
-        py::arg("group_size"));
+        py::arg("group_size")=-1);
 
   m.def("ep_moe_expert_combine",
         &MoeEPCombine,

diff --git a/fastdeploy/input/ernie4_5_processor.py b/fastdeploy/input/ernie4_5_processor.py
@@ -265,7 +265,7 @@ def process_response(self, response_dict, **kwargs):
             if tool_call_info.tools_called:
                 response_dict.outputs.tool_calls = tool_call_info.tool_calls
                 response_dict.outputs.text = tool_call_info.content
-        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
+        data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
         if response_dict.outputs.text == "" and response_dict.outputs.reasoning_content == "":
             return None
         return response_dict

diff --git a/fastdeploy/input/text_processor.py b/fastdeploy/input/text_processor.py
@@ -377,7 +377,7 @@ def process_response(self, response_dict, **kwargs):
             if tool_call_info.tools_called:
                 response_dict.outputs.tool_calls = tool_call_info.tool_calls
                 response_dict.outputs.text = tool_call_info.content
-        data_processor_logger.info(f"req_id:{req_id}, token)ids: {token_ids}")
+        data_processor_logger.info(f"req_id:{req_id}, token_ids: {token_ids}")
 
         return response_dict
 

diff --git a/fastdeploy/model_executor/layers/backends/xpu/__init__.py b/fastdeploy/model_executor/layers/backends/xpu/__init__.py
@@ -16,6 +16,16 @@
 xpu backend methods
 """
 
+from .moe.fused_moe import (
+    XPUMoEMethod,
+    XPUWeightOnlyMoeEpMethod,
+    XPUWeightOnlyMoEMethod,
+)
 from .quantization.weight_only import XPUWeightOnlyLinearMethod
 
-__all__ = ["XPUWeightOnlyLinearMethod"]
+__all__ = [
+    "XPUWeightOnlyLinearMethod",
+    "XPUMoEMethod",
+    "XPUWeightOnlyMoEMethod",
+    "XPUWeightOnlyMoeEpMethod",
+]
diff --git a/fastdeploy/model_executor/layers/backends/xpu/moe/__init__.py b/fastdeploy/model_executor/layers/backends/xpu/moe/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+XPU fused MoE (Mixture-of-Experts) methods.
+"""