From 7ab13756bdd10c4dbd0d5f3e05bdea7b16426040 Mon Sep 17 00:00:00 2001
From: K11OntheBoat <ruianmaidanglao@163.com>
Date: Thu, 11 Sep 2025 11:50:04 +0000
Subject: [PATCH] Support limit thinking lengths

---
 fastdeploy/config.py                          |  2 +
 fastdeploy/engine/engine.py                   |  7 +++
 fastdeploy/entrypoints/engine_client.py       |  2 -
 .../ernie4_5_vl_processor.py                  |  4 ++
 .../model_executor/pre_and_post_process.py    |  2 +-
 fastdeploy/worker/gpu_model_runner.py         | 58 ++++++++++-------
 fastdeploy/worker/worker_process.py           |  1 +
 .../EB_VL_Lite/test_EB_VL_Lite_serving.py     | 62 +++++++++++++++++++
 tests/e2e/test_EB_VL_Lite_serving.py          | 62 +++++++++++++++++++
 9 files changed, 174 insertions(+), 26 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 42f86bc85b..82518bf95f 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -190,6 +190,7 @@ def __init__(
         self.reasoning_parser = None
         self.pad_token_id: int = -1
         self.eos_tokens_lens: int = 2
+        self.think_end_id = None
         self.lm_head_fp32: bool = False
         self.model_format = "auto"
         self.runner = "auto"
@@ -224,6 +225,7 @@ def __init__(
             self.vision_config = PretrainedConfig.from_dict(self.vision_config)
 
         self.ori_vocab_size = args.get("ori_vocab_size", self.vocab_size)
+        self.think_end_id = args.get("think_end_id", None)
 
         architectures = self.architectures[0]
 
diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index 84890b1e15..8df2641447 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -468,6 +468,12 @@ def _start_worker_service(self):
             else len(self.data_processor.tokenizer.vocab)
         )
 
+        think_end_id = self.data_processor.tokenizer.get_vocab().get("</think>", None)
+        if think_end_id is not None:
+            llm_logger.info(f"Get think_end_id {think_end_id} from vocab.")
+        else:
+            llm_logger.info("No </think> token found in vocabulary, the model cannot do reasoning.")
+
         ports = ",".join(self.cfg.parallel_config.engine_worker_queue_port)
         ips = None
         if self.cfg.ips is not None:
@@ -494,6 +500,7 @@ def _start_worker_service(self):
             f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
             f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
             f" --ori_vocab_size {ori_vocab_size}"
+            f" --think_end_id {think_end_id}"
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 5fe3f531e4..239ff59505 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -154,8 +154,6 @@ async def add_requests(self, task):
         task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
         input_ids_len = task["prompt_token_ids_len"]
         task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens"))
-        if task.get("reasoning_max_tokens", None) is None:
-            task["reasoning_max_tokens"] = max(int(task["max_tokens"] * 0.8), 1)
         min_tokens = task.get("min_tokens", 1)
         if "messages" in task:
             del task["messages"]
diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
index 77690b9209..f65d3a1df6 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -253,6 +253,10 @@ def process_request_dict(self, request, max_model_len=None):
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] if request.get("max_tokens") is None: request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) + else: + request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"]) + if request.get("reasoning_max_tokens") is None: + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) data_processor_logger.info(f"Processed request {request}") return request diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 01cc699cb6..06cd0a9cc7 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -193,7 +193,7 @@ def post_process_normal( ) -> ModelRunnerOutput: """Post-processing steps after completing a single token generation.""" # handle vl: - if model_output.enable_thinking: + if model_output.enable_thinking and model_output.think_end_id is not None: exists_think_end = sampler_output.sampled_token_ids == model_output.think_end_id paddle.assign( paddle.where( diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 90c1eb6b54..e3deee3f07 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -322,15 +322,21 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = else: position_ids = None - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens") is not None: + # Enable thinking + self.share_inputs["enable_thinking"][:] = True + self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + else: + # Disable thinking + self.share_inputs["enable_thinking"][:] = False + self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 + if isinstance(request.prompt_token_ids, np.ndarray): prompt_token_ids = request.prompt_token_ids.tolist() else: @@ -549,16 +555,22 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens") is not None: + # Enable thinking + self.share_inputs["enable_thinking"][:] = True + 
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + else: + # Disable thinking + self.share_inputs["enable_thinking"][:] = False + self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 + def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) if res is not None: @@ -853,6 +865,11 @@ def _init_share_inputs(self, max_num_seqs: int): # Initialize rotary position embedding tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) + # Initialize thinking related buffers + self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=False, dtype="bool") + self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + # TODO(gongshaotian): move to models if not self.enable_mm: self.share_inputs["rope_emb"] = get_rope( @@ -952,11 +969,6 @@ def _init_share_inputs(self, max_num_seqs: int): dtype="float32", ) self.share_inputs["image_features"] = None - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["enable_thinking"] = paddle.full( - shape=[1], fill_value=("ernie" in self.model_config.model_type), dtype="bool" - ) - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") def _prepare_inputs(self) -> None: """Prepare the model inputs""" @@ -1398,10 +1410,10 @@ def _dummy_run( ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(getattr(self.model_config, "think_end_id", -1) if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), + enable_thinking=self.share_inputs["enable_thinking"], + think_end_id=self.model_config.think_end_id, + need_think_end=self.share_inputs["need_think_end"], + reasoning_index=self.share_inputs["reasoning_index"], stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) @@ -1714,10 +1726,10 @@ class at the server level, which is too granular for ModelRunner. 
                 ),
                 accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
                 accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-                enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-                think_end_id=(getattr(self.model_config, "think_end_id", -1) if self.enable_mm else -1),
-                need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None),
-                reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None),
+                enable_thinking=self.share_inputs["enable_thinking"],
+                think_end_id=self.model_config.think_end_id,
+                need_think_end=self.share_inputs["need_think_end"][:num_running_requests],
+                reasoning_index=self.share_inputs["reasoning_index"][:num_running_requests],
                 stop_token_ids=self.share_inputs["stop_seqs"],
                 stop_seqs_len=self.share_inputs["stop_seqs_len"],
             )
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 186dd58ea6..c3abf228d6 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -559,6 +559,7 @@ def parse_args():
         help="enable expert parallel",
     )
     parser.add_argument("--ori_vocab_size", type=int, default=None)
+    parser.add_argument("--think_end_id", default=None)
 
     parser.add_argument(
         "--quantization",
diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
index 57717379c7..6acefb1334 100644
--- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -516,6 +516,21 @@ def test_chat_with_thinking(openai_client, capsys):
     assert response.choices[0].message.reasoning_content is None
     assert "</think>" not in response.choices[0].message.content
 
+    # test logic
+    reasoning_max_tokens = None
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": reasoning_max_tokens,
+        },
+    )
+    assert response.choices[0].message.reasoning_content is not None
+
     # enable thinking, streaming
     reasoning_max_tokens = 3
     response = openai_client.chat.completions.create(
@@ -927,3 +942,50 @@ def test_profile_reset_block_num():
             f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内"
             f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
         )
+
+
+def test_thinking_logic_flag(openai_client, capsys):
+    """
+    Test the interaction between token calculation logic and conditional thinking.
+    This test covers:
+    1. Default max_tokens calculation when not provided.
+    2. Capping of max_tokens when it exceeds model limits.
+    3. Default reasoning_max_tokens calculation when not provided.
+    4. Activation of thinking based on the final state of reasoning_max_tokens.
+    """
+
+    response_case_1 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity briefly."}],
+        temperature=1,
+        stream=False,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+        },
+    )
+    assert response_case_1.choices[0].message.reasoning_content is not None
+
+    response_case_2 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": 5,
+        },
+    )
+    assert response_case_2.choices[0].message.reasoning_content is not None
+
+    response_case_3 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": False},
+        },
+    )
+    assert response_case_3.choices[0].message.reasoning_content is None
diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py
index 027e663fe8..41dd81a097 100644
--- a/tests/e2e/test_EB_VL_Lite_serving.py
+++ b/tests/e2e/test_EB_VL_Lite_serving.py
@@ -535,6 +535,21 @@ def test_chat_with_thinking(openai_client, capsys):
     assert response.choices[0].message.reasoning_content is None
     assert "</think>" not in response.choices[0].message.content
 
+    # test logic
+    reasoning_max_tokens = None
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": reasoning_max_tokens,
+        },
+    )
+    assert response.choices[0].message.reasoning_content is not None
+
     # enable thinking, streaming
     reasoning_max_tokens = 3
     response = openai_client.chat.completions.create(
@@ -642,3 +657,50 @@ def test_profile_reset_block_num():
             f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内"
             f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
         )
+
+
+def test_thinking_logic_flag(openai_client, capsys):
+    """
+    Test the interaction between token calculation logic and conditional thinking.
+    This test covers:
+    1. Default max_tokens calculation when not provided.
+    2. Capping of max_tokens when it exceeds model limits.
+    3. Default reasoning_max_tokens calculation when not provided.
+    4. Activation of thinking based on the final state of reasoning_max_tokens.
+    """
+
+    response_case_1 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity briefly."}],
+        temperature=1,
+        stream=False,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+        },
+    )
+    assert response_case_1.choices[0].message.reasoning_content is not None
+
+    response_case_2 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": 5,
+        },
+    )
+    assert response_case_2.choices[0].message.reasoning_content is not None
+
+    response_case_3 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": False},
+        },
+    )
+    assert response_case_3.choices[0].message.reasoning_content is None
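
Reviewer note (illustration, not part of the patch): the patch limits thinking length by tracking,
per request, whether the reasoning section is still open (need_think_end) and how much reasoning
budget remains (reasoning_index), and by forcing the think-end token once that budget is spent.
Below is a minimal, framework-free Python sketch of that bookkeeping for a single request; the
function name limit_thinking and the constant THINK_END_ID are hypothetical stand-ins for
share_inputs["need_think_end"], share_inputs["reasoning_index"] and model_config.think_end_id.

    # Conceptual sketch only; the real logic lives in post_process_normal() and operates on
    # batched paddle tensors rather than a per-request dict.
    THINK_END_ID = 128000  # hypothetical id of the think-end token

    def limit_thinking(sampled_token_id: int, state: dict) -> int:
        """Return the token to emit, closing the reasoning section when the budget runs out."""
        if not state["enable_thinking"]:
            return sampled_token_id
        if sampled_token_id == THINK_END_ID:
            state["need_think_end"] = 0  # the model closed its reasoning section itself
            return sampled_token_id
        if state["need_think_end"]:
            state["reasoning_index"] -= 1
            if state["reasoning_index"] <= 0:  # reasoning_max_tokens exhausted: force the close
                state["need_think_end"] = 0
                return THINK_END_ID
        return sampled_token_id

    if __name__ == "__main__":
        st = {"enable_thinking": True, "need_think_end": 1, "reasoning_index": 3}
        print([limit_thinking(tok, st) for tok in [11, 22, 33, 44, 55]])
        # -> [11, 22, 128000, 44, 55]: the third step is forced to the think-end token

The sketch also mirrors why the request path now only enables thinking when both enable_thinking
is set and reasoning_max_tokens is present: without a budget there is nothing to count down, so
the buffers are zeroed and the post-processing branch is skipped.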