From 7ab13756bdd10c4dbd0d5f3e05bdea7b16426040 Mon Sep 17 00:00:00 2001
From: K11OntheBoat <ruianmaidanglao@163.com>
Date: Thu, 11 Sep 2025 11:50:04 +0000
Subject: [PATCH] Support limit thinking lengths

---
 fastdeploy/config.py                          |  2 +
 fastdeploy/engine/engine.py                   |  7 +++
 fastdeploy/entrypoints/engine_client.py       |  2 -
 .../ernie4_5_vl_processor.py                  |  4 ++
 .../model_executor/pre_and_post_process.py    |  2 +-
 fastdeploy/worker/gpu_model_runner.py         | 58 ++++++++++-------
 fastdeploy/worker/worker_process.py           |  1 +
 .../EB_VL_Lite/test_EB_VL_Lite_serving.py     | 62 +++++++++++++++++++
 tests/e2e/test_EB_VL_Lite_serving.py          | 62 +++++++++++++++++++
 9 files changed, 174 insertions(+), 26 deletions(-)

diff --git a/fastdeploy/config.py b/fastdeploy/config.py
index 42f86bc85b..82518bf95f 100644
--- a/fastdeploy/config.py
+++ b/fastdeploy/config.py
@@ -190,6 +190,7 @@ def __init__(
         self.reasoning_parser = None
         self.pad_token_id: int = -1
         self.eos_tokens_lens: int = 2
+        self.think_end_id = None
         self.lm_head_fp32: bool = False
         self.model_format = "auto"
         self.runner = "auto"
@@ -224,6 +225,7 @@ def __init__(
             self.vision_config = PretrainedConfig.from_dict(self.vision_config)
 
         self.ori_vocab_size = args.get("ori_vocab_size", self.vocab_size)
+        self.think_end_id = args.get("think_end_id", None)
 
         architectures = self.architectures[0]
 
diff --git a/fastdeploy/engine/engine.py b/fastdeploy/engine/engine.py
index 84890b1e15..8df2641447 100644
--- a/fastdeploy/engine/engine.py
+++ b/fastdeploy/engine/engine.py
@@ -468,6 +468,12 @@ def _start_worker_service(self):
             else len(self.data_processor.tokenizer.vocab)
         )
 
+        think_end_id = self.data_processor.tokenizer.get_vocab().get("</think>", None)
+        if think_end_id is not None:
+            llm_logger.info(f"Get think_end_id {think_end_id} from vocab.")
+        else:
+            llm_logger.info("No </think> token found in vocabulary, the model cannot do reasoning.")
+
         ports = ",".join(self.cfg.parallel_config.engine_worker_queue_port)
         ips = None
         if self.cfg.ips is not None:
@@ -494,6 +500,7 @@ def _start_worker_service(self):
             f" --data_parallel_size {self.cfg.parallel_config.data_parallel_size}"
             f" --quantization '{json.dumps(self.cfg.model_config.quantization)}'"
             f" --ori_vocab_size {ori_vocab_size}"
+            f" --think_end_id {think_end_id}"
             f" --speculative_config '{self.cfg.speculative_config.to_json_string()}'"
             f" --graph_optimization_config '{self.cfg.graph_opt_config.to_json_string()}'"
             f" --guided_decoding_backend {self.cfg.guided_decoding_backend}"
diff --git a/fastdeploy/entrypoints/engine_client.py b/fastdeploy/entrypoints/engine_client.py
index 5fe3f531e4..239ff59505 100644
--- a/fastdeploy/entrypoints/engine_client.py
+++ b/fastdeploy/entrypoints/engine_client.py
@@ -154,8 +154,6 @@ async def add_requests(self, task):
         task["prompt_token_ids_len"] = len(task["prompt_token_ids"])
         input_ids_len = task["prompt_token_ids_len"]
         task["max_tokens"] = min(self.max_model_len - input_ids_len, task.get("max_tokens"))
-        if task.get("reasoning_max_tokens", None) is None:
-            task["reasoning_max_tokens"] = max(int(task["max_tokens"] * 0.8), 1)
         min_tokens = task.get("min_tokens", 1)
         if "messages" in task:
             del task["messages"]
diff --git a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
index 77690b9209..f65d3a1df6 100644
--- a/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
+++ b/fastdeploy/input/ernie4_5_vl_processor/ernie4_5_vl_processor.py
@@ -253,6 +253,10 @@ def process_request_dict(self, request, max_model_len=None):
request["prompt_token_ids"] = request["prompt_token_ids"][: max_model_len - 1] if request.get("max_tokens") is None: request["max_tokens"] = max(1, max_model_len - len(request["prompt_token_ids"])) + else: + request["max_tokens"] = min(max_model_len - len(request["prompt_token_ids"]), request["max_tokens"]) + if request.get("reasoning_max_tokens") is None: + request["reasoning_max_tokens"] = max(int(request["max_tokens"] * 0.8), 1) data_processor_logger.info(f"Processed request {request}") return request diff --git a/fastdeploy/model_executor/pre_and_post_process.py b/fastdeploy/model_executor/pre_and_post_process.py index 01cc699cb6..06cd0a9cc7 100644 --- a/fastdeploy/model_executor/pre_and_post_process.py +++ b/fastdeploy/model_executor/pre_and_post_process.py @@ -193,7 +193,7 @@ def post_process_normal( ) -> ModelRunnerOutput: """Post-processing steps after completing a single token generation.""" # handle vl: - if model_output.enable_thinking: + if model_output.enable_thinking and model_output.think_end_id is not None: exists_think_end = sampler_output.sampled_token_ids == model_output.think_end_id paddle.assign( paddle.where( diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 90c1eb6b54..e3deee3f07 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -322,15 +322,21 @@ def insert_tasks_v1(self, req_dicts: List[Request], num_running_requests: int = else: position_ids = None - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens") is not None: + # Enable thinking + self.share_inputs["enable_thinking"][:] = True + self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + else: + # Disable thinking + self.share_inputs["enable_thinking"][:] = False + self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 + if isinstance(request.prompt_token_ids, np.ndarray): prompt_token_ids = request.prompt_token_ids.tolist() else: @@ -549,16 +555,22 @@ def insert_prefill_inputs(self, req_dicts: List[Request], num_running_requests: self.share_inputs["prompt_lens"][idx : idx + 1] = length if self.enable_mm: - enable_thinking = request.get("enable_thinking", True) - enable_thinking = enable_thinking if enable_thinking is not None else True - self.share_inputs["enable_thinking"][:] = enable_thinking - self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 if enable_thinking else 0 - self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens", 2048) self.share_inputs["rope_emb"][idx : idx + 1, :] = self.prepare_rope3d( position_ids, request.get("max_tokens", 2048) ) self.share_inputs["seq_lens_decoder"][idx : idx + 1] = 0 + if request.get("enable_thinking", False) and request.get("reasoning_max_tokens") is not None: + # Enable thinking + self.share_inputs["enable_thinking"][:] = True + 
self.share_inputs["need_think_end"][idx : idx + 1, :] = 1 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = request.get("reasoning_max_tokens") + else: + # Disable thinking + self.share_inputs["enable_thinking"][:] = False + self.share_inputs["need_think_end"][idx : idx + 1, :] = 0 + self.share_inputs["reasoning_index"][idx : idx + 1, :] = 0 + def get_attr_from_request(request, attr, default_value=None): res = request.get(attr, default_value) if res is not None: @@ -853,6 +865,11 @@ def _init_share_inputs(self, max_num_seqs: int): # Initialize rotary position embedding tmp_position_ids = paddle.arange(self.parallel_config.max_model_len).reshape((1, -1)) + # Initialize thinking related buffers + self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + self.share_inputs["enable_thinking"] = paddle.full(shape=[1], fill_value=False, dtype="bool") + self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + # TODO(gongshaotian): move to models if not self.enable_mm: self.share_inputs["rope_emb"] = get_rope( @@ -952,11 +969,6 @@ def _init_share_inputs(self, max_num_seqs: int): dtype="float32", ) self.share_inputs["image_features"] = None - self.share_inputs["need_think_end"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["enable_thinking"] = paddle.full( - shape=[1], fill_value=("ernie" in self.model_config.model_type), dtype="bool" - ) - self.share_inputs["reasoning_index"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") def _prepare_inputs(self) -> None: """Prepare the model inputs""" @@ -1398,10 +1410,10 @@ def _dummy_run( ), accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None), accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None), - enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None), - think_end_id=(getattr(self.model_config, "think_end_id", -1) if self.enable_mm else -1), - need_think_end=(self.share_inputs["need_think_end"] if self.enable_mm else None), - reasoning_index=(self.share_inputs["reasoning_index"] if self.enable_mm else None), + enable_thinking=self.share_inputs["enable_thinking"], + think_end_id=self.model_config.think_end_id, + need_think_end=self.share_inputs["need_think_end"], + reasoning_index=self.share_inputs["reasoning_index"], stop_token_ids=self.share_inputs["stop_seqs"], stop_seqs_len=self.share_inputs["stop_seqs_len"], ) @@ -1714,10 +1726,10 @@ class at the server level, which is too granular for ModelRunner. 
                 ),
                 accept_tokens=(self.share_inputs["accept_tokens"] if self.speculative_decoding else None),
                 accept_num=(self.share_inputs["accept_num"] if self.speculative_decoding else None),
-                enable_thinking=(self.share_inputs["enable_thinking"] if self.enable_mm else None),
-                think_end_id=(getattr(self.model_config, "think_end_id", -1) if self.enable_mm else -1),
-                need_think_end=(self.share_inputs["need_think_end"][:num_running_requests] if self.enable_mm else None),
-                reasoning_index=(self.share_inputs["reasoning_index"][:num_running_requests] if self.enable_mm else None),
+                enable_thinking=self.share_inputs["enable_thinking"],
+                think_end_id=self.model_config.think_end_id,
+                need_think_end=self.share_inputs["need_think_end"][:num_running_requests],
+                reasoning_index=self.share_inputs["reasoning_index"][:num_running_requests],
                 stop_token_ids=self.share_inputs["stop_seqs"],
                 stop_seqs_len=self.share_inputs["stop_seqs_len"],
             )
diff --git a/fastdeploy/worker/worker_process.py b/fastdeploy/worker/worker_process.py
index 186dd58ea6..c3abf228d6 100644
--- a/fastdeploy/worker/worker_process.py
+++ b/fastdeploy/worker/worker_process.py
@@ -559,6 +559,7 @@ def parse_args():
         help="enable expert parallel",
     )
     parser.add_argument("--ori_vocab_size", type=int, default=None)
+    parser.add_argument("--think_end_id", default=None)
 
     parser.add_argument(
         "--quantization",
diff --git a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
index 57717379c7..6acefb1334 100644
--- a/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
+++ b/tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -516,6 +516,21 @@ def test_chat_with_thinking(openai_client, capsys):
     assert response.choices[0].message.reasoning_content is None
     assert "</think>" not in response.choices[0].message.content
 
+    # test logic
+    reasoning_max_tokens = None
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": reasoning_max_tokens,
+        },
+    )
+    assert response.choices[0].message.reasoning_content is not None
+
     # enable thinking, streaming
     reasoning_max_tokens = 3
     response = openai_client.chat.completions.create(
@@ -927,3 +942,50 @@ def test_profile_reset_block_num():
             f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内"
             f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
         )
+
+
+def test_thinking_logic_flag(openai_client, capsys):
+    """
+    Test the interaction between token calculation logic and conditional thinking.
+    This test covers:
+    1. Default max_tokens calculation when not provided.
+    2. Capping of max_tokens when it exceeds model limits.
+    3. Default reasoning_max_tokens calculation when not provided.
+    4. Activation of thinking based on the final state of reasoning_max_tokens.
+    """
+
+    response_case_1 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity briefly."}],
+        temperature=1,
+        stream=False,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+        },
+    )
+    assert response_case_1.choices[0].message.reasoning_content is not None
+
+    response_case_2 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": 5,
+        },
+    )
+    assert response_case_2.choices[0].message.reasoning_content is not None
+
+    response_case_3 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": False},
+        },
+    )
+    assert response_case_3.choices[0].message.reasoning_content is None
diff --git a/tests/e2e/test_EB_VL_Lite_serving.py b/tests/e2e/test_EB_VL_Lite_serving.py
index 027e663fe8..41dd81a097 100644
--- a/tests/e2e/test_EB_VL_Lite_serving.py
+++ b/tests/e2e/test_EB_VL_Lite_serving.py
@@ -535,6 +535,21 @@ def test_chat_with_thinking(openai_client, capsys):
     assert response.choices[0].message.reasoning_content is None
     assert "</think>" not in response.choices[0].message.content
 
+    # test logic
+    reasoning_max_tokens = None
+    response = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": reasoning_max_tokens,
+        },
+    )
+    assert response.choices[0].message.reasoning_content is not None
+
     # enable thinking, streaming
     reasoning_max_tokens = 3
     response = openai_client.chat.completions.create(
@@ -642,3 +657,50 @@ def test_profile_reset_block_num():
             f"Reset total_block_num {actual_value} 与 baseline {baseline} diff需要在5%以内"
             f"Allowed range: [{lower_bound:.1f}, {upper_bound:.1f}]"
         )
+
+
+def test_thinking_logic_flag(openai_client, capsys):
+    """
+    Test the interaction between token calculation logic and conditional thinking.
+    This test covers:
+    1. Default max_tokens calculation when not provided.
+    2. Capping of max_tokens when it exceeds model limits.
+    3. Default reasoning_max_tokens calculation when not provided.
+    4. Activation of thinking based on the final state of reasoning_max_tokens.
+    """
+
+    response_case_1 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity briefly."}],
+        temperature=1,
+        stream=False,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+        },
+    )
+    assert response_case_1.choices[0].message.reasoning_content is not None
+
+    response_case_2 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": True},
+            "reasoning_max_tokens": 5,
+        },
+    )
+    assert response_case_2.choices[0].message.reasoning_content is not None
+
+    response_case_3 = openai_client.chat.completions.create(
+        model="default",
+        messages=[{"role": "user", "content": "Explain gravity in a way that a five-year-old child can understand."}],
+        temperature=1,
+        stream=False,
+        max_tokens=20,
+        extra_body={
+            "chat_template_kwargs": {"enable_thinking": False},
+        },
+    )
+    assert response_case_3.choices[0].message.reasoning_content is None
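
Reviewer note (illustration, not part of the patch): the patch limits thinking length by tracking,
per request, whether the reasoning section is still open (need_think_end) and how much reasoning
budget remains (reasoning_index), and by forcing the think-end token once that budget is spent.
Below is a minimal, framework-free Python sketch of that bookkeeping for a single request; the
function name limit_thinking and the constant THINK_END_ID are hypothetical stand-ins for
share_inputs["need_think_end"], share_inputs["reasoning_index"] and model_config.think_end_id.

    # Conceptual sketch only; the real logic lives in post_process_normal() and operates on
    # batched paddle tensors rather than a per-request dict.
    THINK_END_ID = 128000  # hypothetical id of the think-end token

    def limit_thinking(sampled_token_id: int, state: dict) -> int:
        """Return the token to emit, closing the reasoning section when the budget runs out."""
        if not state["enable_thinking"]:
            return sampled_token_id
        if sampled_token_id == THINK_END_ID:
            state["need_think_end"] = 0  # the model closed its reasoning section itself
            return sampled_token_id
        if state["need_think_end"]:
            state["reasoning_index"] -= 1
            if state["reasoning_index"] <= 0:  # reasoning_max_tokens exhausted: force the close
                state["need_think_end"] = 0
                return THINK_END_ID
        return sampled_token_id

    if __name__ == "__main__":
        st = {"enable_thinking": True, "need_think_end": 1, "reasoning_index": 3}
        print([limit_thinking(tok, st) for tok in [11, 22, 33, 44, 55]])
        # -> [11, 22, 128000, 44, 55]: the third step is forced to the think-end token

The sketch also mirrors why the request path now only enables thinking when both enable_thinking
is set and reasoning_max_tokens is present: without a budget there is nothing to count down, so
the buffers are zeroed and the post-processing branch is skipped.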