From f038a82c33003d77e817b7df347af6b9057f6233 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:32:41 +0800 Subject: [PATCH 01/23] cache tokens in Python side to reduce pybind reading overhead Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 13 +++++++++++++ tensorrt_llm/_torch/pyexecutor/model_engine.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 63ac568f4dd..91e80d07bdf 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -246,6 +246,8 @@ def __init__( self.is_cuda_graph_dummy = False self.py_lora_task_layer_module_configs = None + self.py_tokens = super().get_tokens() + self.py_return_log_probs = return_log_probs self.py_return_context_logits = return_context_logits self.py_return_generation_logits = return_generation_logits @@ -260,6 +262,17 @@ def __init__( return_log_probs, return_context_logits, return_generation_logits) + def get_tokens(self, beam: int): + return self.py_tokens[beam] + + def get_last_tokens(self, beam: int): + return self.py_tokens[beam][-1] + + def add_new_token(self, token: int, beam: int): + self.py_tokens[beam].append(token) + # sync to C++ side + super().add_new_token(token, beam) + def create_response( self, use_fast_logits=False, diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 49bf6194b27..db9b421d7ad 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -1120,7 +1120,7 @@ def _prepare_tp_inputs( gather_ids.append(len(input_ids) - 1) sequence_lengths.append(len(prompt_tokens)) prompt_lengths.append(len(prompt_tokens)) - past_seen_token_num = request.context_current_position + past_seen_token_num = begin_compute num_cached_tokens_per_seq.append(past_seen_token_num) multimodal_embedding = request.multimodal_embedding if multimodal_embedding is not None: From bba671970cdd2b5341a8d6842f59978976bc32a5 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:36:41 +0800 Subject: [PATCH 02/23] refine Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 91e80d07bdf..0c29be89f38 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -262,16 +262,16 @@ def __init__( return_log_probs, return_context_logits, return_generation_logits) - def get_tokens(self, beam: int): + def get_tokens(self, beam: int) -> int: return self.py_tokens[beam] - def get_last_tokens(self, beam: int): + def get_last_tokens(self, beam: int) -> int: return self.py_tokens[beam][-1] - def add_new_token(self, token: int, beam: int): + def add_new_token(self, token: int, beam: int) -> int: self.py_tokens[beam].append(token) # sync to C++ side - super().add_new_token(token, beam) + return super().add_new_token(token, beam) def create_response( self, From 3c836448fb422479406fe8f1856549730bde9e40 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 18:57:54 +0800 Subject: [PATCH 03/23] 
pure Python LlmResponse Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 2 + cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 15 ++++--- .../pybind/batch_manager/bindings.cpp | 2 + tensorrt_llm/_torch/pyexecutor/llm_request.py | 43 ++++++++++--------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 24 ++++++----- tensorrt_llm/executor/result.py | 12 +++--- tensorrt_llm/executor/worker.py | 7 ++- 7 files changed, 57 insertions(+), 48 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 086dc2bf4a5..dca20816dba 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2328,6 +2328,8 @@ class LlmRequest : public GenericLlmRequest /// @return An optional Response std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0); + executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 6fc7051ad7e..a722587b799 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -39,8 +39,16 @@ runtime::SizeType32 GenericLlmRequest::getBeamWidthByIter(bool template class GenericLlmRequest; -/// Note that there is some dependency on the order of operations in this method. Modify with care! std::optional LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank) +{ + auto requestId = isChild() ? mParentRequestId : mRequestId; + auto response = executor::Response(requestId, std::move(createResult(useFastLogits, mpiWorldRank)), mClientId); + + return response; +} + +/// Note that there is some dependency on the order of operations in this method. Modify with care! +executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) @@ -192,11 +200,6 @@ std::optional LlmRequest::createResponse(bool useFastLogits, // Update position of last sent response setMaxSentTokenLen(maxNbTokens); - - auto requestId = isChild() ? 
mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(result), mClientId); - - return response; } void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 35f32a3b128..a3399e1833b 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -360,6 +360,8 @@ void initBindings(pybind11::module_& m) py::arg("enable_kv_cache_reuse") = false) .def("create_response", &tb::LlmRequest::createResponse, py::arg("use_fast_logits") = false, py::arg("mpi_world_rank") = 0) + .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false, + py::arg("mpi_world_rank") = 0) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")); diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index c89ad56ec60..ef9bdbde637 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -219,25 +219,18 @@ def __getattr__(self, item): class LlmResponse: """LlmResponse wraps `bindings.executor.Response` but detour some features to Python implementation""" - def __init__(self, response: tensorrt_llm.bindings.executor.Response, - py_result: PyResult): - self._response = response - self._py_result = py_result - - def __getstate__(self): - return self._response, self._py_result - - def __setstate__(self, state): - self._response, self._py_result = state - - @property - def result(self) -> tensorrt_llm.bindings.executor.Result: - return LlmResult( - self._response.result, - self._py_result) # LlmResult masquerades bindings.executor.Result + def __init__(self, + request_id: int, + error: str = None, + result: LlmResult = None, + client_id: int = None): + self.request_id = request_id + self.error = error + self.result = result + self.client_id = client_id - def __getattr__(self, item): - return getattr(self._response, item) + def has_error(self): + return self.error is not None class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): @@ -269,6 +262,7 @@ def __init__( **kwargs) self.py_client_id = client_id self.py_request_id = self.request_id + self.py_llm_request_type = self.llm_request_type self.py_end_id = self.end_id self.py_prompt_len = self.prompt_len self.py_orig_prompt_len = self.orig_prompt_len @@ -299,6 +293,9 @@ def __init__( return_generation_logits, exclude_last_generation_logits) + def is_generation_only_request(self): + return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY + def get_tokens(self, beam: int) -> int: return self.py_tokens[beam] @@ -314,9 +311,13 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: - response = super().create_response(use_fast_logits, mpi_world_rank) - return LlmResponse(response, - self.py_result) if response is not None else None + return LlmResponse( + request_id=self.py_request_id, + result=LlmResult( + super().create_result(use_fast_logits, mpi_world_rank), + self.py_result), + client_id=self.py_client_id, + ) @property def is_dummy(self): diff --git 
a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 54ccc556504..37ae2da1963 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -30,8 +30,8 @@ from ..distributed import Distributed from .kv_cache_transceiver import KvCacheTransceiver -from .llm_request import (ExecutorRequest, ExecutorResponse, LlmRequest, - LlmRequestState, executor_request_to_llm_request) +from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState, + LlmResponse, executor_request_to_llm_request) from .model_engine import ModelEngine from .sampler import Sampler, SampleState, SampleStateTensors, TorchSampler from .scheduler import ScheduledRequests @@ -323,14 +323,14 @@ def await_responses( self, id: Optional[Union[List[int], int]] = None, timeout: Optional[datetime.timedelta] = None, - ) -> Union[List[List[ExecutorResponse]], List[ExecutorResponse]]: + ) -> Union[List[List[LlmResponse]], List[LlmResponse]]: """ Await for ready responses Args: id (Optional[Union[List[int], int]]): Request id timeout (Optional[datetime.timedelta]): The maximum time to wait for new responses Returns: - Union[List[tensorrt_llm.bindings.executor.Response], List[List[tensorrt_llm.bindings.executor.Response]]]: Responses + Union[List[LlmResponse], List[List[LlmResponse]]]: Responses """ timeout = timeout.total_seconds() if timeout is not None else None if id is None: @@ -1934,8 +1934,10 @@ def _handle_errors(self, error_msg: Optional[str] = None): req_id = request.py_request_id request.state = LlmRequestState.GENERATION_COMPLETE self._terminate_request(request) - error_responses[req_id] = ExecutorResponse( - req_id, error_msg, client_id=request.py_client_id) + error_responses[req_id] = LlmResponse( + request_id=req_id, + error=error_msg, + client_id=request.py_client_id) self.active_requests.clear() self._enqueue_responses(error_responses) @@ -1979,7 +1981,7 @@ def _handle_cancelled_requests(self): self._enqueue_responses(cancelled_responses) @nvtx_range("_enqueue_responses") - def _enqueue_responses(self, responses: Dict[int, ExecutorResponse]): + def _enqueue_responses(self, responses: Dict[int, LlmResponse]): if 0 not in self.dist.mapping.tp_group and not self.gather_all_responses: return @@ -2036,7 +2038,7 @@ def _handle_responses(self): requests_to_terminate.append(request) continue - if request.is_generation_only_request: + if request.is_generation_only_request(): # If request is in transmission, so we don't need to emit a response # Also, for the first iteration with overlap, we should skip since first # token has already been emitted previously @@ -2048,7 +2050,7 @@ def _handle_responses(self): request.draft_tokens = request.py_draft_tokens request.decoding_iter = request.py_decoding_iter - response: Response = request.create_response(False, self.dist.rank) + response = request.create_response(False, self.dist.rank) request_done = False if response: request_done = response.result.is_final @@ -2075,7 +2077,7 @@ def _terminate_ctx_finished_requests(self): def _await_any_response(self, timeout: Optional[float] = None - ) -> List[ExecutorResponse]: + ) -> List[LlmResponse]: def any_responses_ready(): return len(self.responses) > 0 or self.is_shutdown @@ -2092,7 +2094,7 @@ def any_responses_ready(): def _await_single_response( self, id: int, - timeout: Optional[float] = None) -> List[ExecutorResponse]: + timeout: Optional[float] = None) -> List[LlmResponse]: with self.response_cv: def key_has_response(): diff --git 
a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 0f2e1581cae..acfdc007958 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -16,7 +16,7 @@ from ..llmapi.tracer import global_tracer from ..llmapi.utils import AsyncQueue from ..sampling_params import LogprobParams, SamplingParams -from .utils import ErrorResponse, has_event_loop, is_llm_response +from .utils import ErrorResponse, has_event_loop if TYPE_CHECKING: from .executor import GenerationExecutor @@ -282,7 +282,11 @@ def _handle_response(self, if self._background_error_handler is not None and ( handler := self._background_error_handler()): handler(response.error) - elif is_llm_response(response): + elif isinstance(response, ErrorResponse): + if self._background_error_handler is not None and ( + handler := self._background_error_handler()): + handler(response.error_msg) + elif hasattr(response, "request_id"): if response.has_error(): if self._background_error_handler is not None and ( handler := self._background_error_handler()): @@ -318,10 +322,6 @@ def _handle_response(self, if self._background_error_handler and ( handler := self._background_error_handler()): handler() - elif isinstance(response, ErrorResponse): - if self._background_error_handler is not None and ( - handler := self._background_error_handler()): - handler(response.error_msg) else: raise ValueError(f"Unknown response type: {response}") diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 5f057c65012..0a421e4668a 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -39,7 +39,7 @@ from .result import (GenerationResult, IterationResult, LogProbsResult, ResponseWrapper, compute_logprobs) from .utils import (ErrorResponse, IntraProcessQueue, RequestError, - WorkerCommIpcAddrs, has_event_loop, is_llm_response) + WorkerCommIpcAddrs, has_event_loop) __all__ = [ "GenerationExecutorWorker", @@ -994,9 +994,8 @@ def _send_rsp( # Eliminate the finished GenerationRequest instances timely, which may # take considerable memory. 
- if is_llm_response(response): - if response.has_error() or response.result.is_final: - worker._pop_result(response.client_id) + if response.has_error() or response.result.is_final: + worker._pop_result(response.client_id) elif isinstance(response, ErrorResponse): worker._pop_result(response.client_id) else: From f0bb7c8234e89ee0b57ec241a0a88db7c1dd63ad Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:04:38 +0800 Subject: [PATCH 04/23] pure Python LlmResponse Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 6 +++--- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- tensorrt_llm/executor/proxy.py | 6 ++---- tensorrt_llm/executor/utils.py | 9 --------- 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index ef9bdbde637..522f39babf2 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -221,16 +221,16 @@ class LlmResponse: def __init__(self, request_id: int, - error: str = None, + error_msg: str = None, result: LlmResult = None, client_id: int = None): self.request_id = request_id - self.error = error + self.error_msg = error_msg self.result = result self.client_id = client_id def has_error(self): - return self.error is not None + return self.error_msg is not None class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 37ae2da1963..5b84cd7e373 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -1936,7 +1936,7 @@ def _handle_errors(self, error_msg: Optional[str] = None): self._terminate_request(request) error_responses[req_id] = LlmResponse( request_id=req_id, - error=error_msg, + error_msg=error_msg, client_id=request.py_client_id) self.active_requests.clear() self._enqueue_responses(error_responses) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 76cb2737c6e..4e54be21684 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -24,8 +24,7 @@ from .request import CancellingRequest, GenerationRequest from .result import GenerationResult, IterationResult from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs, - create_mpi_comm_session, get_spawn_proxy_process_env, - is_llm_response) + create_mpi_comm_session, get_spawn_proxy_process_env) from .worker import GenerationExecutorWorker, worker_main __all__ = [ @@ -172,8 +171,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) - - if (is_llm_response(res) and res.result.is_final) or isinstance( + if (hasattr(res, "result") and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index bb6466373f1..e7b9975a5df 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -8,7 +8,6 @@ from strenum import StrEnum from tensorrt_llm._utils import mpi_rank -from tensorrt_llm.bindings.executor import Response from tensorrt_llm.llmapi.utils import print_colored_debug from ..llmapi.mpi_session import (MpiCommSession, MpiPoolSession, MpiSession, @@ -141,11 +140,3 @@ class WorkerCommIpcAddrs(NamedTuple): result_queue_addr: tuple[str, 
Optional[bytes]] stats_queue_addr: tuple[str, Optional[bytes]] kv_cache_events_queue_addr: tuple[str, Optional[bytes]] - - -def is_llm_response(instance): - from tensorrt_llm._torch.pyexecutor.llm_request import \ - LlmResponse as PyLlmResponse - - from .result import ResponseWrapper - return isinstance(instance, (Response, PyLlmResponse, ResponseWrapper)) From 60ca761095701150ee7c842bfd0bafd655d28660 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:12:04 +0800 Subject: [PATCH 05/23] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 5 +++-- tensorrt_llm/executor/result.py | 12 ++++++------ tensorrt_llm/executor/utils.py | 4 ++++ tensorrt_llm/executor/worker.py | 7 ++++--- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 4e54be21684..8f16031a000 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -24,7 +24,8 @@ from .request import CancellingRequest, GenerationRequest from .result import GenerationResult, IterationResult from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs, - create_mpi_comm_session, get_spawn_proxy_process_env) + create_mpi_comm_session, get_spawn_proxy_process_env, + is_llm_response) from .worker import GenerationExecutorWorker, worker_main __all__ = [ @@ -171,7 +172,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) - if (hasattr(res, "result") and res.result.is_final) or isinstance( + if (is_llm_response(res) and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index acfdc007958..0f2e1581cae 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -16,7 +16,7 @@ from ..llmapi.tracer import global_tracer from ..llmapi.utils import AsyncQueue from ..sampling_params import LogprobParams, SamplingParams -from .utils import ErrorResponse, has_event_loop +from .utils import ErrorResponse, has_event_loop, is_llm_response if TYPE_CHECKING: from .executor import GenerationExecutor @@ -282,11 +282,7 @@ def _handle_response(self, if self._background_error_handler is not None and ( handler := self._background_error_handler()): handler(response.error) - elif isinstance(response, ErrorResponse): - if self._background_error_handler is not None and ( - handler := self._background_error_handler()): - handler(response.error_msg) - elif hasattr(response, "request_id"): + elif is_llm_response(response): if response.has_error(): if self._background_error_handler is not None and ( handler := self._background_error_handler()): @@ -322,6 +318,10 @@ def _handle_response(self, if self._background_error_handler and ( handler := self._background_error_handler()): handler() + elif isinstance(response, ErrorResponse): + if self._background_error_handler is not None and ( + handler := self._background_error_handler()): + handler(response.error_msg) else: raise ValueError(f"Unknown response type: {response}") diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index e7b9975a5df..fd4cd8444ec 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -140,3 +140,7 @@ class WorkerCommIpcAddrs(NamedTuple): result_queue_addr: tuple[str, Optional[bytes]] stats_queue_addr: tuple[str, Optional[bytes]] kv_cache_events_queue_addr: 
tuple[str, Optional[bytes]] + + +def is_llm_response(instance): + return hasattr(instance, "result") diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 0a421e4668a..5f057c65012 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -39,7 +39,7 @@ from .result import (GenerationResult, IterationResult, LogProbsResult, ResponseWrapper, compute_logprobs) from .utils import (ErrorResponse, IntraProcessQueue, RequestError, - WorkerCommIpcAddrs, has_event_loop) + WorkerCommIpcAddrs, has_event_loop, is_llm_response) __all__ = [ "GenerationExecutorWorker", @@ -994,8 +994,9 @@ def _send_rsp( # Eliminate the finished GenerationRequest instances timely, which may # take considerable memory. - if response.has_error() or response.result.is_final: - worker._pop_result(response.client_id) + if is_llm_response(response): + if response.has_error() or response.result.is_final: + worker._pop_result(response.client_id) elif isinstance(response, ErrorResponse): worker._pop_result(response.client_id) else: From 5f7e9ea233acdcd741c52d06ff6968600331a8fe Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:32:15 +0800 Subject: [PATCH 06/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index a722587b799..2cdb8a5becb 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -48,7 +48,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } /// Note that there is some dependency on the order of operations in this method. Modify with care! -executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) +executor::Result LlmRequest::createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From 1b3a7b7a3286956b18d188b5b1795d80d371b827 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:34:05 +0800 Subject: [PATCH 07/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 2cdb8a5becb..492b45f1d8c 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -48,7 +48,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } /// Note that there is some dependency on the order of operations in this method. Modify with care! 
-executor::Result LlmRequest::createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) +executor::Result LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From df6007306b59862662494a2517924a1f5f17612d Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:39:35 +0800 Subject: [PATCH 08/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/include/tensorrt_llm/batch_manager/llmRequest.h | 2 +- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index dca20816dba..ac5b11d8822 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2328,7 +2328,7 @@ class LlmRequest : public GenericLlmRequest /// @return An optional Response std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0); - executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + std::optional createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 492b45f1d8c..1c11688e9ea 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -42,13 +42,16 @@ template class GenericLlmRequest; std::optional LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank) { auto requestId = isChild() ? mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(createResult(useFastLogits, mpiWorldRank)), mClientId); - - return response; + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + return executor::Response(requestId, std::move(result), mClientId); + } + return std::nullopt; } /// Note that there is some dependency on the order of operations in this method. Modify with care! 
-executor::Result LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) +std::optional LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From 91c904e85e3f50afb0f1ad1aabbea3fad1758f66 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:41:44 +0800 Subject: [PATCH 09/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 1c11688e9ea..702f2083752 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -45,7 +45,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) { - return executor::Response(requestId, std::move(result), mClientId); + return executor::Response(requestId, result, mClientId); } return std::nullopt; } From 5d62ceac9b53c522cb12d57c33ae21ab2c5bde6b Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:54:53 +0800 Subject: [PATCH 10/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 702f2083752..1a55482b07b 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -45,7 +45,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) { - return executor::Response(requestId, result, mClientId); + return executor::Response(requestId, result.value(), mClientId); } return std::nullopt; } From 50eb5b964acb634ebab93dc885b359b26ba07143 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:12:32 +0800 Subject: [PATCH 11/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 1a55482b07b..cfa10eb4056 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -203,6 +203,7 @@ std::optional LlmRequest::createResult(bool useFastLogits, int // Update position of last sent response setMaxSentTokenLen(maxNbTokens); + return result; } void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, From ea2f8cc9e4fd01cfa9a24538ccf38675a4648ac8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:20:45 +0800 Subject: [PATCH 12/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 5b84cd7e373..3470737f04a 100644 --- 
a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response: + if response and response.result: request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 431926d21274e2a2be7483b00b35a51d0e204671 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:27:19 +0800 Subject: [PATCH 13/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 3 +++ tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 522f39babf2..cecadd6e511 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -232,6 +232,9 @@ def __init__(self, def has_error(self): return self.error_msg is not None + def has_result(self): + return self.result._result + class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 3470737f04a..900aa711274 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response and response.result: + if response.has_result(): request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 3fb4d84591a09268932b08ff1e3a4290f2b0708c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 23:10:53 +0800 Subject: [PATCH 14/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 11 +++-------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index cecadd6e511..752305555ad 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -232,9 +232,6 @@ def __init__(self, def has_error(self): return self.error_msg is not None - def has_result(self): - return self.result._result - class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` @@ -314,13 +311,11 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: + result = super().create_result(use_fast_logits, mpi_world_rank) return LlmResponse( request_id=self.py_request_id, - result=LlmResult( - super().create_result(use_fast_logits, mpi_world_rank), - self.py_result), - client_id=self.py_client_id, - ) + result=LlmResult(result, self.py_result), + client_id=self.py_client_id) if result is not None else None @property def is_dummy(self): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py 
b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 900aa711274..5b84cd7e373 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response.has_result(): + if response: request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 04370bad2ae9279aa8831e2c7ccf72aae2c570eb Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 00:07:21 +0800 Subject: [PATCH 15/23] polish Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 8f16031a000..76cb2737c6e 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -172,6 +172,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) + if (is_llm_response(res) and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) From acd09a45940f4b70f5e61b12798ffe4b7f111204 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:37:14 +0800 Subject: [PATCH 16/23] expose createSerializedResult api Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 3 +++ cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 25 +++++++++++++++++++ .../pybind/batch_manager/bindings.cpp | 8 ++++++ 3 files changed, 36 insertions(+) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index ac5b11d8822..d71b6e89f6a 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2330,6 +2330,9 @@ class LlmRequest : public GenericLlmRequest std::optional createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + void createSerializedResult( + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0); + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index cfa10eb4056..ea1cffa2545 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -16,6 +16,7 @@ */ #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/executor/serializeUtils.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" namespace tensorrt_llm::batch_manager @@ -50,6 +51,21 @@ std::optional LlmRequest::createResponse(bool useFastLogits, return std::nullopt; } +void LlmRequest::createSerializedResult( + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0) +{ + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + std::ostringstream oStream; + executor::serialize_utils::serialize(result.value(), oStream); + auto str = oStream.str(); + serializedResult.resize(str.size()); + std::copy(serializedResult.begin(), str.begin(), str.end()); + isFinal = 
result.isFinal; + } +} + /// Note that there is some dependency on the order of operations in this method. Modify with care! std::optional LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { @@ -206,6 +222,15 @@ std::optional LlmRequest::createResult(bool useFastLogits, int return result; } +bool LlmRequest::createSerializedResult(std::string& serializedResult, bool useFastLogits, int32_t mpiWorldRank) +{ + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + executor::serialize + } +} + void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen, bool enableKVCacheReuse) { diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index a3399e1833b..62ef664ac64 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -362,6 +362,14 @@ void initBindings(pybind11::module_& m) py::arg("mpi_world_rank") = 0) .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false, py::arg("mpi_world_rank") = 0) + .def("create_serialized_result", + [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) + { + std::vector serialized_result; + bool is_final = False; + self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); + return py::str(serialized_result.data(), serialized_result.size()), is_final; + }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")); From 48a999ce2abe906d86751119ef11897821885c8c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:38:49 +0800 Subject: [PATCH 17/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 752305555ad..3ea0eb7ec46 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -213,7 +213,8 @@ def __init__(self, result: tensorrt_llm.bindings.executor.Result, def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - return getattr(self._result, item) + result = object.__getattribute__(self, '_result') + return getattr(result, item) class LlmResponse: From 1e36d773786d98d17d5b6d8f172a6d738e27b74d Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:45:18 +0800 Subject: [PATCH 18/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index ea1cffa2545..56e2addf61c 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -52,7 +52,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } void LlmRequest::createSerializedResult( - std::vector& 
serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0) + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank) { auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) @@ -61,8 +61,8 @@ void LlmRequest::createSerializedResult( executor::serialize_utils::serialize(result.value(), oStream); auto str = oStream.str(); serializedResult.resize(str.size()); - std::copy(serializedResult.begin(), str.begin(), str.end()); - isFinal = result.isFinal; + std::copy(str.begin(), str.end(), serializedResult.begin()); + isFinal = result.value().isFinal; } } From 4fd71bc78730c7a26cc3759b2a231ae5d8d7ac62 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:47:36 +0800 Subject: [PATCH 19/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 56e2addf61c..cd119dccf94 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -222,15 +222,6 @@ std::optional LlmRequest::createResult(bool useFastLogits, int return result; } -bool LlmRequest::createSerializedResult(std::string& serializedResult, bool useFastLogits, int32_t mpiWorldRank) -{ - auto result = createResult(useFastLogits, mpiWorldRank); - if (result.has_value()) - { - executor::serialize - } -} - void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen, bool enableKVCacheReuse) { From 1a4920ba5fdb345b6dd86f7159f765bd4fd6d693 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:38:20 +0800 Subject: [PATCH 20/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index cd119dccf94..433f349b07d 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -52,7 +52,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } void LlmRequest::createSerializedResult( - std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank) + std::vector& serializedResult, bool& isFinal, bool useFastLogits, int32_t mpiWorldRank) { auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) From 5e9888a3e774c51a98eb3d478ca45b24b50c25d3 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:40:53 +0800 Subject: [PATCH 21/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 62ef664ac64..ab299af0bae 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -366,7 +366,7 @@ void initBindings(pybind11::module_& m) [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) { 
std::vector serialized_result; - bool is_final = False; + bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); return py::str(serialized_result.data(), serialized_result.size()), is_final; }) From 9bfa834cafbdf7368c72e5aad9aed5b7eff604a8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:49:17 +0800 Subject: [PATCH 22/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index ab299af0bae..e287550535a 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -36,6 +36,7 @@ #include #include #include +#include namespace py = pybind11; namespace tb = tensorrt_llm::batch_manager; @@ -368,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return py::str(serialized_result.data(), serialized_result.size()), is_final; + return std::make_tuple < (py::str(serialized_result.data(), serialized_result.size()), is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) From b37703cfade9c0d16f0f8107297d08d6b609bf24 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:27:36 +0800 Subject: [PATCH 23/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index e287550535a..9f8f95e8ed7 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return std::make_tuple < (py::str(serialized_result.data(), serialized_result.size()), is_final); + return std::make_tuple(serialized_result, is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager"))
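
A short usage sketch for context (illustrative only, not part of the patches above): after this series, PyExecutor.await_responses() yields the pure-Python LlmResponse objects defined in llm_request.py rather than bindings.executor.Response wrappers. The drain loop and the `executor` name below are assumptions for illustration; the LlmResponse/LlmResult attributes it touches (request_id, client_id, error_msg, has_error(), result.is_final) are the ones introduced or used in this series.

    def drain_responses(executor):
        # await_responses() now returns pure-Python LlmResponse objects
        # instead of bindings.executor.Response.
        for response in executor.await_responses():
            if response.has_error():
                # error_msg is populated by PyExecutor._handle_errors()
                raise RuntimeError(
                    f"request {response.request_id} failed: {response.error_msg}")
            result = response.result  # LlmResult: C++ Result plus PyResult facade
            if result.is_final:
                print(f"client {response.client_id} is done")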