From f038a82c33003d77e817b7df347af6b9057f6233 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:32:41 +0800 Subject: [PATCH 01/23] cache tokens in Python side to reduce pybind reading overhead Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 13 +++++++++++++ tensorrt_llm/_torch/pyexecutor/model_engine.py | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 63ac568f4dd..91e80d07bdf 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -246,6 +246,8 @@ def __init__( self.is_cuda_graph_dummy = False self.py_lora_task_layer_module_configs = None + self.py_tokens = super().get_tokens() + self.py_return_log_probs = return_log_probs self.py_return_context_logits = return_context_logits self.py_return_generation_logits = return_generation_logits @@ -260,6 +262,17 @@ def __init__( return_log_probs, return_context_logits, return_generation_logits) + def get_tokens(self, beam: int): + return self.py_tokens[beam] + + def get_last_tokens(self, beam: int): + return self.py_tokens[beam][-1] + + def add_new_token(self, token: int, beam: int): + self.py_tokens[beam].append(token) + # sync to C++ side + super().add_new_token(token, beam) + def create_response( self, use_fast_logits=False, diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py index 49bf6194b27..db9b421d7ad 100644 --- a/tensorrt_llm/_torch/pyexecutor/model_engine.py +++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py @@ -1120,7 +1120,7 @@ def _prepare_tp_inputs( gather_ids.append(len(input_ids) - 1) sequence_lengths.append(len(prompt_tokens)) prompt_lengths.append(len(prompt_tokens)) - past_seen_token_num = request.context_current_position + past_seen_token_num = begin_compute num_cached_tokens_per_seq.append(past_seen_token_num) multimodal_embedding = request.multimodal_embedding if multimodal_embedding is not None: From bba671970cdd2b5341a8d6842f59978976bc32a5 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 5 Jun 2025 15:36:41 +0800 Subject: [PATCH 02/23] refine Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 91e80d07bdf..0c29be89f38 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -262,16 +262,16 @@ def __init__( return_log_probs, return_context_logits, return_generation_logits) - def get_tokens(self, beam: int): + def get_tokens(self, beam: int) -> int: return self.py_tokens[beam] - def get_last_tokens(self, beam: int): + def get_last_tokens(self, beam: int) -> int: return self.py_tokens[beam][-1] - def add_new_token(self, token: int, beam: int): + def add_new_token(self, token: int, beam: int) -> int: self.py_tokens[beam].append(token) # sync to C++ side - super().add_new_token(token, beam) + return super().add_new_token(token, beam) def create_response( self, From 3c836448fb422479406fe8f1856549730bde9e40 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 18:57:54 +0800 Subject: [PATCH 03/23] 
pure Python LlmResponse Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 2 + cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 15 ++++--- .../pybind/batch_manager/bindings.cpp | 2 + tensorrt_llm/_torch/pyexecutor/llm_request.py | 43 ++++++++++--------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 24 ++++++----- tensorrt_llm/executor/result.py | 12 +++--- tensorrt_llm/executor/worker.py | 7 ++- 7 files changed, 57 insertions(+), 48 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 086dc2bf4a5..dca20816dba 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2328,6 +2328,8 @@ class LlmRequest : public GenericLlmRequest /// @return An optional Response std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0); + executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 6fc7051ad7e..a722587b799 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -39,8 +39,16 @@ runtime::SizeType32 GenericLlmRequest::getBeamWidthByIter(bool template class GenericLlmRequest; -/// Note that there is some dependency on the order of operations in this method. Modify with care! std::optional LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank) +{ + auto requestId = isChild() ? mParentRequestId : mRequestId; + auto response = executor::Response(requestId, std::move(createResult(useFastLogits, mpiWorldRank)), mClientId); + + return response; +} + +/// Note that there is some dependency on the order of operations in this method. Modify with care! +executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) @@ -192,11 +200,6 @@ std::optional LlmRequest::createResponse(bool useFastLogits, // Update position of last sent response setMaxSentTokenLen(maxNbTokens); - - auto requestId = isChild() ? 
mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(result), mClientId); - - return response; } void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 35f32a3b128..a3399e1833b 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -360,6 +360,8 @@ void initBindings(pybind11::module_& m) py::arg("enable_kv_cache_reuse") = false) .def("create_response", &tb::LlmRequest::createResponse, py::arg("use_fast_logits") = false, py::arg("mpi_world_rank") = 0) + .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false, + py::arg("mpi_world_rank") = 0) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")); diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index c89ad56ec60..ef9bdbde637 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -219,25 +219,18 @@ def __getattr__(self, item): class LlmResponse: """LlmResponse wraps `bindings.executor.Response` but detour some features to Python implementation""" - def __init__(self, response: tensorrt_llm.bindings.executor.Response, - py_result: PyResult): - self._response = response - self._py_result = py_result - - def __getstate__(self): - return self._response, self._py_result - - def __setstate__(self, state): - self._response, self._py_result = state - - @property - def result(self) -> tensorrt_llm.bindings.executor.Result: - return LlmResult( - self._response.result, - self._py_result) # LlmResult masquerades bindings.executor.Result + def __init__(self, + request_id: int, + error: str = None, + result: LlmResult = None, + client_id: int = None): + self.request_id = request_id + self.error = error + self.result = result + self.client_id = client_id - def __getattr__(self, item): - return getattr(self._response, item) + def has_error(self): + return self.error is not None class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): @@ -269,6 +262,7 @@ def __init__( **kwargs) self.py_client_id = client_id self.py_request_id = self.request_id + self.py_llm_request_type = self.llm_request_type self.py_end_id = self.end_id self.py_prompt_len = self.prompt_len self.py_orig_prompt_len = self.orig_prompt_len @@ -299,6 +293,9 @@ def __init__( return_generation_logits, exclude_last_generation_logits) + def is_generation_only_request(self): + return self.py_llm_request_type == LlmRequestType.LLMREQUEST_TYPE_GENERATION_ONLY + def get_tokens(self, beam: int) -> int: return self.py_tokens[beam] @@ -314,9 +311,13 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: - response = super().create_response(use_fast_logits, mpi_world_rank) - return LlmResponse(response, - self.py_result) if response is not None else None + return LlmResponse( + request_id=self.py_request_id, + result=LlmResult( + super().create_result(use_fast_logits, mpi_world_rank), + self.py_result), + client_id=self.py_client_id, + ) @property def is_dummy(self): diff --git 
a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 54ccc556504..37ae2da1963 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -30,8 +30,8 @@ from ..distributed import Distributed from .kv_cache_transceiver import KvCacheTransceiver -from .llm_request import (ExecutorRequest, ExecutorResponse, LlmRequest, - LlmRequestState, executor_request_to_llm_request) +from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState, + LlmResponse, executor_request_to_llm_request) from .model_engine import ModelEngine from .sampler import Sampler, SampleState, SampleStateTensors, TorchSampler from .scheduler import ScheduledRequests @@ -323,14 +323,14 @@ def await_responses( self, id: Optional[Union[List[int], int]] = None, timeout: Optional[datetime.timedelta] = None, - ) -> Union[List[List[ExecutorResponse]], List[ExecutorResponse]]: + ) -> Union[List[List[LlmResponse]], List[LlmResponse]]: """ Await for ready responses Args: id (Optional[Union[List[int], int]]): Request id timeout (Optional[datetime.timedelta]): The maximum time to wait for new responses Returns: - Union[List[tensorrt_llm.bindings.executor.Response], List[List[tensorrt_llm.bindings.executor.Response]]]: Responses + Union[List[LlmResponse], List[List[LlmResponse]]]: Responses """ timeout = timeout.total_seconds() if timeout is not None else None if id is None: @@ -1934,8 +1934,10 @@ def _handle_errors(self, error_msg: Optional[str] = None): req_id = request.py_request_id request.state = LlmRequestState.GENERATION_COMPLETE self._terminate_request(request) - error_responses[req_id] = ExecutorResponse( - req_id, error_msg, client_id=request.py_client_id) + error_responses[req_id] = LlmResponse( + request_id=req_id, + error=error_msg, + client_id=request.py_client_id) self.active_requests.clear() self._enqueue_responses(error_responses) @@ -1979,7 +1981,7 @@ def _handle_cancelled_requests(self): self._enqueue_responses(cancelled_responses) @nvtx_range("_enqueue_responses") - def _enqueue_responses(self, responses: Dict[int, ExecutorResponse]): + def _enqueue_responses(self, responses: Dict[int, LlmResponse]): if 0 not in self.dist.mapping.tp_group and not self.gather_all_responses: return @@ -2036,7 +2038,7 @@ def _handle_responses(self): requests_to_terminate.append(request) continue - if request.is_generation_only_request: + if request.is_generation_only_request(): # If request is in transmission, so we don't need to emit a response # Also, for the first iteration with overlap, we should skip since first # token has already been emitted previously @@ -2048,7 +2050,7 @@ def _handle_responses(self): request.draft_tokens = request.py_draft_tokens request.decoding_iter = request.py_decoding_iter - response: Response = request.create_response(False, self.dist.rank) + response = request.create_response(False, self.dist.rank) request_done = False if response: request_done = response.result.is_final @@ -2075,7 +2077,7 @@ def _terminate_ctx_finished_requests(self): def _await_any_response(self, timeout: Optional[float] = None - ) -> List[ExecutorResponse]: + ) -> List[LlmResponse]: def any_responses_ready(): return len(self.responses) > 0 or self.is_shutdown @@ -2092,7 +2094,7 @@ def any_responses_ready(): def _await_single_response( self, id: int, - timeout: Optional[float] = None) -> List[ExecutorResponse]: + timeout: Optional[float] = None) -> List[LlmResponse]: with self.response_cv: def key_has_response(): diff --git 
a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index 0f2e1581cae..acfdc007958 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -16,7 +16,7 @@ from ..llmapi.tracer import global_tracer from ..llmapi.utils import AsyncQueue from ..sampling_params import LogprobParams, SamplingParams -from .utils import ErrorResponse, has_event_loop, is_llm_response +from .utils import ErrorResponse, has_event_loop if TYPE_CHECKING: from .executor import GenerationExecutor @@ -282,7 +282,11 @@ def _handle_response(self, if self._background_error_handler is not None and ( handler := self._background_error_handler()): handler(response.error) - elif is_llm_response(response): + elif isinstance(response, ErrorResponse): + if self._background_error_handler is not None and ( + handler := self._background_error_handler()): + handler(response.error_msg) + elif hasattr(response, "request_id"): if response.has_error(): if self._background_error_handler is not None and ( handler := self._background_error_handler()): @@ -318,10 +322,6 @@ def _handle_response(self, if self._background_error_handler and ( handler := self._background_error_handler()): handler() - elif isinstance(response, ErrorResponse): - if self._background_error_handler is not None and ( - handler := self._background_error_handler()): - handler(response.error_msg) else: raise ValueError(f"Unknown response type: {response}") diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 5f057c65012..0a421e4668a 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -39,7 +39,7 @@ from .result import (GenerationResult, IterationResult, LogProbsResult, ResponseWrapper, compute_logprobs) from .utils import (ErrorResponse, IntraProcessQueue, RequestError, - WorkerCommIpcAddrs, has_event_loop, is_llm_response) + WorkerCommIpcAddrs, has_event_loop) __all__ = [ "GenerationExecutorWorker", @@ -994,9 +994,8 @@ def _send_rsp( # Eliminate the finished GenerationRequest instances timely, which may # take considerable memory. 
- if is_llm_response(response): - if response.has_error() or response.result.is_final: - worker._pop_result(response.client_id) + if response.has_error() or response.result.is_final: + worker._pop_result(response.client_id) elif isinstance(response, ErrorResponse): worker._pop_result(response.client_id) else: From f0bb7c8234e89ee0b57ec241a0a88db7c1dd63ad Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:04:38 +0800 Subject: [PATCH 04/23] pure Python LlmResponse Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 6 +++--- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- tensorrt_llm/executor/proxy.py | 6 ++---- tensorrt_llm/executor/utils.py | 9 --------- 4 files changed, 6 insertions(+), 17 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index ef9bdbde637..522f39babf2 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -221,16 +221,16 @@ class LlmResponse: def __init__(self, request_id: int, - error: str = None, + error_msg: str = None, result: LlmResult = None, client_id: int = None): self.request_id = request_id - self.error = error + self.error_msg = error_msg self.result = result self.client_id = client_id def has_error(self): - return self.error is not None + return self.error_msg is not None class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 37ae2da1963..5b84cd7e373 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -1936,7 +1936,7 @@ def _handle_errors(self, error_msg: Optional[str] = None): self._terminate_request(request) error_responses[req_id] = LlmResponse( request_id=req_id, - error=error_msg, + error_msg=error_msg, client_id=request.py_client_id) self.active_requests.clear() self._enqueue_responses(error_responses) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 76cb2737c6e..4e54be21684 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -24,8 +24,7 @@ from .request import CancellingRequest, GenerationRequest from .result import GenerationResult, IterationResult from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs, - create_mpi_comm_session, get_spawn_proxy_process_env, - is_llm_response) + create_mpi_comm_session, get_spawn_proxy_process_env) from .worker import GenerationExecutorWorker, worker_main __all__ = [ @@ -172,8 +171,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) - - if (is_llm_response(res) and res.result.is_final) or isinstance( + if (hasattr(res, "result") and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index bb6466373f1..e7b9975a5df 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -8,7 +8,6 @@ from strenum import StrEnum from tensorrt_llm._utils import mpi_rank -from tensorrt_llm.bindings.executor import Response from tensorrt_llm.llmapi.utils import print_colored_debug from ..llmapi.mpi_session import (MpiCommSession, MpiPoolSession, MpiSession, @@ -141,11 +140,3 @@ class WorkerCommIpcAddrs(NamedTuple): result_queue_addr: tuple[str, 
Optional[bytes]] stats_queue_addr: tuple[str, Optional[bytes]] kv_cache_events_queue_addr: tuple[str, Optional[bytes]] - - -def is_llm_response(instance): - from tensorrt_llm._torch.pyexecutor.llm_request import \ - LlmResponse as PyLlmResponse - - from .result import ResponseWrapper - return isinstance(instance, (Response, PyLlmResponse, ResponseWrapper)) From 60ca761095701150ee7c842bfd0bafd655d28660 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 19:12:04 +0800 Subject: [PATCH 05/23] clean Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 5 +++-- tensorrt_llm/executor/result.py | 12 ++++++------ tensorrt_llm/executor/utils.py | 4 ++++ tensorrt_llm/executor/worker.py | 7 ++++--- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 4e54be21684..8f16031a000 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -24,7 +24,8 @@ from .request import CancellingRequest, GenerationRequest from .result import GenerationResult, IterationResult from .utils import (ErrorResponse, IntraProcessQueue, WorkerCommIpcAddrs, - create_mpi_comm_session, get_spawn_proxy_process_env) + create_mpi_comm_session, get_spawn_proxy_process_env, + is_llm_response) from .worker import GenerationExecutorWorker, worker_main __all__ = [ @@ -171,7 +172,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) - if (hasattr(res, "result") and res.result.is_final) or isinstance( + if (is_llm_response(res) and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) diff --git a/tensorrt_llm/executor/result.py b/tensorrt_llm/executor/result.py index acfdc007958..0f2e1581cae 100644 --- a/tensorrt_llm/executor/result.py +++ b/tensorrt_llm/executor/result.py @@ -16,7 +16,7 @@ from ..llmapi.tracer import global_tracer from ..llmapi.utils import AsyncQueue from ..sampling_params import LogprobParams, SamplingParams -from .utils import ErrorResponse, has_event_loop +from .utils import ErrorResponse, has_event_loop, is_llm_response if TYPE_CHECKING: from .executor import GenerationExecutor @@ -282,11 +282,7 @@ def _handle_response(self, if self._background_error_handler is not None and ( handler := self._background_error_handler()): handler(response.error) - elif isinstance(response, ErrorResponse): - if self._background_error_handler is not None and ( - handler := self._background_error_handler()): - handler(response.error_msg) - elif hasattr(response, "request_id"): + elif is_llm_response(response): if response.has_error(): if self._background_error_handler is not None and ( handler := self._background_error_handler()): @@ -322,6 +318,10 @@ def _handle_response(self, if self._background_error_handler and ( handler := self._background_error_handler()): handler() + elif isinstance(response, ErrorResponse): + if self._background_error_handler is not None and ( + handler := self._background_error_handler()): + handler(response.error_msg) else: raise ValueError(f"Unknown response type: {response}") diff --git a/tensorrt_llm/executor/utils.py b/tensorrt_llm/executor/utils.py index e7b9975a5df..fd4cd8444ec 100644 --- a/tensorrt_llm/executor/utils.py +++ b/tensorrt_llm/executor/utils.py @@ -140,3 +140,7 @@ class WorkerCommIpcAddrs(NamedTuple): result_queue_addr: tuple[str, Optional[bytes]] stats_queue_addr: tuple[str, Optional[bytes]] kv_cache_events_queue_addr: 
tuple[str, Optional[bytes]] + + +def is_llm_response(instance): + return hasattr(instance, "result") diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 0a421e4668a..5f057c65012 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -39,7 +39,7 @@ from .result import (GenerationResult, IterationResult, LogProbsResult, ResponseWrapper, compute_logprobs) from .utils import (ErrorResponse, IntraProcessQueue, RequestError, - WorkerCommIpcAddrs, has_event_loop) + WorkerCommIpcAddrs, has_event_loop, is_llm_response) __all__ = [ "GenerationExecutorWorker", @@ -994,8 +994,9 @@ def _send_rsp( # Eliminate the finished GenerationRequest instances timely, which may # take considerable memory. - if response.has_error() or response.result.is_final: - worker._pop_result(response.client_id) + if is_llm_response(response): + if response.has_error() or response.result.is_final: + worker._pop_result(response.client_id) elif isinstance(response, ErrorResponse): worker._pop_result(response.client_id) else: From 5f7e9ea233acdcd741c52d06ff6968600331a8fe Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:32:15 +0800 Subject: [PATCH 06/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index a722587b799..2cdb8a5becb 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -48,7 +48,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } /// Note that there is some dependency on the order of operations in this method. Modify with care! -executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) +executor::Result LlmRequest::createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From 1b3a7b7a3286956b18d188b5b1795d80d371b827 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:34:05 +0800 Subject: [PATCH 07/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 2cdb8a5becb..492b45f1d8c 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -48,7 +48,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } /// Note that there is some dependency on the order of operations in this method. Modify with care! 
-executor::Result LlmRequest::createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0) +executor::Result LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From df6007306b59862662494a2517924a1f5f17612d Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:39:35 +0800 Subject: [PATCH 08/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/include/tensorrt_llm/batch_manager/llmRequest.h | 2 +- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index dca20816dba..ac5b11d8822 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2328,7 +2328,7 @@ class LlmRequest : public GenericLlmRequest /// @return An optional Response std::optional createResponse(bool useFastLogits = false, int32_t mpiWorldRank = 0); - executor::Result createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + std::optional createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 492b45f1d8c..1c11688e9ea 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -42,13 +42,16 @@ template class GenericLlmRequest; std::optional LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank) { auto requestId = isChild() ? mParentRequestId : mRequestId; - auto response = executor::Response(requestId, std::move(createResult(useFastLogits, mpiWorldRank)), mClientId); - - return response; + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + return executor::Response(requestId, std::move(result), mClientId); + } + return std::nullopt; } /// Note that there is some dependency on the order of operations in this method. Modify with care! 
-executor::Result LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) +std::optional LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { TLLM_CHECK(!isDisaggContextCompleteState()); if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS))) From 91c904e85e3f50afb0f1ad1aabbea3fad1758f66 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:41:44 +0800 Subject: [PATCH 09/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 1c11688e9ea..702f2083752 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -45,7 +45,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) { - return executor::Response(requestId, std::move(result), mClientId); + return executor::Response(requestId, result, mClientId); } return std::nullopt; } From 5d62ceac9b53c522cb12d57c33ae21ab2c5bde6b Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 20:54:53 +0800 Subject: [PATCH 10/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 702f2083752..1a55482b07b 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -45,7 +45,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) { - return executor::Response(requestId, result, mClientId); + return executor::Response(requestId, result.value(), mClientId); } return std::nullopt; } From 50eb5b964acb634ebab93dc885b359b26ba07143 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:12:32 +0800 Subject: [PATCH 11/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 1a55482b07b..cfa10eb4056 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -203,6 +203,7 @@ std::optional LlmRequest::createResult(bool useFastLogits, int // Update position of last sent response setMaxSentTokenLen(maxNbTokens); + return result; } void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, From ea2f8cc9e4fd01cfa9a24538ccf38675a4648ac8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:20:45 +0800 Subject: [PATCH 12/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 5b84cd7e373..3470737f04a 100644 --- 
a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response: + if response and response.result: request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 431926d21274e2a2be7483b00b35a51d0e204671 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 21:27:19 +0800 Subject: [PATCH 13/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 3 +++ tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 522f39babf2..cecadd6e511 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -232,6 +232,9 @@ def __init__(self, def has_error(self): return self.error_msg is not None + def has_result(self): + return self.result._result + class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 3470737f04a..900aa711274 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response and response.result: + if response.has_result(): request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 3fb4d84591a09268932b08ff1e3a4290f2b0708c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Thu, 12 Jun 2025 23:10:53 +0800 Subject: [PATCH 14/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 11 +++-------- tensorrt_llm/_torch/pyexecutor/py_executor.py | 2 +- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index cecadd6e511..752305555ad 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -232,9 +232,6 @@ def __init__(self, def has_error(self): return self.error_msg is not None - def has_result(self): - return self.result._result - class LlmRequest(tensorrt_llm.bindings.internal.batch_manager.LlmRequest): """LlmRequest wraps `bindings.internal.batch_manager.LlmRequest` @@ -314,13 +311,11 @@ def create_response( self, use_fast_logits=False, mpi_world_rank=0) -> tensorrt_llm.bindings.executor.Response | None: + result = super().create_result(use_fast_logits, mpi_world_rank) return LlmResponse( request_id=self.py_request_id, - result=LlmResult( - super().create_result(use_fast_logits, mpi_world_rank), - self.py_result), - client_id=self.py_client_id, - ) + result=LlmResult(result, self.py_result), + client_id=self.py_client_id) if result is not None else None @property def is_dummy(self): diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py 
b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 900aa711274..5b84cd7e373 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2052,7 +2052,7 @@ def _handle_responses(self): request.decoding_iter = request.py_decoding_iter response = request.create_response(False, self.dist.rank) request_done = False - if response.has_result(): + if response: request_done = response.result.is_final new_responses.update({req_id: response}) if request_done: From 04370bad2ae9279aa8831e2c7ccf72aae2c570eb Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 00:07:21 +0800 Subject: [PATCH 15/23] polish Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/executor/proxy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py index 8f16031a000..76cb2737c6e 100644 --- a/tensorrt_llm/executor/proxy.py +++ b/tensorrt_llm/executor/proxy.py @@ -172,6 +172,7 @@ def process_res(res): event_loop = event_loop or queue.loop else: queue.put(res) + if (is_llm_response(res) and res.result.is_final) or isinstance( res, ErrorResponse): self._results.pop(client_id) From acd09a45940f4b70f5e61b12798ffe4b7f111204 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:37:14 +0800 Subject: [PATCH 16/23] expose createSerializedResult api Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- .../tensorrt_llm/batch_manager/llmRequest.h | 3 +++ cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 25 +++++++++++++++++++ .../pybind/batch_manager/bindings.cpp | 8 ++++++ 3 files changed, 36 insertions(+) diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index ac5b11d8822..d71b6e89f6a 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -2330,6 +2330,9 @@ class LlmRequest : public GenericLlmRequest std::optional createResult(bool useFastLogits = false, int32_t mpiWorldRank = 0); + void createSerializedResult( + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0); + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false); diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index cfa10eb4056..ea1cffa2545 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -16,6 +16,7 @@ */ #include "tensorrt_llm/batch_manager/llmRequest.h" +#include "tensorrt_llm/executor/serializeUtils.h" #include "tensorrt_llm/kernels/beamSearchKernels.h" namespace tensorrt_llm::batch_manager @@ -50,6 +51,21 @@ std::optional LlmRequest::createResponse(bool useFastLogits, return std::nullopt; } +void LlmRequest::createSerializedResult( + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0) +{ + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + std::ostringstream oStream; + executor::serialize_utils::serialize(result.value(), oStream); + auto str = oStream.str(); + serializedResult.resize(str.size()); + std::copy(serializedResult.begin(), str.begin(), str.end()); + isFinal = 
result.isFinal; + } +} + /// Note that there is some dependency on the order of operations in this method. Modify with care! std::optional LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank) { @@ -206,6 +222,15 @@ std::optional LlmRequest::createResult(bool useFastLogits, int return result; } +bool LlmRequest::createSerializedResult(std::string& serializedResult, bool useFastLogits, int32_t mpiWorldRank) +{ + auto result = createResult(useFastLogits, mpiWorldRank); + if (result.has_value()) + { + executor::serialize + } +} + void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen, bool enableKVCacheReuse) { diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index a3399e1833b..62ef664ac64 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -362,6 +362,14 @@ void initBindings(pybind11::module_& m) py::arg("mpi_world_rank") = 0) .def("create_result", &tb::LlmRequest::createResult, py::arg("use_fast_logits") = false, py::arg("mpi_world_rank") = 0) + .def("create_serialized_result", + [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) + { + std::vector serialized_result; + bool is_final = False; + self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); + return py::str(serialized_result.data(), serialized_result.size()), is_final; + }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) .def("finish_by_reason", &tb::LlmRequest::finishByReason, py::arg("finish_reason")); From 48a999ce2abe906d86751119ef11897821885c8c Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:38:49 +0800 Subject: [PATCH 17/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- tensorrt_llm/_torch/pyexecutor/llm_request.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py index 752305555ad..3ea0eb7ec46 100644 --- a/tensorrt_llm/_torch/pyexecutor/llm_request.py +++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py @@ -213,7 +213,8 @@ def __init__(self, result: tensorrt_llm.bindings.executor.Result, def __getattr__(self, item): if item in self.py_result_properties: return getattr(self._py_result, item) - return getattr(self._result, item) + result = object.__getattribute__(self, '_result') + return getattr(result, item) class LlmResponse: From 1e36d773786d98d17d5b6d8f172a6d738e27b74d Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:45:18 +0800 Subject: [PATCH 18/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index ea1cffa2545..56e2addf61c 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -52,7 +52,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } void LlmRequest::createSerializedResult( - std::vector& 
serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank = 0) + std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank) { auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) @@ -61,8 +61,8 @@ void LlmRequest::createSerializedResult( executor::serialize_utils::serialize(result.value(), oStream); auto str = oStream.str(); serializedResult.resize(str.size()); - std::copy(serializedResult.begin(), str.begin(), str.end()); - isFinal = result.isFinal; + std::copy(str.begin(), str.end(), serializedResult.begin()); + isFinal = result.value().isFinal; } } From 4fd71bc78730c7a26cc3759b2a231ae5d8d7ac62 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 12:47:36 +0800 Subject: [PATCH 19/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index 56e2addf61c..cd119dccf94 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -222,15 +222,6 @@ std::optional LlmRequest::createResult(bool useFastLogits, int return result; } -bool LlmRequest::createSerializedResult(std::string& serializedResult, bool useFastLogits, int32_t mpiWorldRank) -{ - auto result = createResult(useFastLogits, mpiWorldRank); - if (result.has_value()) - { - executor::serialize - } -} - void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen, bool enableKVCacheReuse) { From 1a4920ba5fdb345b6dd86f7159f765bd4fd6d693 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:38:20 +0800 Subject: [PATCH 20/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/batch_manager/llmRequest.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp index cd119dccf94..433f349b07d 100644 --- a/cpp/tensorrt_llm/batch_manager/llmRequest.cpp +++ b/cpp/tensorrt_llm/batch_manager/llmRequest.cpp @@ -52,7 +52,7 @@ std::optional LlmRequest::createResponse(bool useFastLogits, } void LlmRequest::createSerializedResult( - std::vector& serializedResult, bool& isFinal, bool useFastLogits = false, int32_t mpiWorldRank) + std::vector& serializedResult, bool& isFinal, bool useFastLogits, int32_t mpiWorldRank) { auto result = createResult(useFastLogits, mpiWorldRank); if (result.has_value()) From 5e9888a3e774c51a98eb3d478ca45b24b50c25d3 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:40:53 +0800 Subject: [PATCH 21/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 62ef664ac64..ab299af0bae 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -366,7 +366,7 @@ void initBindings(pybind11::module_& m) [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0) { 
std::vector serialized_result; - bool is_final = False; + bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); return py::str(serialized_result.data(), serialized_result.size()), is_final; }) From 9bfa834cafbdf7368c72e5aad9aed5b7eff604a8 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 13:49:17 +0800 Subject: [PATCH 22/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index ab299af0bae..e287550535a 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -36,6 +36,7 @@ #include #include #include +#include namespace py = pybind11; namespace tb = tensorrt_llm::batch_manager; @@ -368,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return py::str(serialized_result.data(), serialized_result.size()), is_final; + return std::make_tuple < (py::str(serialized_result.data(), serialized_result.size()), is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager")) From b37703cfade9c0d16f0f8107297d08d6b609bf24 Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:27:36 +0800 Subject: [PATCH 23/23] fix Signed-off-by: QI JUN <22017000+QiJune@users.noreply.github.com> --- cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index e287550535a..9f8f95e8ed7 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -369,7 +369,7 @@ void initBindings(pybind11::module_& m) std::vector serialized_result; bool is_final = false; self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank); - return std::make_tuple < (py::str(serialized_result.data(), serialized_result.size()), is_final); + return std::make_tuple(serialized_result, is_final); }) .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, py::arg("manager")) .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, py::arg("manager"))
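
A short usage sketch for context (illustrative only, not part of the patches above): after this series, PyExecutor.await_responses() yields the pure-Python LlmResponse objects defined in llm_request.py rather than bindings.executor.Response wrappers. The drain loop and the `executor` name below are assumptions for illustration; the LlmResponse/LlmResult attributes it touches (request_id, client_id, error_msg, has_error(), result.is_final) are the ones introduced or used in this series.

    def drain_responses(executor):
        # await_responses() now returns pure-Python LlmResponse objects
        # instead of bindings.executor.Response.
        for response in executor.await_responses():
            if response.has_error():
                # error_msg is populated by PyExecutor._handle_errors()
                raise RuntimeError(
                    f"request {response.request_id} failed: {response.error_msg}")
            result = response.result  # LlmResult: C++ Result plus PyResult facade
            if result.is_final:
                print(f"client {response.client_id} is done")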