From d7b9ded0a99889c33891801488b29b26b21a434d Mon Sep 17 00:00:00 2001
From: ixlmar <206748156+ixlmar@users.noreply.github.com>
Date: Wed, 1 Oct 2025 10:51:11 +0200
Subject: [PATCH 1/2] chore: update multi-GPU pipeline triggers

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
---
 jenkins/L0_MergeRequest.groovy | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index 1b5f224bc82..a1d5d93a391 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -705,6 +705,8 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tensorrt_llm/_torch/pyexecutor/_util.py",
         "tensorrt_llm/_torch/pyexecutor/model_engine.py",
         "tensorrt_llm/_torch/pyexecutor/py_executor.py",
+        "tensorrt_llm/evaluate/json_mode_eval.py",
+        "tensorrt_llm/evaluate/mmlu.py",
         "tensorrt_llm/executor/",
         "tensorrt_llm/functional.py",
         "tensorrt_llm/llmapi/",

From 9a123d62b9a0399815aa6bdb542d4fb7125adee4 Mon Sep 17 00:00:00 2001
From: ixlmar <206748156+ixlmar@users.noreply.github.com>
Date: Wed, 1 Oct 2025 09:27:36 -0700
Subject: [PATCH 2/2] test: do not explicitly pass temperature=0 to select
 greedy sampling

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
---
 tensorrt_llm/evaluate/json_mode_eval.py          |  3 +--
 tensorrt_llm/evaluate/mmlu.py                    |  2 +-
 .../defs/accuracy/test_disaggregated_serving.py  | 16 +++++++++++-----
 tests/unittest/llmapi/apps/_test_openai_misc.py  |  3 ---
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/tensorrt_llm/evaluate/json_mode_eval.py b/tensorrt_llm/evaluate/json_mode_eval.py
index 122cbd6e7e4..37360754e50 100644
--- a/tensorrt_llm/evaluate/json_mode_eval.py
+++ b/tensorrt_llm/evaluate/json_mode_eval.py
@@ -64,8 +64,7 @@ def generate_samples(self) -> Iterable[tuple]:
             schema["x-guidance"] = {"lenient": True}
             schema = json.dumps(schema)
             sampling_args = {
-                "guided_decoding": GuidedDecodingParams(json=schema),
-                "temperature": 0,
+                "guided_decoding": GuidedDecodingParams(json=schema)
             }
             yield sample["prompt"], sampling_args, sample["completion"], sample[
                 "schema"]
diff --git a/tensorrt_llm/evaluate/mmlu.py b/tensorrt_llm/evaluate/mmlu.py
index b3b3f4ee7cf..92d7ae1171a 100644
--- a/tensorrt_llm/evaluate/mmlu.py
+++ b/tensorrt_llm/evaluate/mmlu.py
@@ -219,7 +219,7 @@ def generate_samples(self) -> Iterable[tuple]:
                                                  include_answer=False)
                 prompt = train_prompt + prompt_end
                 label = test_df.iloc[i, test_df.shape[1] - 1]
-                yield prompt, {"temperature": 0}, label, subject
+                yield prompt, None, label, subject
 
     def compute_score(self, outputs: List[RequestOutput], references: List[str],
                       subjects: List[str]) -> float:
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 49c612872bf..9539f67a59c 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -230,11 +230,17 @@ def send_request(prompt: str, sampling_params: SamplingParams,
                      streaming: bool):
         kwargs = {}
         if sampling_params is not None:
-            kwargs.update(max_tokens=sampling_params.max_tokens,
-                          temperature=sampling_params.temperature,
-                          top_p=sampling_params.top_p,
-                          stop=sampling_params.stop,
-                          seed=sampling_params.seed)
+            kwargs.update(
+                max_tokens=sampling_params.max_tokens,
+                # NB: 'LLM' (cf. SamplingParams) and OpenAI API
+                #     defaults differ (top_p=0 vs. top_p=1).
+                # FIXME: Because 'LLM' does not permit expressly setting
+                #     top_p=0, diverting to temperature=0.
+                temperature=(sampling_params.temperature
+                             if sampling_params.top_p is not None else 0),
+                top_p=sampling_params.top_p,
+                stop=sampling_params.stop,
+                seed=sampling_params.seed)
         if (guided_decoding_params :=
                 sampling_params.guided_decoding) is not None:
             extra_body = {}
diff --git a/tests/unittest/llmapi/apps/_test_openai_misc.py b/tests/unittest/llmapi/apps/_test_openai_misc.py
index 7dcac12304a..8cc715389f3 100644
--- a/tests/unittest/llmapi/apps/_test_openai_misc.py
+++ b/tests/unittest/llmapi/apps/_test_openai_misc.py
@@ -94,12 +94,9 @@ async def test_request_cancellation(server: RemoteOpenAIServer,
     # Request about 2 million tokens
     for _ in range(200):
         task = asyncio.create_task(
-            # FIXME: Some requests complete quickly without temperature=0,
-            #        despite min_tokens being specified, cf. https://nvbugs/5513423
            client.chat.completions.create(messages=chat_input,
                                           model=model_name,
                                           max_tokens=10000,
-                                          temperature=0,
                                           extra_body={"min_tokens": 10000}))
         tasks.append(task)
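
Reviewer note (not part of the series): a minimal sketch of the flow the second
patch relies on. Once the evaluators stop injecting temperature=0 per sample,
greedy decoding has to come from the harness's own SamplingParams defaults.
The helper name make_sampling_params and the merging logic below are
assumptions for illustration only, not code from this series or from the
TensorRT-LLM evaluate harness.

    from typing import Optional

    # Documented TensorRT-LLM LLM API import; everything else here is hypothetical.
    from tensorrt_llm import SamplingParams


    def make_sampling_params(sample_args: Optional[dict],
                             max_tokens: int = 64) -> SamplingParams:
        """Merge optional per-sample overrides over evaluator defaults (assumed flow)."""
        kwargs: dict = {"max_tokens": max_tokens}
        if sample_args:
            # After this series, mmlu.py yields None and json_mode_eval.py yields
            # only {"guided_decoding": ...}; neither pins temperature any longer.
            kwargs.update(sample_args)
        return SamplingParams(**kwargs)


    # Greedy decoding is expected to come from the defaults rather than from an
    # explicit per-sample temperature=0 override.
    greedy_params = make_sampling_params(None)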