2 changes: 2 additions & 0 deletions jenkins/L0_MergeRequest.groovy
@@ -705,6 +705,8 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
 "tensorrt_llm/_torch/pyexecutor/_util.py",
 "tensorrt_llm/_torch/pyexecutor/model_engine.py",
 "tensorrt_llm/_torch/pyexecutor/py_executor.py",
+"tensorrt_llm/evaluate/json_mode_eval.py",
+"tensorrt_llm/evaluate/mmlu.py",
 "tensorrt_llm/executor/",
 "tensorrt_llm/functional.py",
 "tensorrt_llm/llmapi/",
3 changes: 1 addition & 2 deletions tensorrt_llm/evaluate/json_mode_eval.py
@@ -64,8 +64,7 @@ def generate_samples(self) -> Iterable[tuple]:
 schema["x-guidance"] = {"lenient": True}
 schema = json.dumps(schema)
 sampling_args = {
-    "guided_decoding": GuidedDecodingParams(json=schema),
-    "temperature": 0,
+    "guided_decoding": GuidedDecodingParams(json=schema)
 }
 yield sample["prompt"], sampling_args, sample["completion"], sample[
     "schema"]
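The change above drops the hard-coded `temperature: 0` from the per-sample sampling args, so the evaluator now pins guided decoding only and leaves temperature to the caller. A minimal sketch of the resulting helper is below; the import path for `GuidedDecodingParams` and the standalone function shape are assumptions for illustration, not the actual module layout.

```python
import json

# Assumed import path; the diff shows GuidedDecodingParams in use but not its import.
from tensorrt_llm.sampling_params import GuidedDecodingParams


def build_sampling_args(schema: dict) -> dict:
    """Per-sample args after this change: guided decoding only, with no pinned
    temperature, so decoding settings stay under the caller's control."""
    schema = dict(schema)
    schema["x-guidance"] = {"lenient": True}
    return {"guided_decoding": GuidedDecodingParams(json=json.dumps(schema))}
```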
2 changes: 1 addition & 1 deletion tensorrt_llm/evaluate/mmlu.py
@@ -219,7 +219,7 @@ def generate_samples(self) -> Iterable[tuple]:
         include_answer=False)
     prompt = train_prompt + prompt_end
     label = test_df.iloc[i, test_df.shape[1] - 1]
-    yield prompt, {"temperature": 0}, label, subject
+    yield prompt, None, label, subject
 
 def compute_score(self, outputs: List[RequestOutput], references: List[str],
                   subjects: List[str]) -> float:
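With MMLU now yielding `None` as its sampling args, whatever defaults the harness or user configures apply unchanged instead of being forced to `temperature=0`. A minimal, hypothetical consumer loop illustrating that contract (the `generate` callable is a placeholder, not the tensorrt_llm API):

```python
from typing import Callable, Iterable, Optional, Tuple

Sample = Tuple[str, Optional[dict], str, str]  # (prompt, sampling_args, label, subject)


def run_samples(generate: Callable[[str, Optional[dict]], str],
                samples: Iterable[Sample]) -> list:
    results = []
    for prompt, sampling_args, label, subject in samples:
        # sampling_args may now be None; pass it through untouched so the
        # backend's own defaults (or user-supplied overrides) decide temperature.
        results.append((generate(prompt, sampling_args), label, subject))
    return results
```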
16 changes: 11 additions & 5 deletions tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -230,11 +230,17 @@ def send_request(prompt: str, sampling_params: SamplingParams,
                  streaming: bool):
     kwargs = {}
     if sampling_params is not None:
-        kwargs.update(max_tokens=sampling_params.max_tokens,
-                      temperature=sampling_params.temperature,
-                      top_p=sampling_params.top_p,
-                      stop=sampling_params.stop,
-                      seed=sampling_params.seed)
+        kwargs.update(
+            max_tokens=sampling_params.max_tokens,
+            # NB: 'LLM' (cf. SamplingParams) and OpenAI API
+            # defaults differ (top_p=0 vs. top_p=1).
+            # FIXME: Because 'LLM' does not permit expressly setting
+            # top_p=0, diverting to temperature=0.
+            temperature=(sampling_params.temperature
+                         if sampling_params.top_p is not None else 0),
+            top_p=sampling_params.top_p,
+            stop=sampling_params.stop,
+            seed=sampling_params.seed)
         if (guided_decoding_params :=
                 sampling_params.guided_decoding) is not None:
             extra_body = {}
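The NB/FIXME comments above capture the mapping rule: the LLM-side `SamplingParams` default (`top_p=0`) and the OpenAI API default (`top_p=1`) disagree, and since `top_p=0` cannot be sent explicitly, the request falls back to `temperature=0` whenever `top_p` is unset. A self-contained sketch of that rule, using a stand-in dataclass rather than the real `SamplingParams`:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class Params:  # stand-in for SamplingParams, illustration only
    max_tokens: int = 64
    temperature: float = 1.0
    top_p: Optional[float] = None
    stop: Optional[list] = None
    seed: Optional[int] = None


def to_openai_kwargs(p: Params) -> dict:
    return {
        "max_tokens": p.max_tokens,
        # When top_p is unset on the LLM side, force temperature=0 so the
        # OpenAI-side default of top_p=1 cannot reintroduce random sampling.
        "temperature": p.temperature if p.top_p is not None else 0,
        "top_p": p.top_p,
        "stop": p.stop,
        "seed": p.seed,
    }
```

With `Params()` defaults, `to_openai_kwargs` returns `temperature=0`, keeping the proxied request effectively greedy.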
3 changes: 0 additions & 3 deletions tests/unittest/llmapi/apps/_test_openai_misc.py
@@ -94,12 +94,9 @@ async def test_request_cancellation(server: RemoteOpenAIServer,
     # Request about 2 million tokens
     for _ in range(200):
         task = asyncio.create_task(
-            # FIXME: Some requests complete quickly without temperature=0,
-            # despite min_tokens being specified, cf. https://nvbugs/5513423
             client.chat.completions.create(messages=chat_input,
                                            model=model_name,
                                            max_tokens=10000,
-                                           temperature=0,
                                            extra_body={"min_tokens": 10000}))
         tasks.append(task)
 
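For context, the cancellation test now relies on `min_tokens` alone to keep requests long-lived. A rough sketch of the surrounding pattern; the task creation, cancellation, and cleanup choreography is inferred from the test's intent, and only the `create` call shown in the diff is verbatim:

```python
import asyncio


async def cancel_storm(client, model_name: str, chat_input, n: int = 200) -> None:
    # Fire many long requests as asyncio tasks, then cancel them and confirm
    # the server keeps serving afterwards.
    tasks = [
        asyncio.create_task(
            client.chat.completions.create(messages=chat_input,
                                           model=model_name,
                                           max_tokens=10000,
                                           extra_body={"min_tokens": 10000}))
        for _ in range(n)
    ]
    await asyncio.sleep(1)  # give the requests time to reach the server
    for task in tasks:
        task.cancel()       # client-side cancellation under test
    await asyncio.gather(*tasks, return_exceptions=True)
```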