From d7b9ded0a99889c33891801488b29b26b21a434d Mon Sep 17 00:00:00 2001
From: ixlmar <206748156+ixlmar@users.noreply.github.com>
Date: Wed, 1 Oct 2025 10:51:11 +0200
Subject: [PATCH 1/2] chore: update multi-GPU pipeline triggers

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
---
 jenkins/L0_MergeRequest.groovy | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index 1b5f224bc82..a1d5d93a391 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -705,6 +705,8 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "tensorrt_llm/_torch/pyexecutor/_util.py",
         "tensorrt_llm/_torch/pyexecutor/model_engine.py",
         "tensorrt_llm/_torch/pyexecutor/py_executor.py",
+        "tensorrt_llm/evaluate/json_mode_eval.py",
+        "tensorrt_llm/evaluate/mmlu.py",
         "tensorrt_llm/executor/",
         "tensorrt_llm/functional.py",
         "tensorrt_llm/llmapi/",

From 9a123d62b9a0399815aa6bdb542d4fb7125adee4 Mon Sep 17 00:00:00 2001
From: ixlmar <206748156+ixlmar@users.noreply.github.com>
Date: Wed, 1 Oct 2025 09:27:36 -0700
Subject: [PATCH 2/2] test: do not explicitly pass temperature=0 to select
 greedy sampling

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
---
 tensorrt_llm/evaluate/json_mode_eval.py          |  3 +--
 tensorrt_llm/evaluate/mmlu.py                    |  2 +-
 .../defs/accuracy/test_disaggregated_serving.py  | 16 +++++++++++-----
 tests/unittest/llmapi/apps/_test_openai_misc.py  |  3 ---
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/tensorrt_llm/evaluate/json_mode_eval.py b/tensorrt_llm/evaluate/json_mode_eval.py
index 122cbd6e7e4..37360754e50 100644
--- a/tensorrt_llm/evaluate/json_mode_eval.py
+++ b/tensorrt_llm/evaluate/json_mode_eval.py
@@ -64,8 +64,7 @@ def generate_samples(self) -> Iterable[tuple]:
             schema["x-guidance"] = {"lenient": True}
             schema = json.dumps(schema)
             sampling_args = {
-                "guided_decoding": GuidedDecodingParams(json=schema),
-                "temperature": 0,
+                "guided_decoding": GuidedDecodingParams(json=schema)
             }
             yield sample["prompt"], sampling_args, sample["completion"], sample[
                 "schema"]
diff --git a/tensorrt_llm/evaluate/mmlu.py b/tensorrt_llm/evaluate/mmlu.py
index b3b3f4ee7cf..92d7ae1171a 100644
--- a/tensorrt_llm/evaluate/mmlu.py
+++ b/tensorrt_llm/evaluate/mmlu.py
@@ -219,7 +219,7 @@ def generate_samples(self) -> Iterable[tuple]:
                                                  include_answer=False)
                 prompt = train_prompt + prompt_end
                 label = test_df.iloc[i, test_df.shape[1] - 1]
-                yield prompt, {"temperature": 0}, label, subject
+                yield prompt, None, label, subject
 
     def compute_score(self, outputs: List[RequestOutput], references: List[str],
                       subjects: List[str]) -> float:
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 49c612872bf..9539f67a59c 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -230,11 +230,17 @@ def send_request(prompt: str, sampling_params: SamplingParams,
                      streaming: bool):
         kwargs = {}
         if sampling_params is not None:
-            kwargs.update(max_tokens=sampling_params.max_tokens,
-                          temperature=sampling_params.temperature,
-                          top_p=sampling_params.top_p,
-                          stop=sampling_params.stop,
-                          seed=sampling_params.seed)
+            kwargs.update(
+                max_tokens=sampling_params.max_tokens,
+                # NB: 'LLM' (cf. SamplingParams) and OpenAI API
+                #     defaults differ (top_p=0 vs. top_p=1).
+                # FIXME: Because 'LLM' does not permit expressly setting
+                #     top_p=0, diverting to temperature=0.
+                temperature=(sampling_params.temperature
+                             if sampling_params.top_p is not None else 0),
+                top_p=sampling_params.top_p,
+                stop=sampling_params.stop,
+                seed=sampling_params.seed)
         if (guided_decoding_params :=
                 sampling_params.guided_decoding) is not None:
             extra_body = {}
diff --git a/tests/unittest/llmapi/apps/_test_openai_misc.py b/tests/unittest/llmapi/apps/_test_openai_misc.py
index 7dcac12304a..8cc715389f3 100644
--- a/tests/unittest/llmapi/apps/_test_openai_misc.py
+++ b/tests/unittest/llmapi/apps/_test_openai_misc.py
@@ -94,12 +94,9 @@ async def test_request_cancellation(server: RemoteOpenAIServer,
     # Request about 2 million tokens
     for _ in range(200):
         task = asyncio.create_task(
-            # FIXME: Some requests complete quickly without temperature=0,
-            #        despite min_tokens being specified, cf. https://nvbugs/5513423
            client.chat.completions.create(messages=chat_input,
                                           model=model_name,
                                           max_tokens=10000,
-                                          temperature=0,
                                           extra_body={"min_tokens": 10000}))
         tasks.append(task)
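
Reviewer note (not part of the series): a minimal sketch of the flow the second
patch relies on. Once the evaluators stop injecting temperature=0 per sample,
greedy decoding has to come from the harness's own SamplingParams defaults.
The helper name make_sampling_params and the merging logic below are
assumptions for illustration only, not code from this series or from the
TensorRT-LLM evaluate harness.

    from typing import Optional

    # Documented TensorRT-LLM LLM API import; everything else here is hypothetical.
    from tensorrt_llm import SamplingParams


    def make_sampling_params(sample_args: Optional[dict],
                             max_tokens: int = 64) -> SamplingParams:
        """Merge optional per-sample overrides over evaluator defaults (assumed flow)."""
        kwargs: dict = {"max_tokens": max_tokens}
        if sample_args:
            # After this series, mmlu.py yields None and json_mode_eval.py yields
            # only {"guided_decoding": ...}; neither pins temperature any longer.
            kwargs.update(sample_args)
        return SamplingParams(**kwargs)


    # Greedy decoding is expected to come from the defaults rather than from an
    # explicit per-sample temperature=0 override.
    greedy_params = make_sampling_params(None)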