From cbbf7f250d76033c91062b08426fa0cab5b5fab5 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Thu, 6 Nov 2025 14:45:36 -0800
Subject: [PATCH 1/9] feat: expose python api to save/remove/list
 ModelDeploymentCard

Signed-off-by: zhongdaor
---
 lib/bindings/python/rust/kserve_grpc.rs    | 25 +++++++++++++++++++++-
 lib/bindings/python/rust/llm/model_card.rs | 15 +++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/lib/bindings/python/rust/kserve_grpc.rs b/lib/bindings/python/rust/kserve_grpc.rs
index 9a0fdd7f40..d8f94c1630 100644
--- a/lib/bindings/python/rust/kserve_grpc.rs
+++ b/lib/bindings/python/rust/kserve_grpc.rs
@@ -5,7 +5,7 @@ use std::sync::Arc;
 
 use pyo3::prelude::*;
 
-use crate::{CancellationToken, engine::*, to_pyerr};
+use crate::{CancellationToken, engine::*, llm::model_card::ModelDeploymentCard, to_pyerr};
 
 pub use dynamo_llm::grpc::service::kserve;
 
@@ -102,6 +102,29 @@ impl KserveGrpcService {
         Ok(self.inner.model_manager().list_tensor_models())
     }
 
+    pub fn save_model_card(&self, key: String, card: ModelDeploymentCard) -> PyResult<()> {
+        self.inner
+            .model_manager()
+            .save_model_card(&key, card.inner)
+            .map_err(to_pyerr)
+    }
+
+    pub fn remove_model_card(&self, key: String) -> PyResult<Option<ModelDeploymentCard>> {
+        let card = self.inner.model_manager().remove_model_card(&key);
+        Ok(card.map(|inner| ModelDeploymentCard { inner }))
+    }
+
+    pub fn get_model_cards(&self) -> PyResult<Vec<ModelDeploymentCard>> {
+        let cards = self
+            .inner
+            .model_manager()
+            .get_model_cards()
+            .into_iter()
+            .map(|inner| ModelDeploymentCard { inner })
+            .collect();
+        Ok(cards)
+    }
+
     fn run<'p>(&self, py: Python<'p>, token: CancellationToken) -> PyResult<Bound<'p, PyAny>> {
         let service = self.inner.clone();
         pyo3_async_runtimes::tokio::future_into_py(py, async move {
diff --git a/lib/bindings/python/rust/llm/model_card.rs b/lib/bindings/python/rust/llm/model_card.rs
index 60446c0709..44ff68fb1d 100644
--- a/lib/bindings/python/rust/llm/model_card.rs
+++ b/lib/bindings/python/rust/llm/model_card.rs
@@ -6,7 +6,7 @@ use llm_rs::model_card::ModelDeploymentCard as RsModelDeploymentCard;
 
 #[pyclass]
 #[derive(Clone)]
-pub(crate) struct ModelDeploymentCard {
+pub struct ModelDeploymentCard {
     pub(crate) inner: RsModelDeploymentCard,
 }
 
@@ -14,7 +14,18 @@ impl ModelDeploymentCard {}
 
 #[pymethods]
 impl ModelDeploymentCard {
-    // Previously called "from_local_path"
+    /// Build an in-memory ModelDeploymentCard from a folder containing config.json,
+    /// tokenizer.json and tokenizer_config.json (i.e. a huggingface repo checkout).
+    ///
+    /// # Arguments
+    /// * `path` - Path to the local model directory
+    /// * `model_name` - Name of the model
+    ///
+    /// # Returns
+    /// A new ModelDeploymentCard instance
+    ///
+    /// # Errors
+    /// Returns an error if the model directory does not exist or the model name is invalid.
     #[staticmethod]
     fn load(path: String, model_name: String) -> PyResult<Self> {
         let mut card = RsModelDeploymentCard::load_from_disk(&path, None).map_err(to_pyerr)?;

From f3f6bc3d69b568316b6c16928eaa180a94bc21a1 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Thu, 6 Nov 2025 15:36:18 -0800
Subject: [PATCH 2/9] use runtime_config rather than ModelDeploymentCard

Signed-off-by: zhongdaor
---
 lib/bindings/python/rust/kserve_grpc.rs | 53 +++++++++++++------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/lib/bindings/python/rust/kserve_grpc.rs b/lib/bindings/python/rust/kserve_grpc.rs
index d8f94c1630..e2cd75ff7a 100644
--- a/lib/bindings/python/rust/kserve_grpc.rs
+++ b/lib/bindings/python/rust/kserve_grpc.rs
@@ -3,9 +3,12 @@
 
 use std::sync::Arc;
 
+use dynamo_llm::{self as llm_rs};
+use llm_rs::model_card::ModelDeploymentCard as RsModelDeploymentCard;
+use llm_rs::model_type::{ModelInput, ModelType};
 use pyo3::prelude::*;
 
-use crate::{CancellationToken, engine::*, llm::model_card::ModelDeploymentCard, to_pyerr};
+use crate::{CancellationToken, engine::*, llm::local_model::ModelRuntimeConfig, to_pyerr};
 
 pub use dynamo_llm::grpc::service::kserve;
 
@@ -56,12 +59,28 @@ impl KserveGrpcService {
             .map_err(to_pyerr)
     }
 
+    #[pyo3(signature = (model, checksum, engine, runtime_config=None))]
     pub fn add_tensor_model(
         &self,
         model: String,
         checksum: String,
         engine: PythonAsyncEngine,
+        runtime_config: Option<ModelRuntimeConfig>,
     ) -> PyResult<()> {
+        // If runtime_config is provided, create and save a ModelDeploymentCard
+        // so the ModelConfig endpoint can return model configuration
+        if let Some(runtime_config) = runtime_config {
+            let mut card = RsModelDeploymentCard::with_name_only(&model);
+            card.model_type = ModelType::TensorBased;
+            card.model_input = ModelInput::Tensor;
+            card.runtime_config = runtime_config.inner;
+
+            self.inner
+                .model_manager()
+                .save_model_card(&model, card)
+                .map_err(to_pyerr)?;
+        }
+
         let engine = Arc::new(engine);
         self.inner
             .model_manager()
@@ -84,10 +103,17 @@ impl KserveGrpcService {
     }
 
     pub fn remove_tensor_model(&self, model: String) -> PyResult<()> {
+        // Remove the engine
         self.inner
            .model_manager()
            .remove_tensor_model(&model)
-            .map_err(to_pyerr)
+            .map_err(to_pyerr)?;
+
+        // Also remove the model card if it exists
+        // (It's ok if it doesn't exist since runtime_config is optional, we just ignore the None return)
+        let _ = self.inner.model_manager().remove_model_card(&model);
+
+        Ok(())
     }
 
     pub fn list_chat_completions_models(&self) -> PyResult<Vec<String>> {
@@ -102,29 +128,6 @@ impl KserveGrpcService {
         Ok(self.inner.model_manager().list_tensor_models())
     }
 
-    pub fn save_model_card(&self, key: String, card: ModelDeploymentCard) -> PyResult<()> {
-        self.inner
-            .model_manager()
-            .save_model_card(&key, card.inner)
-            .map_err(to_pyerr)
-    }
-
-    pub fn remove_model_card(&self, key: String) -> PyResult<Option<ModelDeploymentCard>> {
-        let card = self.inner.model_manager().remove_model_card(&key);
-        Ok(card.map(|inner| ModelDeploymentCard { inner }))
-    }
-
-    pub fn get_model_cards(&self) -> PyResult<Vec<ModelDeploymentCard>> {
-        let cards = self
-            .inner
-            .model_manager()
-            .get_model_cards()
-            .into_iter()
-            .map(|inner| ModelDeploymentCard { inner })
-            .collect();
-        Ok(cards)
-    }
-
     fn run<'p>(&self, py: Python<'p>, token: CancellationToken) -> PyResult<Bound<'p, PyAny>> {
         let service = self.inner.clone();
         pyo3_async_runtimes::tokio::future_into_py(py, async move {

From a572af965307ef1f519dc7280f2df63738547ae1 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Thu, 6 Nov 2025 15:43:59 -0800
Subject: [PATCH 3/9] revert model_card.rs to main

Signed-off-by: zhongdaor
---
 lib/bindings/python/rust/llm/model_card.rs | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/lib/bindings/python/rust/llm/model_card.rs b/lib/bindings/python/rust/llm/model_card.rs
index 44ff68fb1d..60446c0709 100644
--- a/lib/bindings/python/rust/llm/model_card.rs
+++ b/lib/bindings/python/rust/llm/model_card.rs
@@ -6,7 +6,7 @@ use llm_rs::model_card::ModelDeploymentCard as RsModelDeploymentCard;
 
 #[pyclass]
 #[derive(Clone)]
-pub struct ModelDeploymentCard {
+pub(crate) struct ModelDeploymentCard {
     pub(crate) inner: RsModelDeploymentCard,
 }
 
@@ -14,18 +14,7 @@ impl ModelDeploymentCard {}
 
 #[pymethods]
 impl ModelDeploymentCard {
-    /// Build an in-memory ModelDeploymentCard from a folder containing config.json,
-    /// tokenizer.json and tokenizer_config.json (i.e. a huggingface repo checkout).
-    ///
-    /// # Arguments
-    /// * `path` - Path to the local model directory
-    /// * `model_name` - Name of the model
-    ///
-    /// # Returns
-    /// A new ModelDeploymentCard instance
-    ///
-    /// # Errors
-    /// Returns an error if the model directory does not exist or the model name is invalid.
+    // Previously called "from_local_path"
     #[staticmethod]
     fn load(path: String, model_name: String) -> PyResult<Self> {
         let mut card = RsModelDeploymentCard::load_from_disk(&path, None).map_err(to_pyerr)?;

From afd4c439115652163b27ee336e0b2f8cc16689c2 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Fri, 7 Nov 2025 00:14:57 -0800
Subject: [PATCH 4/9] add tests

Signed-off-by: zhongdaor
---
 lib/bindings/python/src/dynamo/_core.pyi      |   3 +
 lib/bindings/python/tests/test_kserve_grpc.py | 113 ++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 lib/bindings/python/tests/test_kserve_grpc.py

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
index 2364c8a0b5..e4383e6275 100644
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -12,6 +12,8 @@ from typing import (
     Tuple,
 )
 
+from click import Option
+
 from ._prometheus_names import prometheus_names
 
 # Import from specialized modules
@@ -894,6 +896,7 @@ class KserveGrpcService:
         model: str,
         checksum: str,
         engine: PythonAsyncEngine,
+        runtime_config: Optional[ModelRuntimeConfig], 
     ) -> None:
         """
         Register a tensor-based model with the service.
diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
new file mode 100644
index 0000000000..d4bb9ae59a
--- /dev/null
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -0,0 +1,113 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import asyncio
+import contextlib
+import socket
+from typing import Any, Optional
+
+import pytest
+import tritonclient.grpc as grpcclient
+import tritonclient.grpc.model_config_pb2 as mc
+from tritonclient.utils import InferenceServerException
+
+from dynamo.llm import KserveGrpcService, ModelRuntimeConfig, PythonAsyncEngine
+
+pytestmark = pytest.mark.pre_merge
+
+
+async def _fetch_model_config(
+    client: grpcclient.InferenceServerClient,
+    model_name: str,
+    retries: int = 30,
+) -> Any:
+    last_error: Optional[Exception] = None
+    for _ in range(retries):
+        try:
+            return await asyncio.to_thread(client.get_model_config, model_name)
+        except InferenceServerException as err:
+            last_error = err
+            await asyncio.sleep(0.1)
+    raise AssertionError(f"Unable to fetch model config for '{model_name}': {last_error}")
+
+
+class EchoTensorEngine:
+    """Minimal tensor engine stub for registering tensor models."""
+
+    def __init__(self, model_name: str):
+        self._model_name = model_name
+
+    def generate(self, request, context=None):
+        async def _generator():
+            yield {
+                "model": self._model_name,
+                "tensors": request.get("tensors", []),
+                "parameters": request.get("parameters", {}),
+            }
+
+        return _generator()
+
+
+@pytest.mark.asyncio
+async def test_model_config_uses_runtime_config(runtime):
+    """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
+    host = "127.0.0.1"
+    port = 8787
+    model_name = "tensor-config-model"
+    checksum = "dummy-mdcsum"
+
+    loop = asyncio.get_running_loop()
+    engine = PythonAsyncEngine(EchoTensorEngine(model_name).generate, loop)
+
+    service = KserveGrpcService(port=port, host=host)
+
+    tensor_config = {
+        "name": model_name,
+        "inputs": [
+            {"name": "input_text", "data_type": "Bytes", "shape": [-1]},
+            {"name": "control_flag", "data_type": "Bool", "shape": [1]},
+        ],
+        "outputs": [
+            {"name": "results", "data_type": "Bytes", "shape": [-1]},
+        ],
+    }
+    runtime_config = ModelRuntimeConfig()
+    runtime_config.set_tensor_model_config(tensor_config)
+
+    service.add_tensor_model(
+        model_name, checksum, engine, runtime_config=runtime_config
+    )
+
+    cancel_token = runtime.child_token()
+
+    async def _serve():
+        await service.run(cancel_token)
+
+    server_task = asyncio.create_task(_serve())
+
+    client: Optional[grpcclient.InferenceServerClient] = None
+    try:
+        await asyncio.sleep(1) # wait for the service to start
+        client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
+        response = await _fetch_model_config(client, model_name)
+
+        model_config = response.config
+        assert model_config.name == model_name
+        assert model_config.platform == "dynamo"
+        assert model_config.backend == "dynamo"
+
+        inputs = {spec.name: spec for spec in model_config.input}
+        assert list(inputs["input_text"].dims) == [-1]
+        assert inputs["input_text"].data_type == mc.TYPE_STRING
+        assert list(inputs["control_flag"].dims) == [1]
+        assert inputs["control_flag"].data_type == mc.TYPE_BOOL
+
+        outputs = {spec.name: spec for spec in model_config.output}
+        assert list(outputs["results"].dims) == [-1]
+        assert outputs["results"].data_type == mc.TYPE_STRING
+    finally:
+        if client is not None:
+            client.close()
+        cancel_token.cancel()
+        with contextlib.suppress(asyncio.TimeoutError, asyncio.CancelledError):
+            await asyncio.wait_for(server_task, timeout=5)
\ No newline at end of file

From dcc356a59ba4e9305ef7b13c1c7abbbe5ddd1784 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Fri, 7 Nov 2025 00:24:25 -0800
Subject: [PATCH 5/9] make test look better
Signed-off-by: zhongdaor
---
 lib/bindings/python/tests/test_kserve_grpc.py | 112 +++++++++++-------
 1 file changed, 70 insertions(+), 42 deletions(-)

diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
index d4bb9ae59a..1610a3eccf 100644
--- a/lib/bindings/python/tests/test_kserve_grpc.py
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -4,7 +4,8 @@
 import asyncio
 import contextlib
 import socket
-from typing import Any, Optional
+from contextlib import asynccontextmanager
+from typing import Any, AsyncIterator, Optional, Tuple
 
 import pytest
 import tritonclient.grpc as grpcclient
@@ -48,19 +49,46 @@ async def _generator():
         return _generator()
 
 
-@pytest.mark.asyncio
-async def test_model_config_uses_runtime_config(runtime):
-    """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
-    host = "127.0.0.1"
-    port = 8787
-    model_name = "tensor-config-model"
-    checksum = "dummy-mdcsum"
+@pytest.fixture
+def tensor_service(runtime):
+    @asynccontextmanager
+    async def _start(
+        model_name: str,
+        *,
+        runtime_config: Optional[ModelRuntimeConfig] = None,
+        checksum: str = "dummy-mdcsum",
+    ) -> AsyncIterator[Tuple[str, int]]:
+        host = "127.0.0.1"
+        port = 8787
+        loop = asyncio.get_running_loop()
+        engine = PythonAsyncEngine(EchoTensorEngine(model_name).generate, loop)
+        tensor_model_service = KserveGrpcService(port=port, host=host)
+
+        tensor_model_service.add_tensor_model(
+            model_name, checksum, engine, runtime_config=runtime_config
+        )
+
+        cancel_token = runtime.child_token()
 
-    loop = asyncio.get_running_loop()
-    engine = PythonAsyncEngine(EchoTensorEngine(model_name).generate, loop)
+        async def _serve():
+            await tensor_model_service.run(cancel_token)
+
+        server_task = asyncio.create_task(_serve())
+        try:
+            await asyncio.sleep(1) # wait for the service to start
+            yield host, port
+        finally:
+            cancel_token.cancel()
+            with contextlib.suppress(asyncio.TimeoutError, asyncio.CancelledError):
+                await asyncio.wait_for(server_task, timeout=5)
+
+    return _start
 
-    service = KserveGrpcService(port=port, host=host)
 
+@pytest.mark.asyncio
+async def test_model_config_uses_runtime_config(tensor_service):
+    """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
+    model_name = "tensor-config-model"
     tensor_config = {
         "name": model_name,
         "inputs": [
@@ -74,40 +102,40 @@ async def test_model_config_uses_runtime_config(runtime):
     }
     runtime_config = ModelRuntimeConfig()
     runtime_config.set_tensor_model_config(tensor_config)
 
-    service.add_tensor_model(
-        model_name, checksum, engine, runtime_config=runtime_config
-    )
+    async with tensor_service(model_name, runtime_config=runtime_config) as (host, port):
+        client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
+        try:
+            response = await _fetch_model_config(client, model_name)
+        finally:
+            client.close()
+
+    model_config = response.config
+    assert model_config.name == model_name
+    assert model_config.platform == "dynamo"
+    assert model_config.backend == "dynamo"
 
-    cancel_token = runtime.child_token()
+    inputs = {spec.name: spec for spec in model_config.input}
+    assert list(inputs["input_text"].dims) == [-1]
+    assert inputs["input_text"].data_type == mc.TYPE_STRING
+    assert list(inputs["control_flag"].dims) == [1]
+    assert inputs["control_flag"].data_type == mc.TYPE_BOOL
 
-    async def _serve():
-        await service.run(cancel_token)
+    outputs = {spec.name: spec for spec in model_config.output}
+    assert list(outputs["results"].dims) == [-1]
+    assert outputs["results"].data_type == mc.TYPE_STRING
 
-    server_task = asyncio.create_task(_serve())
-
-    client: Optional[grpcclient.InferenceServerClient] = None
-    try:
-        await asyncio.sleep(1) # wait for the service to start
+
+@pytest.mark.asyncio
+async def test_model_config_missing_runtime_config_errors(tensor_service):
+    """ModelConfig should return NOT_FOUND when no tensor runtime_config is saved."""
+    model_name = "tensor-config-missing"
+
+    async with tensor_service(model_name, runtime_config=None) as (host, port):
         client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
-        response = await _fetch_model_config(client, model_name)
-
-        model_config = response.config
-        assert model_config.name == model_name
-        assert model_config.platform == "dynamo"
-        assert model_config.backend == "dynamo"
-
-        inputs = {spec.name: spec for spec in model_config.input}
-        assert list(inputs["input_text"].dims) == [-1]
-        assert inputs["input_text"].data_type == mc.TYPE_STRING
-        assert list(inputs["control_flag"].dims) == [1]
-        assert inputs["control_flag"].data_type == mc.TYPE_BOOL
-
-        outputs = {spec.name: spec for spec in model_config.output}
-        assert list(outputs["results"].dims) == [-1]
-        assert outputs["results"].data_type == mc.TYPE_STRING
-    finally:
-        if client is not None:
+        try:
+            with pytest.raises(InferenceServerException) as excinfo:
+                await asyncio.to_thread(client.get_model_config, model_name)
+        finally:
             client.close()
-        cancel_token.cancel()
-        with contextlib.suppress(asyncio.TimeoutError, asyncio.CancelledError):
-            await asyncio.wait_for(server_task, timeout=5)
\ No newline at end of file
+
+    assert "not found" in str(excinfo.value).lower()

From 8c3f15a935a85d2902b34fff04893e64162369c8 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Fri, 7 Nov 2025 13:40:51 -0800
Subject: [PATCH 6/9] pre-commit

Signed-off-by: zhongdaor
---
 lib/bindings/python/src/dynamo/_core.pyi      |  3 +--
 lib/bindings/python/tests/test_kserve_grpc.py | 12 ++++++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
index e4383e6275..cd90d1f69b 100644
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -12,7 +12,6 @@ from typing import (
     Tuple,
 )
 
-from click import Option
 
 from ._prometheus_names import prometheus_names
 
 # Import from specialized modules
@@ -896,7 +895,7 @@ class KserveGrpcService:
         model: str,
         checksum: str,
         engine: PythonAsyncEngine,
-        runtime_config: Optional[ModelRuntimeConfig], 
+        runtime_config: Optional[ModelRuntimeConfig],
     ) -> None:
         """
         Register a tensor-based model with the service.
diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
index 1610a3eccf..7251572d17 100644
--- a/lib/bindings/python/tests/test_kserve_grpc.py
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -3,7 +3,6 @@
 
 import asyncio
 import contextlib
-import socket
 from contextlib import asynccontextmanager
 from typing import Any, AsyncIterator, Optional, Tuple
 
@@ -29,7 +28,9 @@
         except InferenceServerException as err:
             last_error = err
             await asyncio.sleep(0.1)
-    raise AssertionError(f"Unable to fetch model config for '{model_name}': {last_error}")
+    raise AssertionError(
+        f"Unable to fetch model config for '{model_name}': {last_error}"
+    )
 
 
 class EchoTensorEngine:
@@ -75,7 +76,7 @@ async def _serve():
 
         server_task = asyncio.create_task(_serve())
         try:
-            await asyncio.sleep(1) # wait for the service to start
+            await asyncio.sleep(1)  # wait for the service to start
             yield host, port
         finally:
             cancel_token.cancel()
@@ -102,7 +103,10 @@ async def test_model_config_uses_runtime_config(tensor_service):
     runtime_config = ModelRuntimeConfig()
     runtime_config.set_tensor_model_config(tensor_config)
 
-    async with tensor_service(model_name, runtime_config=runtime_config) as (host, port):
+    async with tensor_service(model_name, runtime_config=runtime_config) as (
+        host,
+        port,
+    ):
         client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
         try:
             response = await _fetch_model_config(client, model_name)

From c46c335a912a612cfbb67dc25dcb698843a6493c Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Fri, 7 Nov 2025 13:48:29 -0800
Subject: [PATCH 7/9] pre-commit

Signed-off-by: zhongdaor
---
 lib/bindings/python/src/dynamo/_core.pyi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
index cd90d1f69b..d812012822 100644
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -12,7 +12,6 @@ from typing import (
     Tuple,
 )
 
-
 from ._prometheus_names import prometheus_names
 
 # Import from specialized modules

From 44348b7af2c6ccf3f4de14b1c9574b603bc8d700 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Mon, 10 Nov 2025 12:41:51 -0800
Subject: [PATCH 8/9] import triton gRPC client in tests

hope to prevent CI errors, since gRPC can't survive a fork

Signed-off-by: zhongdaor
---
 lib/bindings/python/tests/test_kserve_grpc.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
index 7251572d17..001bd3633f 100644
--- a/lib/bindings/python/tests/test_kserve_grpc.py
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -7,7 +7,6 @@
 from typing import Any, AsyncIterator, Optional, Tuple
 
 import pytest
-import tritonclient.grpc as grpcclient
 import tritonclient.grpc.model_config_pb2 as mc
 from tritonclient.utils import InferenceServerException
 
@@ -17,7 +16,7 @@
 
 
 async def _fetch_model_config(
-    client: grpcclient.InferenceServerClient,
+    client,
     model_name: str,
     retries: int = 30,
 ) -> Any:
@@ -87,8 +86,10 @@
 
 
 @pytest.mark.asyncio
+@pytest.mark.forked
 async def test_model_config_uses_runtime_config(tensor_service):
     """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
+    import tritonclient.grpc as grpcclient
     model_name = "tensor-config-model"
     tensor_config = {
         "name": model_name,
@@ -130,9 +131,11 @@
 
 
 @pytest.mark.asyncio
+@pytest.mark.forked
 async def test_model_config_missing_runtime_config_errors(tensor_service):
     """ModelConfig should return NOT_FOUND when no tensor runtime_config is saved."""
     model_name = "tensor-config-missing"
+    import tritonclient.grpc as grpcclient
 
     async with tensor_service(model_name, runtime_config=None) as (host, port):
         client = grpcclient.InferenceServerClient(url=f"{host}:{port}")

From b4538a63a91184fcedd25644ca5963b43bbbb1a4 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Mon, 10 Nov 2025 12:42:53 -0800
Subject: [PATCH 9/9] pre-commit

Signed-off-by: zhongdaor
---
 lib/bindings/python/tests/test_kserve_grpc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
index 001bd3633f..81b84380c0 100644
--- a/lib/bindings/python/tests/test_kserve_grpc.py
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -90,6 +90,7 @@ async def _serve():
 async def test_model_config_uses_runtime_config(tensor_service):
     """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
     import tritonclient.grpc as grpcclient
+
     model_name = "tensor-config-model"
     tensor_config = {
         "name": model_name,
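
---

For reference, here is a minimal usage sketch of the Python API this series ends up with: `add_tensor_model` taking an optional `ModelRuntimeConfig` whose tensor model config backs the KServe ModelConfig endpoint. All names come from the patches above; the only assumption is that a Dynamo `runtime` object and an async generator-based engine callable are already available (as the `runtime` fixture and `EchoTensorEngine.generate` are in the tests), since constructing those is environment-specific and not shown here.

```python
# Sketch only: mirrors the flow exercised by test_model_config_uses_runtime_config.
# Assumes `runtime` (a Dynamo distributed runtime) and `engine_generator`
# (an async-generator-producing callable like EchoTensorEngine.generate)
# are provided by the caller.
import asyncio

from dynamo.llm import KserveGrpcService, ModelRuntimeConfig, PythonAsyncEngine


async def serve_tensor_model(runtime, engine_generator, model_name: str) -> None:
    loop = asyncio.get_running_loop()
    engine = PythonAsyncEngine(engine_generator, loop)

    # Describe the model's tensor signature; PATCH 2/9 stores this as a
    # ModelDeploymentCard so the gRPC ModelConfig endpoint can answer for it.
    runtime_config = ModelRuntimeConfig()
    runtime_config.set_tensor_model_config(
        {
            "name": model_name,
            "inputs": [{"name": "input_text", "data_type": "Bytes", "shape": [-1]}],
            "outputs": [{"name": "results", "data_type": "Bytes", "shape": [-1]}],
        }
    )

    service = KserveGrpcService(port=8787, host="127.0.0.1")
    # runtime_config is optional: without it, ModelConfig requests for this
    # model return NOT_FOUND (see test_model_config_missing_runtime_config_errors).
    service.add_tensor_model(
        model_name, "dummy-mdcsum", engine, runtime_config=runtime_config
    )

    # Serve until the runtime's cancellation token fires; remove_tensor_model
    # would also drop the saved model card, per PATCH 2/9.
    await service.run(runtime.child_token())
```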