From cbbf7f250d76033c91062b08426fa0cab5b5fab5 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Thu, 6 Nov 2025 14:45:36 -0800
Subject: [PATCH 1/9] feat: expose python api to save/remove/list
 ModelDeploymentCard

Signed-off-by: zhongdaor
---
 lib/bindings/python/rust/kserve_grpc.rs    | 25 +++++++++++++++++++++-
 lib/bindings/python/rust/llm/model_card.rs | 15 +++++++++++--
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/lib/bindings/python/rust/kserve_grpc.rs b/lib/bindings/python/rust/kserve_grpc.rs
index 9a0fdd7f40..d8f94c1630 100644
--- a/lib/bindings/python/rust/kserve_grpc.rs
+++ b/lib/bindings/python/rust/kserve_grpc.rs
@@ -5,7 +5,7 @@ use std::sync::Arc;
 
 use pyo3::prelude::*;
 
-use crate::{CancellationToken, engine::*, to_pyerr};
+use crate::{CancellationToken, engine::*, llm::model_card::ModelDeploymentCard, to_pyerr};
 
 pub use dynamo_llm::grpc::service::kserve;
 
@@ -102,6 +102,29 @@ impl KserveGrpcService {
         Ok(self.inner.model_manager().list_tensor_models())
     }
 
+    pub fn save_model_card(&self, key: String, card: ModelDeploymentCard) -> PyResult<()> {
+        self.inner
+            .model_manager()
+            .save_model_card(&key, card.inner)
+            .map_err(to_pyerr)
+    }
+
+    pub fn remove_model_card(&self, key: String) -> PyResult<Option<ModelDeploymentCard>> {
+        let card = self.inner.model_manager().remove_model_card(&key);
+        Ok(card.map(|inner| ModelDeploymentCard { inner }))
+    }
+
+    pub fn get_model_cards(&self) -> PyResult<Vec<ModelDeploymentCard>> {
+        let cards = self
+            .inner
+            .model_manager()
+            .get_model_cards()
+            .into_iter()
+            .map(|inner| ModelDeploymentCard { inner })
+            .collect();
+        Ok(cards)
+    }
+
     fn run<'p>(&self, py: Python<'p>, token: CancellationToken) -> PyResult<Bound<'p, PyAny>> {
         let service = self.inner.clone();
         pyo3_async_runtimes::tokio::future_into_py(py, async move {
diff --git a/lib/bindings/python/rust/llm/model_card.rs b/lib/bindings/python/rust/llm/model_card.rs
index 60446c0709..44ff68fb1d 100644
--- a/lib/bindings/python/rust/llm/model_card.rs
+++ b/lib/bindings/python/rust/llm/model_card.rs
@@ -6,7 +6,7 @@ use llm_rs::model_card::ModelDeploymentCard as RsModelDeploymentCard;
 
 #[pyclass]
 #[derive(Clone)]
-pub(crate) struct ModelDeploymentCard {
+pub struct ModelDeploymentCard {
     pub(crate) inner: RsModelDeploymentCard,
 }
 
@@ -14,7 +14,18 @@ impl ModelDeploymentCard {}
 
 #[pymethods]
 impl ModelDeploymentCard {
-    // Previously called "from_local_path"
+    /// Build an in-memory ModelDeploymentCard from a folder containing config.json,
+    /// tokenizer.json and tokenizer_config.json (i.e. a huggingface repo checkout).
+    ///
+    /// # Arguments
+    /// * `path` - Path to the local model directory
+    /// * `model_name` - Name of the model
+    ///
+    /// # Returns
+    /// A new ModelDeploymentCard instance
+    ///
+    /// # Errors
+    /// Returns an error if the model directory does not exist or the model name is invalid.
     #[staticmethod]
     fn load(path: String, model_name: String) -> PyResult<Self> {
         let mut card = RsModelDeploymentCard::load_from_disk(&path, None).map_err(to_pyerr)?;

From f3f6bc3d69b568316b6c16928eaa180a94bc21a1 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Thu, 6 Nov 2025 15:36:18 -0800
Subject: [PATCH 2/9] use runtime_config rather than ModelDeploymentCard

Signed-off-by: zhongdaor
---
 lib/bindings/python/rust/kserve_grpc.rs | 53 +++++++++++++------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/lib/bindings/python/rust/kserve_grpc.rs b/lib/bindings/python/rust/kserve_grpc.rs
index d8f94c1630..e2cd75ff7a 100644
--- a/lib/bindings/python/rust/kserve_grpc.rs
+++ b/lib/bindings/python/rust/kserve_grpc.rs
@@ -3,9 +3,12 @@
 
 use std::sync::Arc;
 
+use dynamo_llm::{self as llm_rs};
+use llm_rs::model_card::ModelDeploymentCard as RsModelDeploymentCard;
+use llm_rs::model_type::{ModelInput, ModelType};
 use pyo3::prelude::*;
 
-use crate::{CancellationToken, engine::*, llm::model_card::ModelDeploymentCard, to_pyerr};
+use crate::{CancellationToken, engine::*, llm::local_model::ModelRuntimeConfig, to_pyerr};
 
 pub use dynamo_llm::grpc::service::kserve;
 
@@ -56,12 +59,28 @@ impl KserveGrpcService {
             .map_err(to_pyerr)
     }
 
+    #[pyo3(signature = (model, checksum, engine, runtime_config=None))]
     pub fn add_tensor_model(
         &self,
         model: String,
         checksum: String,
         engine: PythonAsyncEngine,
+        runtime_config: Option<ModelRuntimeConfig>,
     ) -> PyResult<()> {
+        // If runtime_config is provided, create and save a ModelDeploymentCard
+        // so the ModelConfig endpoint can return model configuration
+        if let Some(runtime_config) = runtime_config {
+            let mut card = RsModelDeploymentCard::with_name_only(&model);
+            card.model_type = ModelType::TensorBased;
+            card.model_input = ModelInput::Tensor;
+            card.runtime_config = runtime_config.inner;
+
+            self.inner
+                .model_manager()
+                .save_model_card(&model, card)
+                .map_err(to_pyerr)?;
+        }
+
         let engine = Arc::new(engine);
         self.inner
             .model_manager()
@@ -84,10 +103,17 @@ impl KserveGrpcService {
     }
 
     pub fn remove_tensor_model(&self, model: String) -> PyResult<()> {
+        // Remove the engine
         self.inner
            .model_manager()
            .remove_tensor_model(&model)
-            .map_err(to_pyerr)
+            .map_err(to_pyerr)?;
+
+        // Also remove the model card if it exists
+        // (It's ok if it doesn't exist since runtime_config is optional, we just ignore the None return)
+        let _ = self.inner.model_manager().remove_model_card(&model);
+
+        Ok(())
     }
 
     pub fn list_chat_completions_models(&self) -> PyResult<Vec<String>> {
@@ -102,29 +128,6 @@ impl KserveGrpcService {
         Ok(self.inner.model_manager().list_tensor_models())
     }
 
-    pub fn save_model_card(&self, key: String, card: ModelDeploymentCard) -> PyResult<()> {
-        self.inner
-            .model_manager()
-            .save_model_card(&key, card.inner)
-            .map_err(to_pyerr)
-    }
-
-    pub fn remove_model_card(&self, key: String) -> PyResult<Option<ModelDeploymentCard>> {
-        let card = self.inner.model_manager().remove_model_card(&key);
-        Ok(card.map(|inner| ModelDeploymentCard { inner }))
-    }
-
-    pub fn get_model_cards(&self) -> PyResult<Vec<ModelDeploymentCard>> {
-        let cards = self
-            .inner
-            .model_manager()
-            .get_model_cards()
-            .into_iter()
-            .map(|inner| ModelDeploymentCard { inner })
-            .collect();
-        Ok(cards)
-    }
-
     fn run<'p>(&self, py: Python<'p>, token: CancellationToken) -> PyResult<Bound<'p, PyAny>> {
         let service = self.inner.clone();
         pyo3_async_runtimes::tokio::future_into_py(py, async move {

From a572af965307ef1f519dc7280f2df63738547ae1 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Thu, 6 Nov 2025 15:43:59 -0800
Subject: [PATCH 3/9] revert model_card.rs to main

Signed-off-by: zhongdaor
---
 lib/bindings/python/rust/llm/model_card.rs | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/lib/bindings/python/rust/llm/model_card.rs b/lib/bindings/python/rust/llm/model_card.rs
index 44ff68fb1d..60446c0709 100644
--- a/lib/bindings/python/rust/llm/model_card.rs
+++ b/lib/bindings/python/rust/llm/model_card.rs
@@ -6,7 +6,7 @@ use llm_rs::model_card::ModelDeploymentCard as RsModelDeploymentCard;
 
 #[pyclass]
 #[derive(Clone)]
-pub struct ModelDeploymentCard {
+pub(crate) struct ModelDeploymentCard {
     pub(crate) inner: RsModelDeploymentCard,
 }
 
@@ -14,18 +14,7 @@ impl ModelDeploymentCard {}
 
 #[pymethods]
 impl ModelDeploymentCard {
-    /// Build an in-memory ModelDeploymentCard from a folder containing config.json,
-    /// tokenizer.json and tokenizer_config.json (i.e. a huggingface repo checkout).
-    ///
-    /// # Arguments
-    /// * `path` - Path to the local model directory
-    /// * `model_name` - Name of the model
-    ///
-    /// # Returns
-    /// A new ModelDeploymentCard instance
-    ///
-    /// # Errors
-    /// Returns an error if the model directory does not exist or the model name is invalid.
+    // Previously called "from_local_path"
     #[staticmethod]
     fn load(path: String, model_name: String) -> PyResult<Self> {
         let mut card = RsModelDeploymentCard::load_from_disk(&path, None).map_err(to_pyerr)?;

From afd4c439115652163b27ee336e0b2f8cc16689c2 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Fri, 7 Nov 2025 00:14:57 -0800
Subject: [PATCH 4/9] add tests

Signed-off-by: zhongdaor
---
 lib/bindings/python/src/dynamo/_core.pyi      |   3 +
 lib/bindings/python/tests/test_kserve_grpc.py | 113 ++++++++++++++++++
 2 files changed, 116 insertions(+)
 create mode 100644 lib/bindings/python/tests/test_kserve_grpc.py

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
index 2364c8a0b5..e4383e6275 100644
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -12,6 +12,8 @@ from typing import (
     Tuple,
 )
 
+from click import Option
+
 from ._prometheus_names import prometheus_names
 
 # Import from specialized modules
@@ -894,6 +896,7 @@ class KserveGrpcService:
         model: str,
         checksum: str,
         engine: PythonAsyncEngine,
+        runtime_config: Optional[ModelRuntimeConfig], 
     ) -> None:
         """
         Register a tensor-based model with the service.
diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
new file mode 100644
index 0000000000..d4bb9ae59a
--- /dev/null
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -0,0 +1,113 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import asyncio
+import contextlib
+import socket
+from typing import Any, Optional
+
+import pytest
+import tritonclient.grpc as grpcclient
+import tritonclient.grpc.model_config_pb2 as mc
+from tritonclient.utils import InferenceServerException
+
+from dynamo.llm import KserveGrpcService, ModelRuntimeConfig, PythonAsyncEngine
+
+pytestmark = pytest.mark.pre_merge
+
+
+async def _fetch_model_config(
+    client: grpcclient.InferenceServerClient,
+    model_name: str,
+    retries: int = 30,
+) -> Any:
+    last_error: Optional[Exception] = None
+    for _ in range(retries):
+        try:
+            return await asyncio.to_thread(client.get_model_config, model_name)
+        except InferenceServerException as err:
+            last_error = err
+            await asyncio.sleep(0.1)
+    raise AssertionError(f"Unable to fetch model config for '{model_name}': {last_error}")
+
+
+class EchoTensorEngine:
+    """Minimal tensor engine stub for registering tensor models."""
+
+    def __init__(self, model_name: str):
+        self._model_name = model_name
+
+    def generate(self, request, context=None):
+        async def _generator():
+            yield {
+                "model": self._model_name,
+                "tensors": request.get("tensors", []),
+                "parameters": request.get("parameters", {}),
+            }
+
+        return _generator()
+
+
+@pytest.mark.asyncio
+async def test_model_config_uses_runtime_config(runtime):
+    """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
+    host = "127.0.0.1"
+    port = 8787
+    model_name = "tensor-config-model"
+    checksum = "dummy-mdcsum"
+
+    loop = asyncio.get_running_loop()
+    engine = PythonAsyncEngine(EchoTensorEngine(model_name).generate, loop)
+
+    service = KserveGrpcService(port=port, host=host)
+
+    tensor_config = {
+        "name": model_name,
+        "inputs": [
+            {"name": "input_text", "data_type": "Bytes", "shape": [-1]},
+            {"name": "control_flag", "data_type": "Bool", "shape": [1]},
+        ],
+        "outputs": [
+            {"name": "results", "data_type": "Bytes", "shape": [-1]},
+        ],
+    }
+    runtime_config = ModelRuntimeConfig()
+    runtime_config.set_tensor_model_config(tensor_config)
+
+    service.add_tensor_model(
+        model_name, checksum, engine, runtime_config=runtime_config
+    )
+
+    cancel_token = runtime.child_token()
+
+    async def _serve():
+        await service.run(cancel_token)
+
+    server_task = asyncio.create_task(_serve())
+
+    client: Optional[grpcclient.InferenceServerClient] = None
+    try:
+        await asyncio.sleep(1) # wait for the service to start
+        client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
+        response = await _fetch_model_config(client, model_name)
+
+        model_config = response.config
+        assert model_config.name == model_name
+        assert model_config.platform == "dynamo"
+        assert model_config.backend == "dynamo"
+
+        inputs = {spec.name: spec for spec in model_config.input}
+        assert list(inputs["input_text"].dims) == [-1]
+        assert inputs["input_text"].data_type == mc.TYPE_STRING
+        assert list(inputs["control_flag"].dims) == [1]
+        assert inputs["control_flag"].data_type == mc.TYPE_BOOL
+
+        outputs = {spec.name: spec for spec in model_config.output}
+        assert list(outputs["results"].dims) == [-1]
+        assert outputs["results"].data_type == mc.TYPE_STRING
+    finally:
+        if client is not None:
+            client.close()
+        cancel_token.cancel()
+        with contextlib.suppress(asyncio.TimeoutError, asyncio.CancelledError):
+            await asyncio.wait_for(server_task, timeout=5)
\ No newline at end of file

From dcc356a59ba4e9305ef7b13c1c7abbbe5ddd1784 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Fri, 7 Nov 2025 00:24:25 -0800
Subject: [PATCH 5/9] make test look better
Signed-off-by: zhongdaor
---
 lib/bindings/python/tests/test_kserve_grpc.py | 112 +++++++++++-------
 1 file changed, 70 insertions(+), 42 deletions(-)

diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
index d4bb9ae59a..1610a3eccf 100644
--- a/lib/bindings/python/tests/test_kserve_grpc.py
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -4,7 +4,8 @@
 import asyncio
 import contextlib
 import socket
-from typing import Any, Optional
+from contextlib import asynccontextmanager
+from typing import Any, AsyncIterator, Optional, Tuple
 
 import pytest
 import tritonclient.grpc as grpcclient
@@ -48,19 +49,46 @@ async def _generator():
         return _generator()
 
 
-@pytest.mark.asyncio
-async def test_model_config_uses_runtime_config(runtime):
-    """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
-    host = "127.0.0.1"
-    port = 8787
-    model_name = "tensor-config-model"
-    checksum = "dummy-mdcsum"
+@pytest.fixture
+def tensor_service(runtime):
+    @asynccontextmanager
+    async def _start(
+        model_name: str,
+        *,
+        runtime_config: Optional[ModelRuntimeConfig] = None,
+        checksum: str = "dummy-mdcsum",
+    ) -> AsyncIterator[Tuple[str, int]]:
+        host = "127.0.0.1"
+        port = 8787
+        loop = asyncio.get_running_loop()
+        engine = PythonAsyncEngine(EchoTensorEngine(model_name).generate, loop)
+        tensor_model_service = KserveGrpcService(port=port, host=host)
+
+        tensor_model_service.add_tensor_model(
+            model_name, checksum, engine, runtime_config=runtime_config
+        )
+
+        cancel_token = runtime.child_token()
 
-    loop = asyncio.get_running_loop()
-    engine = PythonAsyncEngine(EchoTensorEngine(model_name).generate, loop)
+        async def _serve():
+            await tensor_model_service.run(cancel_token)
+
+        server_task = asyncio.create_task(_serve())
+        try:
+            await asyncio.sleep(1) # wait for the service to start
+            yield host, port
+        finally:
+            cancel_token.cancel()
+            with contextlib.suppress(asyncio.TimeoutError, asyncio.CancelledError):
+                await asyncio.wait_for(server_task, timeout=5)
+
+    return _start
 
-    service = KserveGrpcService(port=port, host=host)
 
+@pytest.mark.asyncio
+async def test_model_config_uses_runtime_config(tensor_service):
+    """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
+    model_name = "tensor-config-model"
     tensor_config = {
         "name": model_name,
         "inputs": [
@@ -74,40 +102,40 @@ async def test_model_config_uses_runtime_config(runtime):
     }
     runtime_config = ModelRuntimeConfig()
     runtime_config.set_tensor_model_config(tensor_config)
 
-    service.add_tensor_model(
-        model_name, checksum, engine, runtime_config=runtime_config
-    )
+    async with tensor_service(model_name, runtime_config=runtime_config) as (host, port):
+        client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
+        try:
+            response = await _fetch_model_config(client, model_name)
+        finally:
+            client.close()
+
+    model_config = response.config
+    assert model_config.name == model_name
+    assert model_config.platform == "dynamo"
+    assert model_config.backend == "dynamo"
 
-    cancel_token = runtime.child_token()
+    inputs = {spec.name: spec for spec in model_config.input}
+    assert list(inputs["input_text"].dims) == [-1]
+    assert inputs["input_text"].data_type == mc.TYPE_STRING
+    assert list(inputs["control_flag"].dims) == [1]
+    assert inputs["control_flag"].data_type == mc.TYPE_BOOL
 
-    async def _serve():
-        await service.run(cancel_token)
+    outputs = {spec.name: spec for spec in model_config.output}
+    assert list(outputs["results"].dims) == [-1]
+    assert outputs["results"].data_type == mc.TYPE_STRING
 
-    server_task = asyncio.create_task(_serve())
-
-    client: Optional[grpcclient.InferenceServerClient] = None
-    try:
-        await asyncio.sleep(1) # wait for the service to start
+
+@pytest.mark.asyncio
+async def test_model_config_missing_runtime_config_errors(tensor_service):
+    """ModelConfig should return NOT_FOUND when no tensor runtime_config is saved."""
+    model_name = "tensor-config-missing"
+
+    async with tensor_service(model_name, runtime_config=None) as (host, port):
         client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
-        response = await _fetch_model_config(client, model_name)
-
-        model_config = response.config
-        assert model_config.name == model_name
-        assert model_config.platform == "dynamo"
-        assert model_config.backend == "dynamo"
-
-        inputs = {spec.name: spec for spec in model_config.input}
-        assert list(inputs["input_text"].dims) == [-1]
-        assert inputs["input_text"].data_type == mc.TYPE_STRING
-        assert list(inputs["control_flag"].dims) == [1]
-        assert inputs["control_flag"].data_type == mc.TYPE_BOOL
-
-        outputs = {spec.name: spec for spec in model_config.output}
-        assert list(outputs["results"].dims) == [-1]
-        assert outputs["results"].data_type == mc.TYPE_STRING
-    finally:
-        if client is not None:
+        try:
+            with pytest.raises(InferenceServerException) as excinfo:
+                await asyncio.to_thread(client.get_model_config, model_name)
+        finally:
             client.close()
-        cancel_token.cancel()
-        with contextlib.suppress(asyncio.TimeoutError, asyncio.CancelledError):
-            await asyncio.wait_for(server_task, timeout=5)
\ No newline at end of file
+
+    assert "not found" in str(excinfo.value).lower()

From 8c3f15a935a85d2902b34fff04893e64162369c8 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Fri, 7 Nov 2025 13:40:51 -0800
Subject: [PATCH 6/9] pre-commit

Signed-off-by: zhongdaor
---
 lib/bindings/python/src/dynamo/_core.pyi      |  3 +--
 lib/bindings/python/tests/test_kserve_grpc.py | 12 ++++++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
index e4383e6275..cd90d1f69b 100644
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -12,7 +12,6 @@ from typing import (
     Tuple,
 )
 
-from click import Option
 
 from ._prometheus_names import prometheus_names
 
 # Import from specialized modules
@@ -896,7 +895,7 @@ class KserveGrpcService:
         model: str,
         checksum: str,
         engine: PythonAsyncEngine,
-        runtime_config: Optional[ModelRuntimeConfig], 
+        runtime_config: Optional[ModelRuntimeConfig],
     ) -> None:
         """
         Register a tensor-based model with the service.
diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
index 1610a3eccf..7251572d17 100644
--- a/lib/bindings/python/tests/test_kserve_grpc.py
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -3,7 +3,6 @@
 
 import asyncio
 import contextlib
-import socket
 from contextlib import asynccontextmanager
 from typing import Any, AsyncIterator, Optional, Tuple
 
@@ -29,7 +28,9 @@
         except InferenceServerException as err:
             last_error = err
             await asyncio.sleep(0.1)
-    raise AssertionError(f"Unable to fetch model config for '{model_name}': {last_error}")
+    raise AssertionError(
+        f"Unable to fetch model config for '{model_name}': {last_error}"
+    )
 
 
 class EchoTensorEngine:
@@ -75,7 +76,7 @@ async def _serve():
 
         server_task = asyncio.create_task(_serve())
         try:
-            await asyncio.sleep(1) # wait for the service to start
+            await asyncio.sleep(1)  # wait for the service to start
             yield host, port
         finally:
             cancel_token.cancel()
@@ -102,7 +103,10 @@ async def test_model_config_uses_runtime_config(tensor_service):
     runtime_config = ModelRuntimeConfig()
     runtime_config.set_tensor_model_config(tensor_config)
 
-    async with tensor_service(model_name, runtime_config=runtime_config) as (host, port):
+    async with tensor_service(model_name, runtime_config=runtime_config) as (
+        host,
+        port,
+    ):
         client = grpcclient.InferenceServerClient(url=f"{host}:{port}")
         try:
             response = await _fetch_model_config(client, model_name)

From c46c335a912a612cfbb67dc25dcb698843a6493c Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Fri, 7 Nov 2025 13:48:29 -0800
Subject: [PATCH 7/9] pre-commit

Signed-off-by: zhongdaor
---
 lib/bindings/python/src/dynamo/_core.pyi | 1 -
 1 file changed, 1 deletion(-)

diff --git a/lib/bindings/python/src/dynamo/_core.pyi b/lib/bindings/python/src/dynamo/_core.pyi
index cd90d1f69b..d812012822 100644
--- a/lib/bindings/python/src/dynamo/_core.pyi
+++ b/lib/bindings/python/src/dynamo/_core.pyi
@@ -12,7 +12,6 @@ from typing import (
     Tuple,
 )
 
-
 from ._prometheus_names import prometheus_names
 
 # Import from specialized modules

From 44348b7af2c6ccf3f4de14b1c9574b603bc8d700 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Mon, 10 Nov 2025 12:41:51 -0800
Subject: [PATCH 8/9] import triton gRPC client in tests

hope to prevent CI errors, since gRPC can't survive a fork

Signed-off-by: zhongdaor
---
 lib/bindings/python/tests/test_kserve_grpc.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
index 7251572d17..001bd3633f 100644
--- a/lib/bindings/python/tests/test_kserve_grpc.py
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -7,7 +7,6 @@
 from typing import Any, AsyncIterator, Optional, Tuple
 
 import pytest
-import tritonclient.grpc as grpcclient
 import tritonclient.grpc.model_config_pb2 as mc
 from tritonclient.utils import InferenceServerException
 
@@ -17,7 +16,7 @@
 
 
 async def _fetch_model_config(
-    client: grpcclient.InferenceServerClient,
+    client,
     model_name: str,
     retries: int = 30,
 ) -> Any:
@@ -87,8 +86,10 @@
 
 
 @pytest.mark.asyncio
+@pytest.mark.forked
 async def test_model_config_uses_runtime_config(tensor_service):
     """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
+    import tritonclient.grpc as grpcclient
     model_name = "tensor-config-model"
     tensor_config = {
         "name": model_name,
@@ -130,9 +131,11 @@
 
 
 @pytest.mark.asyncio
+@pytest.mark.forked
 async def test_model_config_missing_runtime_config_errors(tensor_service):
     """ModelConfig should return NOT_FOUND when no tensor runtime_config is saved."""
     model_name = "tensor-config-missing"
+    import tritonclient.grpc as grpcclient
 
     async with tensor_service(model_name, runtime_config=None) as (host, port):
         client = grpcclient.InferenceServerClient(url=f"{host}:{port}")

From b4538a63a91184fcedd25644ca5963b43bbbb1a4 Mon Sep 17 00:00:00 2001
From: zhongdaor
Date: Mon, 10 Nov 2025 12:42:53 -0800
Subject: [PATCH 9/9] pre-commit

Signed-off-by: zhongdaor
---
 lib/bindings/python/tests/test_kserve_grpc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/bindings/python/tests/test_kserve_grpc.py b/lib/bindings/python/tests/test_kserve_grpc.py
index 001bd3633f..81b84380c0 100644
--- a/lib/bindings/python/tests/test_kserve_grpc.py
+++ b/lib/bindings/python/tests/test_kserve_grpc.py
@@ -90,6 +90,7 @@ async def _serve():
 async def test_model_config_uses_runtime_config(tensor_service):
     """Ensure tensor runtime_config is returned via the ModelConfig endpoint."""
     import tritonclient.grpc as grpcclient
+
     model_name = "tensor-config-model"
     tensor_config = {
         "name": model_name,
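
---

For reference, here is a minimal usage sketch of the Python API this series ends up with: `add_tensor_model` taking an optional `ModelRuntimeConfig` whose tensor model config backs the KServe ModelConfig endpoint. All names come from the patches above; the only assumption is that a Dynamo `runtime` object and an async generator-based engine callable are already available (as the `runtime` fixture and `EchoTensorEngine.generate` are in the tests), since constructing those is environment-specific and not shown here.

```python
# Sketch only: mirrors the flow exercised by test_model_config_uses_runtime_config.
# Assumes `runtime` (a Dynamo distributed runtime) and `engine_generator`
# (an async-generator-producing callable like EchoTensorEngine.generate)
# are provided by the caller.
import asyncio

from dynamo.llm import KserveGrpcService, ModelRuntimeConfig, PythonAsyncEngine


async def serve_tensor_model(runtime, engine_generator, model_name: str) -> None:
    loop = asyncio.get_running_loop()
    engine = PythonAsyncEngine(engine_generator, loop)

    # Describe the model's tensor signature; PATCH 2/9 stores this as a
    # ModelDeploymentCard so the gRPC ModelConfig endpoint can answer for it.
    runtime_config = ModelRuntimeConfig()
    runtime_config.set_tensor_model_config(
        {
            "name": model_name,
            "inputs": [{"name": "input_text", "data_type": "Bytes", "shape": [-1]}],
            "outputs": [{"name": "results", "data_type": "Bytes", "shape": [-1]}],
        }
    )

    service = KserveGrpcService(port=8787, host="127.0.0.1")
    # runtime_config is optional: without it, ModelConfig requests for this
    # model return NOT_FOUND (see test_model_config_missing_runtime_config_errors).
    service.add_tensor_model(
        model_name, "dummy-mdcsum", engine, runtime_config=runtime_config
    )

    # Serve until the runtime's cancellation token fires; remove_tensor_model
    # would also drop the saved model card, per PATCH 2/9.
    await service.run(runtime.child_token())
```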