From 243d59f8fe090ca3a767188708b05927c3386240 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Sat, 28 Mar 2026 06:42:43 -0700
Subject: [PATCH 1/3] feat(inference): add timeout

---
 crates/openshell-cli/src/main.rs              | 14 ++++++++++-
 crates/openshell-cli/src/run.rs               | 21 ++++++++++++++--
 crates/openshell-router/src/backend.rs        |  3 ++-
 crates/openshell-router/src/config.rs         |  8 +++++++
 crates/openshell-router/src/lib.rs            |  3 ---
 crates/openshell-router/src/mock.rs           |  1 +
 .../tests/backend_integration.rs              |  6 +++++
 crates/openshell-sandbox/src/lib.rs           | 22 +++++++++++++++++
 .../tests/system_inference.rs                 |  3 +++
 crates/openshell-server/src/inference.rs      | 24 +++++++++++++++++--
 proto/inference.proto                         | 10 ++++++++
 11 files changed, 106 insertions(+), 9 deletions(-)
diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs
index 5de31c79c..310979579 100644
--- a/crates/openshell-cli/src/main.rs
+++ b/crates/openshell-cli/src/main.rs
@@ -937,6 +937,10 @@ enum InferenceCommands {
         /// Skip endpoint verification before saving the route.
         #[arg(long)]
         no_verify: bool,
+
+        /// Request timeout in seconds for inference calls (0 = default 60s).
+        #[arg(long, default_value_t = 0)]
+        timeout: u64,
     },
 
     /// Update gateway-level inference configuration (partial update).
@@ -957,6 +961,10 @@ enum InferenceCommands {
         /// Skip endpoint verification before saving the route.
         #[arg(long)]
         no_verify: bool,
+
+        /// Request timeout in seconds for inference calls (0 = default 60s, unchanged if omitted).
+        #[arg(long)]
+        timeout: Option<u64>,
     },
 
     /// Get gateway-level inference provider and model.
@@ -2026,10 +2034,12 @@ async fn main() -> Result<()> {
                     model,
                     system,
                     no_verify,
+                    timeout,
                 } => {
                     let route_name = if system { "sandbox-system" } else { "" };
                     run::gateway_inference_set(
-                        endpoint, &provider, &model, route_name, no_verify, &tls,
+                        endpoint, &provider, &model, route_name, no_verify, timeout,
+                        &tls,
                     )
                     .await?;
                 }
@@ -2038,6 +2048,7 @@ async fn main() -> Result<()> {
                     model,
                     system,
                     no_verify,
+                    timeout,
                 } => {
                     let route_name = if system { "sandbox-system" } else { "" };
                     run::gateway_inference_update(
@@ -2046,6 +2057,7 @@ async fn main() -> Result<()> {
                         model.as_deref(),
                         route_name,
                         no_verify,
+                        timeout,
                         &tls,
                     )
                     .await?;
diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index e32eec2a4..bab819137 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -3481,6 +3481,7 @@ pub async fn gateway_inference_set(
     model_id: &str,
     route_name: &str,
     no_verify: bool,
+    timeout_secs: u64,
     tls: &TlsOptions,
 ) -> Result<()> {
     let progress = if std::io::stdout().is_terminal() {
@@ -3504,6 +3505,7 @@ pub async fn gateway_inference_set(
             route_name: route_name.to_string(),
             verify: false,
             no_verify,
+            timeout_secs,
         })
         .await;
 
@@ -3525,6 +3527,7 @@ pub async fn gateway_inference_set(
     println!("  {} {}", "Provider:".dimmed(), configured.provider_name);
     println!("  {} {}", "Model:".dimmed(), configured.model_id);
     println!("  {} {}", "Version:".dimmed(), configured.version);
+    print_timeout(configured.timeout_secs);
     if configured.validation_performed {
         println!("  {}", "Validated Endpoints:".dimmed());
         for endpoint in configured.validated_endpoints {
@@ -3540,11 +3543,12 @@ pub async fn gateway_inference_update(
     model_id: Option<&str>,
     route_name: &str,
     no_verify: bool,
+    timeout_secs: Option<u64>,
     tls: &TlsOptions,
 ) -> Result<()> {
-    if provider_name.is_none() && model_id.is_none() {
+    if provider_name.is_none() && model_id.is_none() && timeout_secs.is_none() {
         return Err(miette::miette!(
-            "at least one of --provider or --model must be specified"
+            "at least one of --provider, --model, or --timeout must be specified"
         ));
     }
 
@@ -3561,6 +3565,7 @@ pub async fn gateway_inference_update(
 
     let provider = provider_name.unwrap_or(&current.provider_name);
     let model = model_id.unwrap_or(&current.model_id);
+    let timeout = timeout_secs.unwrap_or(current.timeout_secs);
 
     let progress = if std::io::stdout().is_terminal() {
         let spinner = ProgressBar::new_spinner();
@@ -3582,6 +3587,7 @@ pub async fn gateway_inference_update(
             route_name: route_name.to_string(),
             verify: false,
             no_verify,
+            timeout_secs: timeout,
         })
         .await;
 
@@ -3603,6 +3609,7 @@ pub async fn gateway_inference_update(
     println!("  {} {}", "Provider:".dimmed(), configured.provider_name);
     println!("  {} {}", "Model:".dimmed(), configured.model_id);
     println!("  {} {}", "Version:".dimmed(), configured.version);
+    print_timeout(configured.timeout_secs);
     if configured.validation_performed {
         println!("  {}", "Validated Endpoints:".dimmed());
         for endpoint in configured.validated_endpoints {
@@ -3639,6 +3646,7 @@ pub async fn gateway_inference_get(
         println!("  {} {}", "Provider:".dimmed(), configured.provider_name);
         println!("  {} {}", "Model:".dimmed(), configured.model_id);
         println!("  {} {}", "Version:".dimmed(), configured.version);
+        print_timeout(configured.timeout_secs);
     } else {
         // Show both routes by default.
         print_inference_route(&mut client, "Gateway inference", "").await;
@@ -3666,6 +3674,7 @@ async fn print_inference_route(
             println!("  {} {}", "Provider:".dimmed(), configured.provider_name);
             println!("  {} {}", "Model:".dimmed(), configured.model_id);
             println!("  {} {}", "Version:".dimmed(), configured.version);
+            print_timeout(configured.timeout_secs);
         }
         Err(e) if e.code() == Code::NotFound => {
             println!("{}", format!("{label}:").cyan().bold());
@@ -3680,6 +3689,14 @@ async fn print_inference_route(
     }
 }
 
+/// Prints the indented `Timeout:` summary line; `0` is reported as the 60s default.
+fn print_timeout(timeout_secs: u64) {
+    match timeout_secs {
+        0 => println!("  {} {}s (default)", "Timeout:".dimmed(), 60),
+        secs => println!("  {} {}s", "Timeout:".dimmed(), secs),
+    }
+}
+
 fn format_inference_status(status: Status) -> miette::Report {
     let message = status.message().trim();
 
diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs
index d82ea082c..d1d7092c0 100644
--- a/crates/openshell-router/src/backend.rs
+++ b/crates/openshell-router/src/backend.rs
@@ -149,7 +149,7 @@ async fn send_backend_request(
         }
         Err(_) => body,
     };
-    builder = builder.body(body);
+    builder = builder.body(body).timeout(route.timeout);
 
     builder.send().await.map_err(|e| {
         if e.is_timeout() {
@@ -468,6 +468,7 @@ mod tests {
             protocols: protocols.iter().map(|p| (*p).to_string()).collect(),
             auth,
             default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())],
+            timeout: crate::config::DEFAULT_ROUTE_TIMEOUT,
         }
     }
 
diff --git a/crates/openshell-router/src/config.rs b/crates/openshell-router/src/config.rs
index d9c081d60..52c22da9f 100644
--- a/crates/openshell-router/src/config.rs
+++ b/crates/openshell-router/src/config.rs
@@ -3,11 +3,14 @@
 
 use serde::Deserialize;
 use std::path::Path;
+use std::time::Duration;
 
 pub use openshell_core::inference::AuthHeader;
 
 use crate::RouterError;
 
+pub const DEFAULT_ROUTE_TIMEOUT: Duration = Duration::from_secs(60);
+
 #[derive(Debug, Clone, Deserialize)]
 pub struct RouterConfig {
     pub routes: Vec<RouteConfig>,
@@ -45,6 +48,8 @@ pub struct ResolvedRoute {
     pub auth: AuthHeader,
     /// Extra headers injected on every request (e.g. `anthropic-version`).
     pub default_headers: Vec<(String, String)>,
+    /// Per-request timeout for proxied inference calls.
+    pub timeout: Duration,
 }
 
 impl std::fmt::Debug for ResolvedRoute {
@@ -57,6 +62,7 @@ impl std::fmt::Debug for ResolvedRoute {
             .field("protocols", &self.protocols)
             .field("auth", &self.auth)
             .field("default_headers", &self.default_headers)
+            .field("timeout", &self.timeout)
             .finish()
     }
 }
@@ -129,6 +135,7 @@ impl RouteConfig {
             protocols,
             auth,
             default_headers,
+            timeout: DEFAULT_ROUTE_TIMEOUT,
         })
     }
 }
@@ -256,6 +263,7 @@ routes:
             protocols: vec!["openai_chat_completions".to_string()],
             auth: AuthHeader::Bearer,
             default_headers: Vec::new(),
+            timeout: DEFAULT_ROUTE_TIMEOUT,
         };
         let debug_output = format!("{route:?}");
         assert!(
diff --git a/crates/openshell-router/src/lib.rs b/crates/openshell-router/src/lib.rs
index a5712d9a0..7deed6fc4 100644
--- a/crates/openshell-router/src/lib.rs
+++ b/crates/openshell-router/src/lib.rs
@@ -5,8 +5,6 @@ mod backend;
 pub mod config;
 mod mock;
 
-use std::time::Duration;
-
 pub use backend::{
     ProxyResponse, StreamingProxyResponse, ValidatedEndpoint, ValidationFailure,
     ValidationFailureKind, verify_backend_endpoint,
@@ -39,7 +37,6 @@ pub struct Router {
 impl Router {
     pub fn new() -> Result<Self, RouterError> {
         let client = reqwest::Client::builder()
-            .timeout(Duration::from_secs(60))
             .build()
             .map_err(|e| RouterError::Internal(format!("failed to build HTTP client: {e}")))?;
         Ok(Self {
diff --git a/crates/openshell-router/src/mock.rs b/crates/openshell-router/src/mock.rs
index 9b6accb60..a17ce486f 100644
--- a/crates/openshell-router/src/mock.rs
+++ b/crates/openshell-router/src/mock.rs
@@ -131,6 +131,7 @@ mod tests {
             protocols: protocols.iter().map(ToString::to_string).collect(),
             auth: crate::config::AuthHeader::Bearer,
             default_headers: Vec::new(),
+            timeout: crate::config::DEFAULT_ROUTE_TIMEOUT,
         }
     }
 
diff --git a/crates/openshell-router/tests/backend_integration.rs b/crates/openshell-router/tests/backend_integration.rs
index 4861bd6d0..571964aa8 100644
--- a/crates/openshell-router/tests/backend_integration.rs
+++ b/crates/openshell-router/tests/backend_integration.rs
@@ -15,6 +15,7 @@ fn mock_candidates(base_url: &str) -> Vec<ResolvedRoute> {
         protocols: vec!["openai_chat_completions".to_string()],
         auth: AuthHeader::Bearer,
         default_headers: Vec::new(),
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     }]
 }
 
@@ -117,6 +118,7 @@ async fn proxy_no_compatible_route_returns_error() {
         protocols: vec!["anthropic_messages".to_string()],
         auth: AuthHeader::Custom("x-api-key"),
         default_headers: Vec::new(),
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     }];
 
     let err = router
@@ -178,6 +180,7 @@ async fn proxy_mock_route_returns_canned_response() {
         protocols: vec!["openai_chat_completions".to_string()],
         auth: AuthHeader::Bearer,
         default_headers: Vec::new(),
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     }];
 
     let body = serde_json::to_vec(&serde_json::json!({
@@ -312,6 +315,7 @@ async fn proxy_uses_x_api_key_for_anthropic_route() {
         protocols: vec!["anthropic_messages".to_string()],
         auth: AuthHeader::Custom("x-api-key"),
         default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())],
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     }];
 
     let body = serde_json::to_vec(&serde_json::json!({
@@ -370,6 +374,7 @@ async fn proxy_anthropic_does_not_send_bearer_auth() {
         protocols: vec!["anthropic_messages".to_string()],
         auth: AuthHeader::Custom("x-api-key"),
         default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())],
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     }];
 
     let response = router
@@ -414,6 +419,7 @@ async fn proxy_forwards_client_anthropic_version_header() {
         protocols: vec!["anthropic_messages".to_string()],
         auth: AuthHeader::Custom("x-api-key"),
         default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())],
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     }];
 
     let body = serde_json::to_vec(&serde_json::json!({
diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs
index 493e4d237..a347de13f 100644
--- a/crates/openshell-sandbox/src/lib.rs
+++ b/crates/openshell-sandbox/src/lib.rs
@@ -801,6 +801,11 @@ pub(crate) fn bundle_to_resolved_routes(
         .map(|r| {
             let (auth, default_headers) =
                 openshell_core::inference::auth_for_provider_type(&r.provider_type);
+            let timeout = if r.timeout_secs == 0 {
+                openshell_router::config::DEFAULT_ROUTE_TIMEOUT
+            } else {
+                Duration::from_secs(r.timeout_secs)
+            };
             openshell_router::config::ResolvedRoute {
                 name: r.name.clone(),
                 endpoint: r.base_url.clone(),
@@ -809,6 +814,7 @@ pub(crate) fn bundle_to_resolved_routes(
                 protocols: r.protocols.clone(),
                 auth,
                 default_headers,
+                timeout,
             }
         })
         .collect()
@@ -1482,6 +1488,7 @@ mod tests {
                         "openai_responses".to_string(),
                     ],
                     provider_type: "openai".to_string(),
+                    timeout_secs: 0,
                 },
                 openshell_core::proto::ResolvedRoute {
                     name: "local".to_string(),
@@ -1490,6 +1497,7 @@ mod tests {
                     model_id: "llama-3".to_string(),
                     protocols: vec!["openai_chat_completions".to_string()],
                     provider_type: String::new(),
+                    timeout_secs: 120,
                 },
             ],
             revision: "abc123".to_string(),
@@ -1510,11 +1518,21 @@ mod tests {
             routes[0].protocols,
             vec!["openai_chat_completions", "openai_responses"]
         );
+        assert_eq!(
+            routes[0].timeout,
+            openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
+            "timeout_secs=0 should map to default"
+        );
         assert_eq!(routes[1].endpoint, "http://vllm:8000/v1");
         assert_eq!(
             routes[1].auth,
             openshell_core::inference::AuthHeader::Bearer
         );
+        assert_eq!(
+            routes[1].timeout,
+            Duration::from_secs(120),
+            "timeout_secs=120 should map to 120s"
+        );
     }
 
     #[test]
@@ -1539,6 +1557,7 @@ mod tests {
                 model_id: "model".to_string(),
                 protocols: vec!["openai_chat_completions".to_string()],
                 provider_type: "openai".to_string(),
+                timeout_secs: 0,
             }],
             revision: "rev".to_string(),
             generated_at_ms: 0,
@@ -1559,6 +1578,7 @@ mod tests {
                 protocols: vec!["openai_chat_completions".to_string()],
                 auth: openshell_core::inference::AuthHeader::Bearer,
                 default_headers: vec![],
+                timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
             },
             openshell_router::config::ResolvedRoute {
                 name: "sandbox-system".to_string(),
@@ -1568,6 +1588,7 @@ mod tests {
                 protocols: vec!["anthropic_messages".to_string()],
                 auth: openshell_core::inference::AuthHeader::Custom("x-api-key"),
                 default_headers: vec![],
+                timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
             },
         ];
 
@@ -1856,6 +1877,7 @@ filesystem_policy:
             auth: openshell_core::inference::AuthHeader::Bearer,
             protocols: vec!["openai_chat_completions".to_string()],
             default_headers: vec![],
+            timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
         }];
 
         let cache = Arc::new(RwLock::new(routes));
diff --git a/crates/openshell-sandbox/tests/system_inference.rs b/crates/openshell-sandbox/tests/system_inference.rs
index 3f6a471e5..5d581fbe2 100644
--- a/crates/openshell-sandbox/tests/system_inference.rs
+++ b/crates/openshell-sandbox/tests/system_inference.rs
@@ -20,6 +20,7 @@ fn make_system_route() -> ResolvedRoute {
         protocols: vec!["openai_chat_completions".to_string()],
         auth: AuthHeader::Bearer,
         default_headers: Vec::new(),
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     }
 }
 
@@ -32,6 +33,7 @@ fn make_user_route() -> ResolvedRoute {
         protocols: vec!["openai_chat_completions".to_string()],
         auth: AuthHeader::Bearer,
         default_headers: Vec::new(),
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     }
 }
 
@@ -124,6 +126,7 @@ async fn system_inference_with_anthropic_protocol() {
         protocols: vec!["anthropic_messages".to_string()],
         auth: AuthHeader::Custom("x-api-key"),
         default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())],
+        timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
     };
 
     let ctx = InferenceContext::new(patterns, router, vec![], vec![system_route]);
diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs
index bbabaf70b..e5cda5f79 100644
--- a/crates/openshell-server/src/inference.rs
+++ b/crates/openshell-server/src/inference.rs
@@ -86,6 +86,7 @@ impl Inference for InferenceService {
             route_name,
             &req.provider_name,
             &req.model_id,
+            req.timeout_secs,
             verify,
         )
         .await?;
@@ -103,6 +104,7 @@ impl Inference for InferenceService {
             route_name: route_name.to_string(),
             validation_performed: !route.validation.is_empty(),
             validated_endpoints: route.validation,
+            timeout_secs: config.timeout_secs,
         }))
     }
 
@@ -140,6 +142,7 @@ impl Inference for InferenceService {
             model_id: config.model_id.clone(),
             version: route.version,
             route_name: route_name.to_string(),
+            timeout_secs: config.timeout_secs,
         }))
     }
 }
@@ -149,6 +152,7 @@ async fn upsert_cluster_inference_route(
     route_name: &str,
     provider_name: &str,
     model_id: &str,
+    timeout_secs: u64,
     verify: bool,
 ) -> Result<UpsertedInferenceRoute, Status> {
     if provider_name.trim().is_empty() {
@@ -173,7 +177,7 @@ async fn upsert_cluster_inference_route(
         Vec::new()
     };
 
-    let config = build_cluster_inference_config(&provider, model_id);
+    let config = build_cluster_inference_config(&provider, model_id, timeout_secs);
 
     let existing = store
         .get_message_by_name::<InferenceRoute>(route_name)
@@ -204,10 +208,15 @@ async fn upsert_cluster_inference_route(
     Ok(UpsertedInferenceRoute { route, validation })
 }
 
-fn build_cluster_inference_config(provider: &Provider, model_id: &str) -> ClusterInferenceConfig {
+fn build_cluster_inference_config(
+    provider: &Provider,
+    model_id: &str,
+    timeout_secs: u64,
+) -> ClusterInferenceConfig {
     ClusterInferenceConfig {
         provider_name: provider.name.clone(),
         model_id: model_id.to_string(),
+        timeout_secs,
     }
 }
 
@@ -267,6 +276,7 @@ fn resolve_provider_route(provider: &Provider) -> Result<ResolvedProviderRoute,
                 .iter()
                 .map(|(name, value)| ((*name).to_string(), (*value).to_string()))
                 .collect(),
+            timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT,
         },
     })
 }
@@ -454,6 +464,7 @@ async fn resolve_route_by_name(
         api_key: resolved.route.api_key,
         protocols: resolved.route.protocols,
         provider_type: resolved.provider_type,
+        timeout_secs: config.timeout_secs,
     }))
 }
 
@@ -470,6 +481,7 @@ mod tests {
             config: Some(ClusterInferenceConfig {
                 provider_name: provider_name.to_string(),
                 model_id: model_id.to_string(),
+                timeout_secs: 0,
             }),
             version: 1,
         }
@@ -516,6 +528,7 @@ mod tests {
             CLUSTER_INFERENCE_ROUTE_NAME,
             "openai-dev",
             "gpt-4o",
+            0,
             false,
         )
         .await
@@ -528,6 +541,7 @@ mod tests {
             CLUSTER_INFERENCE_ROUTE_NAME,
             "openai-dev",
             "gpt-4.1",
+            0,
             false,
         )
         .await
@@ -654,6 +668,7 @@ mod tests {
             config: Some(ClusterInferenceConfig {
                 provider_name: "openai-dev".to_string(),
                 model_id: "test/model".to_string(),
+                timeout_secs: 0,
             }),
             version: 7,
         };
@@ -739,6 +754,7 @@ mod tests {
             SANDBOX_SYSTEM_ROUTE_NAME,
             "anthropic-dev",
             "claude-sonnet-4-20250514",
+            0,
             false,
         )
         .await
@@ -825,6 +841,7 @@ mod tests {
             SANDBOX_SYSTEM_ROUTE_NAME,
             "openai-dev",
             "gpt-4o-mini",
+            0,
             false,
         )
         .await
@@ -883,6 +900,7 @@ mod tests {
             CLUSTER_INFERENCE_ROUTE_NAME,
             "openai-dev",
             "gpt-4o-mini",
+            0,
             true,
         )
         .await
@@ -924,6 +942,7 @@ mod tests {
             CLUSTER_INFERENCE_ROUTE_NAME,
             "openai-dev",
             "gpt-4o-mini",
+            0,
             true,
         )
         .await
@@ -968,6 +987,7 @@ mod tests {
             CLUSTER_INFERENCE_ROUTE_NAME,
             "openai-dev",
             "gpt-4o-mini",
+            0,
             false,
         )
         .await
diff --git a/proto/inference.proto b/proto/inference.proto
index a15f4b842..1fefe87c6 100644
--- a/proto/inference.proto
+++ b/proto/inference.proto
@@ -35,6 +35,8 @@ message ClusterInferenceConfig {
   string provider_name = 1;
   // Model identifier to force on generation calls.
   string model_id = 2;
+  // Per-route request timeout in seconds. 0 means use default (60s).
+  uint64 timeout_secs = 3;
 }
 
 // Storage envelope for the managed cluster inference route.
@@ -60,6 +62,8 @@ message SetClusterInferenceRequest {
   bool verify = 4;
   // Skip synchronous endpoint validation before persistence.
   bool no_verify = 5;
+  // Per-route request timeout in seconds. 0 means use default (60s).
+  uint64 timeout_secs = 6;
 }
 
 message ValidatedEndpoint {
@@ -77,6 +81,8 @@ message SetClusterInferenceResponse {
   bool validation_performed = 5;
   // The concrete endpoints that were probed during validation, when available.
   repeated ValidatedEndpoint validated_endpoints = 6;
+  // Per-route request timeout in seconds that was persisted. 0 means default (60s).
+  uint64 timeout_secs = 7;
 }
 
 message GetClusterInferenceRequest {
@@ -91,6 +97,8 @@ message GetClusterInferenceResponse {
   uint64 version = 3;
   // Route name that was queried.
   string route_name = 4;
+  // Per-route request timeout in seconds. 0 means default (60s).
+  uint64 timeout_secs = 5;
 }
 
 message GetInferenceBundleRequest {}
@@ -103,6 +111,8 @@ message ResolvedRoute {
   string api_key = 4;
   string model_id = 5;
   string provider_type = 6;
+  // Per-route request timeout in seconds. 0 means use default (60s).
+  uint64 timeout_secs = 7;
 }
 
 message GetInferenceBundleResponse {

From dcd1cb587cdbd7c669fa29d1496678dba2bdc81e Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Sun, 29 Mar 2026 01:17:26 -0700
Subject: [PATCH 2/3] feat(inference): fix dynamic timeout change

---
 crates/openshell-server/src/inference.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs
index e5cda5f79..0fb29bde5 100644
--- a/crates/openshell-server/src/inference.rs
+++ b/crates/openshell-server/src/inference.rs
@@ -404,6 +404,7 @@ async fn resolve_inference_bundle(store: &Store) -> Result<GetInferenceBundleRes
             r.api_key.hash(&mut hasher);
             r.protocols.hash(&mut hasher);
             r.provider_type.hash(&mut hasher);
+            r.timeout_secs.hash(&mut hasher);
         }
         format!("{:016x}", hasher.finish())
     };

From d1c90d70336f50445a9d4c5ff390cb8f1455dc6a Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Sun, 29 Mar 2026 01:17:39 -0700
Subject: [PATCH 3/3] feat(inference): update docs

---
 architecture/inference-routing.md | 23 ++++++++++++++---------
 docs/inference/configure.md       | 23 +++++++++++++++++++++--
 2 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/architecture/inference-routing.md b/architecture/inference-routing.md
index 0d3a95afb..9d45d7cd9 100644
--- a/architecture/inference-routing.md
+++ b/architecture/inference-routing.md
@@ -92,10 +92,10 @@ File: `proto/inference.proto`
 
 Key messages:
 
-- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + optional `no_verify` override, with verification enabled by default
-- `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `version`
+- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + `timeout_secs` + optional `no_verify` override, with verification enabled by default
+- `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `timeout_secs` + `version`
 - `GetInferenceBundleResponse` -- `repeated ResolvedRoute routes` + `revision` + `generated_at_ms`
-- `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type`
+- `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type`, `timeout_secs`
 
 ## Data Plane (Sandbox)
 
@@ -106,7 +106,7 @@ Files:
 - `crates/openshell-sandbox/src/lib.rs` -- inference context initialization, route refresh
 - `crates/openshell-sandbox/src/grpc_client.rs` -- `fetch_inference_bundle()`
 
-In cluster mode, the sandbox starts a background refresh loop as soon as the inference context is created. The loop polls the gateway every 5 seconds by default (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` override) and uses the bundle revision hash to skip no-op cache writes.
+In cluster mode, the sandbox starts a background refresh loop as soon as the inference context is created. The loop polls the gateway every 5 seconds by default (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` override) and uses the bundle revision hash to skip no-op cache writes. The revision hash covers all route fields including `timeout_secs`, so any configuration change (provider, model, or timeout) triggers a cache update on the next poll.
 
 ### Interception flow
 
@@ -143,7 +143,7 @@ If no pattern matches, the proxy returns `403 Forbidden` with `{"error": "connec
 ### Route cache
 
 - `InferenceContext` holds a `Router`, the pattern list, and an `Arc<RwLock<Vec<ResolvedRoute>>>` route cache.
-- In cluster mode, `spawn_route_refresh()` polls `GetInferenceBundle` every 30 seconds (`ROUTE_REFRESH_INTERVAL_SECS`). On failure, stale routes are kept.
+- In cluster mode, `spawn_route_refresh()` polls `GetInferenceBundle` every 5 seconds (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS`). On failure, stale routes are kept.
 - In file mode (`--inference-routes`), routes load once at startup from YAML. No refresh task is spawned.
 - In cluster mode, an empty initial bundle still enables the inference context so the refresh task can pick up later configuration.
 
@@ -209,9 +209,11 @@ File: `crates/openshell-router/src/mock.rs`
 
 Routes with `mock://` scheme endpoints return canned responses without making HTTP requests. Mock responses are protocol-aware (OpenAI chat completion, OpenAI completion, Anthropic messages, or generic JSON). Mock routes include an `x-openshell-mock: true` response header.
 
-### HTTP client
+### Per-request timeout
 
-The router uses a `reqwest::Client` with a 60-second timeout. Timeouts and connection failures map to `RouterError::UpstreamUnavailable`.
+Each `ResolvedRoute` carries a `timeout` field (`Duration`). The `reqwest::Client` has no global timeout; instead, each outgoing request applies `.timeout(route.timeout)` on the request builder. When `timeout_secs` is `0` in the proto message, the default of 60 seconds is used (defined as `DEFAULT_ROUTE_TIMEOUT` in `config.rs`). Routes resolved from a `RouteConfig` (standalone route file) always use `DEFAULT_ROUTE_TIMEOUT`. Timeouts and connection failures map to `RouterError::UpstreamUnavailable`.
+
+Timeout changes propagate dynamically to running sandboxes. The bundle revision hash includes `timeout_secs`, so when the timeout is updated via `openshell inference update --timeout`, the refresh loop detects the revision change and updates the route cache within one polling interval (5 seconds by default).
 
 ## Standalone Route File
 
@@ -297,13 +299,16 @@ The system route is stored as a separate `InferenceRoute` record in the gateway
 
 Cluster inference commands:
 
-- `openshell inference set --provider <name> --model <id>` -- configures user-facing cluster inference
-- `openshell inference set --system --provider <name> --model <id>` -- configures system inference
+- `openshell inference set --provider <name> --model <id> [--timeout <secs>]` -- configures user-facing cluster inference
+- `openshell inference set --system --provider <name> --model <id> [--timeout <secs>]` -- configures system inference
+- `openshell inference update [--provider <name>] [--model <id>] [--timeout <secs>]` -- updates individual fields without resetting others
 - `openshell inference get` -- displays both user and system inference configuration
 - `openshell inference get --system` -- displays only the system inference configuration
 
 The `--provider` flag references a provider record name (not a provider type). The provider must already exist in the cluster and have a supported inference type (`openai`, `anthropic`, or `nvidia`).
 
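+The `--timeout` flag sets the per-request timeout in seconds for upstream inference calls. With `inference set`, omitting the flag or passing `0` selects the default of 60 seconds. With `inference update`, omitting the flag leaves the stored timeout unchanged, and passing `0` resets it to the default. Timeout changes propagate to running sandboxes within the route refresh interval (5 seconds by default).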
+
 Inference writes verify by default. `--no-verify` is the explicit opt-out for endpoints that are not up yet.
 
 ## Provider Discovery
diff --git a/docs/inference/configure.md b/docs/inference/configure.md
index 07b991996..78065689e 100644
--- a/docs/inference/configure.md
+++ b/docs/inference/configure.md
@@ -28,12 +28,13 @@ content:
 
 This page covers the managed local inference endpoint (`https://inference.local`). External inference endpoints go through sandbox `network_policies`. Refer to [Policies](/sandboxes/policies.md) for details.
 
-The configuration consists of two values:
+The configuration consists of three values:
 
 | Value | Description |
 |---|---|
 | Provider record | The credential backend OpenShell uses to authenticate with the upstream model host. |
 | Model ID | The model to use for generation requests. |
+| Timeout | Per-request timeout in seconds for upstream inference calls. Defaults to 60 seconds. |
 
 For a list of tested providers and their base URLs, refer to [Supported Inference Providers](../sandboxes/manage-providers.md#supported-inference-providers).
 
@@ -111,6 +112,17 @@ $ openshell inference set \
     --model nvidia/nemotron-3-nano-30b-a3b
 ```
 
+To override the default 60-second per-request timeout, add `--timeout`:
+
+```console
+$ openshell inference set \
+    --provider nvidia-prod \
+    --model nvidia/nemotron-3-nano-30b-a3b \
+    --timeout 300
+```
+
+The value is in seconds. When `--timeout` is omitted (or set to `0`), the default of 60 seconds applies.
+
 ## Verify the Active Config
 
 Confirm that the provider and model are set correctly:
@@ -121,6 +133,7 @@ Gateway inference:
 
   Provider: nvidia-prod
   Model: nvidia/nemotron-3-nano-30b-a3b
   Version: 1
+  Timeout: 300s
 ```
 
@@ -138,6 +151,12 @@ Or switch providers without repeating the current model:
 $ openshell inference update --provider openai-prod
 ```
 
+Or change only the timeout:
+
+```console
+$ openshell inference update --timeout 120
+```
+
 ## Use the Local Endpoint from a Sandbox
 
 After inference is configured, code inside any sandbox can call `https://inference.local` directly:
@@ -182,7 +201,7 @@ A successful response confirms the privacy router can reach the configured backe
 
 - Gateway-scoped: Every sandbox using the active gateway sees the same `inference.local` backend.
 - HTTPS only: `inference.local` is intercepted only for HTTPS traffic.
-- Hot reload: Provider and inference changes are picked up within about 5 seconds by default.
+- Hot reload: Provider, model, and timeout changes are picked up by running sandboxes within about 5 seconds by default. No sandbox recreation is required.
 
 ## Next Steps