From 243d59f8fe090ca3a767188708b05927c3386240 Mon Sep 17 00:00:00 2001 From: Peter Andreas Entschev Date: Sat, 28 Mar 2026 06:42:43 -0700 Subject: [PATCH 1/3] feat(inference): add timeout --- crates/openshell-cli/src/main.rs | 14 ++++++++++- crates/openshell-cli/src/run.rs | 21 ++++++++++++++-- crates/openshell-router/src/backend.rs | 3 ++- crates/openshell-router/src/config.rs | 8 +++++++ crates/openshell-router/src/lib.rs | 3 --- crates/openshell-router/src/mock.rs | 1 + .../tests/backend_integration.rs | 6 +++++ crates/openshell-sandbox/src/lib.rs | 22 +++++++++++++++++ .../tests/system_inference.rs | 3 +++ crates/openshell-server/src/inference.rs | 24 +++++++++++++++++-- proto/inference.proto | 10 ++++++++ 11 files changed, 106 insertions(+), 9 deletions(-) diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 5de31c79c..310979579 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -937,6 +937,10 @@ enum InferenceCommands { /// Skip endpoint verification before saving the route. #[arg(long)] no_verify: bool, + + /// Request timeout in seconds for inference calls (0 = default 60s). + #[arg(long, default_value_t = 0)] + timeout: u64, }, /// Update gateway-level inference configuration (partial update). @@ -957,6 +961,10 @@ enum InferenceCommands { /// Skip endpoint verification before saving the route. #[arg(long)] no_verify: bool, + + /// Request timeout in seconds for inference calls (0 = default 60s, unchanged if omitted). + #[arg(long)] + timeout: Option, }, /// Get gateway-level inference provider and model. 
@@ -2026,10 +2034,12 @@ async fn main() -> Result<()> { model, system, no_verify, + timeout, } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_set( - endpoint, &provider, &model, route_name, no_verify, &tls, + endpoint, &provider, &model, route_name, no_verify, timeout, + &tls, ) .await?; } @@ -2038,6 +2048,7 @@ async fn main() -> Result<()> { model, system, no_verify, + timeout, } => { let route_name = if system { "sandbox-system" } else { "" }; run::gateway_inference_update( @@ -2046,6 +2057,7 @@ async fn main() -> Result<()> { model.as_deref(), route_name, no_verify, + timeout, &tls, ) .await?; diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index e32eec2a4..bab819137 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -3481,6 +3481,7 @@ pub async fn gateway_inference_set( model_id: &str, route_name: &str, no_verify: bool, + timeout_secs: u64, tls: &TlsOptions, ) -> Result<()> { let progress = if std::io::stdout().is_terminal() { @@ -3504,6 +3505,7 @@ pub async fn gateway_inference_set( route_name: route_name.to_string(), verify: false, no_verify, + timeout_secs, }) .await; @@ -3525,6 +3527,7 @@ pub async fn gateway_inference_set( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + print_timeout(configured.timeout_secs); if configured.validation_performed { println!(" {}", "Validated Endpoints:".dimmed()); for endpoint in configured.validated_endpoints { @@ -3540,11 +3543,12 @@ pub async fn gateway_inference_update( model_id: Option<&str>, route_name: &str, no_verify: bool, + timeout_secs: Option<u64>, tls: &TlsOptions, ) -> Result<()> { - if provider_name.is_none() && model_id.is_none() { + if provider_name.is_none() && model_id.is_none() && timeout_secs.is_none() { return Err(miette::miette!( - "at least one of --provider or 
--model must be specified" + "at least one of --provider, --model, or --timeout must be specified" )); } @@ -3561,6 +3565,7 @@ pub async fn gateway_inference_update( let provider = provider_name.unwrap_or(&current.provider_name); let model = model_id.unwrap_or(&current.model_id); + let timeout = timeout_secs.unwrap_or(current.timeout_secs); let progress = if std::io::stdout().is_terminal() { let spinner = ProgressBar::new_spinner(); @@ -3582,6 +3587,7 @@ pub async fn gateway_inference_update( route_name: route_name.to_string(), verify: false, no_verify, + timeout_secs: timeout, }) .await; @@ -3603,6 +3609,7 @@ pub async fn gateway_inference_update( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + print_timeout(configured.timeout_secs); if configured.validation_performed { println!(" {}", "Validated Endpoints:".dimmed()); for endpoint in configured.validated_endpoints { @@ -3639,6 +3646,7 @@ pub async fn gateway_inference_get( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + print_timeout(configured.timeout_secs); } else { // Show both routes by default. 
print_inference_route(&mut client, "Gateway inference", "").await; @@ -3666,6 +3674,7 @@ async fn print_inference_route( println!(" {} {}", "Provider:".dimmed(), configured.provider_name); println!(" {} {}", "Model:".dimmed(), configured.model_id); println!(" {} {}", "Version:".dimmed(), configured.version); + print_timeout(configured.timeout_secs); } Err(e) if e.code() == Code::NotFound => { println!("{}", format!("{label}:").cyan().bold()); @@ -3680,6 +3689,14 @@ async fn print_inference_route( } } +fn print_timeout(timeout_secs: u64) { + if timeout_secs == 0 { + println!(" {} {}s (default)", "Timeout:".dimmed(), 60); + } else { + println!(" {} {}s", "Timeout:".dimmed(), timeout_secs); + } +} + fn format_inference_status(status: Status) -> miette::Report { let message = status.message().trim(); diff --git a/crates/openshell-router/src/backend.rs b/crates/openshell-router/src/backend.rs index d82ea082c..d1d7092c0 100644 --- a/crates/openshell-router/src/backend.rs +++ b/crates/openshell-router/src/backend.rs @@ -149,7 +149,7 @@ async fn send_backend_request( } Err(_) => body, }; - builder = builder.body(body); + builder = builder.body(body).timeout(route.timeout); builder.send().await.map_err(|e| { if e.is_timeout() { @@ -468,6 +468,7 @@ mod tests { protocols: protocols.iter().map(|p| (*p).to_string()).collect(), auth, default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: crate::config::DEFAULT_ROUTE_TIMEOUT, } } diff --git a/crates/openshell-router/src/config.rs b/crates/openshell-router/src/config.rs index d9c081d60..52c22da9f 100644 --- a/crates/openshell-router/src/config.rs +++ b/crates/openshell-router/src/config.rs @@ -3,11 +3,14 @@ use serde::Deserialize; use std::path::Path; +use std::time::Duration; pub use openshell_core::inference::AuthHeader; use crate::RouterError; +pub const DEFAULT_ROUTE_TIMEOUT: Duration = Duration::from_secs(60); + #[derive(Debug, Clone, Deserialize)] pub struct RouterConfig { pub routes: 
Vec, @@ -45,6 +48,8 @@ pub struct ResolvedRoute { pub auth: AuthHeader, /// Extra headers injected on every request (e.g. `anthropic-version`). pub default_headers: Vec<(String, String)>, + /// Per-request timeout for proxied inference calls. + pub timeout: Duration, } impl std::fmt::Debug for ResolvedRoute { @@ -57,6 +62,7 @@ impl std::fmt::Debug for ResolvedRoute { .field("protocols", &self.protocols) .field("auth", &self.auth) .field("default_headers", &self.default_headers) + .field("timeout", &self.timeout) .finish() } } @@ -129,6 +135,7 @@ impl RouteConfig { protocols, auth, default_headers, + timeout: DEFAULT_ROUTE_TIMEOUT, }) } } @@ -256,6 +263,7 @@ routes: protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: DEFAULT_ROUTE_TIMEOUT, }; let debug_output = format!("{route:?}"); assert!( diff --git a/crates/openshell-router/src/lib.rs b/crates/openshell-router/src/lib.rs index a5712d9a0..7deed6fc4 100644 --- a/crates/openshell-router/src/lib.rs +++ b/crates/openshell-router/src/lib.rs @@ -5,8 +5,6 @@ mod backend; pub mod config; mod mock; -use std::time::Duration; - pub use backend::{ ProxyResponse, StreamingProxyResponse, ValidatedEndpoint, ValidationFailure, ValidationFailureKind, verify_backend_endpoint, @@ -39,7 +37,6 @@ pub struct Router { impl Router { pub fn new() -> Result { let client = reqwest::Client::builder() - .timeout(Duration::from_secs(60)) .build() .map_err(|e| RouterError::Internal(format!("failed to build HTTP client: {e}")))?; Ok(Self { diff --git a/crates/openshell-router/src/mock.rs b/crates/openshell-router/src/mock.rs index 9b6accb60..a17ce486f 100644 --- a/crates/openshell-router/src/mock.rs +++ b/crates/openshell-router/src/mock.rs @@ -131,6 +131,7 @@ mod tests { protocols: protocols.iter().map(ToString::to_string).collect(), auth: crate::config::AuthHeader::Bearer, default_headers: Vec::new(), + timeout: crate::config::DEFAULT_ROUTE_TIMEOUT, } } diff --git 
a/crates/openshell-router/tests/backend_integration.rs b/crates/openshell-router/tests/backend_integration.rs index 4861bd6d0..571964aa8 100644 --- a/crates/openshell-router/tests/backend_integration.rs +++ b/crates/openshell-router/tests/backend_integration.rs @@ -15,6 +15,7 @@ fn mock_candidates(base_url: &str) -> Vec { protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }] } @@ -117,6 +118,7 @@ async fn proxy_no_compatible_route_returns_error() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let err = router @@ -178,6 +180,7 @@ async fn proxy_mock_route_returns_canned_response() { protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let body = serde_json::to_vec(&serde_json::json!({ @@ -312,6 +315,7 @@ async fn proxy_uses_x_api_key_for_anthropic_route() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let body = serde_json::to_vec(&serde_json::json!({ @@ -370,6 +374,7 @@ async fn proxy_anthropic_does_not_send_bearer_auth() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let response = router @@ -414,6 +419,7 @@ async fn proxy_forwards_client_anthropic_version_header() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: vec![("anthropic-version".to_string(), 
"2023-06-01".to_string())], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let body = serde_json::to_vec(&serde_json::json!({ diff --git a/crates/openshell-sandbox/src/lib.rs b/crates/openshell-sandbox/src/lib.rs index 493e4d237..a347de13f 100644 --- a/crates/openshell-sandbox/src/lib.rs +++ b/crates/openshell-sandbox/src/lib.rs @@ -801,6 +801,11 @@ pub(crate) fn bundle_to_resolved_routes( .map(|r| { let (auth, default_headers) = openshell_core::inference::auth_for_provider_type(&r.provider_type); + let timeout = if r.timeout_secs == 0 { + openshell_router::config::DEFAULT_ROUTE_TIMEOUT + } else { + Duration::from_secs(r.timeout_secs) + }; openshell_router::config::ResolvedRoute { name: r.name.clone(), endpoint: r.base_url.clone(), @@ -809,6 +814,7 @@ pub(crate) fn bundle_to_resolved_routes( protocols: r.protocols.clone(), auth, default_headers, + timeout, } }) .collect() @@ -1482,6 +1488,7 @@ mod tests { "openai_responses".to_string(), ], provider_type: "openai".to_string(), + timeout_secs: 0, }, openshell_core::proto::ResolvedRoute { name: "local".to_string(), @@ -1490,6 +1497,7 @@ mod tests { model_id: "llama-3".to_string(), protocols: vec!["openai_chat_completions".to_string()], provider_type: String::new(), + timeout_secs: 120, }, ], revision: "abc123".to_string(), @@ -1510,11 +1518,21 @@ mod tests { routes[0].protocols, vec!["openai_chat_completions", "openai_responses"] ); + assert_eq!( + routes[0].timeout, + openshell_router::config::DEFAULT_ROUTE_TIMEOUT, + "timeout_secs=0 should map to default" + ); assert_eq!(routes[1].endpoint, "http://vllm:8000/v1"); assert_eq!( routes[1].auth, openshell_core::inference::AuthHeader::Bearer ); + assert_eq!( + routes[1].timeout, + Duration::from_secs(120), + "timeout_secs=120 should map to 120s" + ); } #[test] @@ -1539,6 +1557,7 @@ mod tests { model_id: "model".to_string(), protocols: vec!["openai_chat_completions".to_string()], provider_type: "openai".to_string(), + timeout_secs: 0, }], revision: 
"rev".to_string(), generated_at_ms: 0, @@ -1559,6 +1578,7 @@ mod tests { protocols: vec!["openai_chat_completions".to_string()], auth: openshell_core::inference::AuthHeader::Bearer, default_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }, openshell_router::config::ResolvedRoute { name: "sandbox-system".to_string(), @@ -1568,6 +1588,7 @@ mod tests { protocols: vec!["anthropic_messages".to_string()], auth: openshell_core::inference::AuthHeader::Custom("x-api-key"), default_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }, ]; @@ -1856,6 +1877,7 @@ filesystem_policy: auth: openshell_core::inference::AuthHeader::Bearer, protocols: vec!["openai_chat_completions".to_string()], default_headers: vec![], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }]; let cache = Arc::new(RwLock::new(routes)); diff --git a/crates/openshell-sandbox/tests/system_inference.rs b/crates/openshell-sandbox/tests/system_inference.rs index 3f6a471e5..5d581fbe2 100644 --- a/crates/openshell-sandbox/tests/system_inference.rs +++ b/crates/openshell-sandbox/tests/system_inference.rs @@ -20,6 +20,7 @@ fn make_system_route() -> ResolvedRoute { protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, } } @@ -32,6 +33,7 @@ fn make_user_route() -> ResolvedRoute { protocols: vec!["openai_chat_completions".to_string()], auth: AuthHeader::Bearer, default_headers: Vec::new(), + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, } } @@ -124,6 +126,7 @@ async fn system_inference_with_anthropic_protocol() { protocols: vec!["anthropic_messages".to_string()], auth: AuthHeader::Custom("x-api-key"), default_headers: vec![("anthropic-version".to_string(), "2023-06-01".to_string())], + timeout: openshell_router::config::DEFAULT_ROUTE_TIMEOUT, }; let ctx = InferenceContext::new(patterns, router, vec![], vec![system_route]); diff 
--git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index bbabaf70b..e5cda5f79 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -86,6 +86,7 @@ impl Inference for InferenceService { route_name, &req.provider_name, &req.model_id, + req.timeout_secs, verify, ) .await?; @@ -103,6 +104,7 @@ impl Inference for InferenceService { route_name: route_name.to_string(), validation_performed: !route.validation.is_empty(), validated_endpoints: route.validation, + timeout_secs: config.timeout_secs, })) } @@ -140,6 +142,7 @@ impl Inference for InferenceService { model_id: config.model_id.clone(), version: route.version, route_name: route_name.to_string(), + timeout_secs: config.timeout_secs, })) } } @@ -149,6 +152,7 @@ async fn upsert_cluster_inference_route( route_name: &str, provider_name: &str, model_id: &str, + timeout_secs: u64, verify: bool, ) -> Result { if provider_name.trim().is_empty() { @@ -173,7 +177,7 @@ async fn upsert_cluster_inference_route( Vec::new() }; - let config = build_cluster_inference_config(&provider, model_id); + let config = build_cluster_inference_config(&provider, model_id, timeout_secs); let existing = store .get_message_by_name::(route_name) @@ -204,10 +208,15 @@ async fn upsert_cluster_inference_route( Ok(UpsertedInferenceRoute { route, validation }) } -fn build_cluster_inference_config(provider: &Provider, model_id: &str) -> ClusterInferenceConfig { +fn build_cluster_inference_config( + provider: &Provider, + model_id: &str, + timeout_secs: u64, +) -> ClusterInferenceConfig { ClusterInferenceConfig { provider_name: provider.name.clone(), model_id: model_id.to_string(), + timeout_secs, } } @@ -267,6 +276,7 @@ fn resolve_provider_route(provider: &Provider) -> Result Date: Sun, 29 Mar 2026 01:17:26 -0700 Subject: [PATCH 2/3] feat(inference): fix dynamic timeout change --- crates/openshell-server/src/inference.rs | 1 + 1 file changed, 1 insertion(+) diff 
--git a/crates/openshell-server/src/inference.rs b/crates/openshell-server/src/inference.rs index e5cda5f79..0fb29bde5 100644 --- a/crates/openshell-server/src/inference.rs +++ b/crates/openshell-server/src/inference.rs @@ -404,6 +404,7 @@ async fn resolve_inference_bundle(store: &Store) -> Result Date: Sun, 29 Mar 2026 01:17:39 -0700 Subject: [PATCH 3/3] feat(inference): update docs --- architecture/inference-routing.md | 23 ++++++++++++++--------- docs/inference/configure.md | 23 +++++++++++++++++++++-- 2 files changed, 35 insertions(+), 11 deletions(-) diff --git a/architecture/inference-routing.md b/architecture/inference-routing.md index 0d3a95afb..9d45d7cd9 100644 --- a/architecture/inference-routing.md +++ b/architecture/inference-routing.md @@ -92,10 +92,10 @@ File: `proto/inference.proto` Key messages: -- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + optional `no_verify` override, with verification enabled by default -- `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `version` +- `SetClusterInferenceRequest` -- `provider_name` + `model_id` + `timeout_secs` + optional `no_verify` override, with verification enabled by default +- `SetClusterInferenceResponse` -- `provider_name` + `model_id` + `timeout_secs` + `version` - `GetInferenceBundleResponse` -- `repeated ResolvedRoute routes` + `revision` + `generated_at_ms` -- `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type` +- `ResolvedRoute` -- `name`, `base_url`, `protocols`, `api_key`, `model_id`, `provider_type`, `timeout_secs` ## Data Plane (Sandbox) @@ -106,7 +106,7 @@ Files: - `crates/openshell-sandbox/src/lib.rs` -- inference context initialization, route refresh - `crates/openshell-sandbox/src/grpc_client.rs` -- `fetch_inference_bundle()` -In cluster mode, the sandbox starts a background refresh loop as soon as the inference context is created. 
The loop polls the gateway every 5 seconds by default (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` override) and uses the bundle revision hash to skip no-op cache writes. +In cluster mode, the sandbox starts a background refresh loop as soon as the inference context is created. The loop polls the gateway every 5 seconds by default (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS` override) and uses the bundle revision hash to skip no-op cache writes. The revision hash covers all route fields including `timeout_secs`, so any configuration change (provider, model, or timeout) triggers a cache update on the next poll. ### Interception flow @@ -143,7 +143,7 @@ If no pattern matches, the proxy returns `403 Forbidden` with `{"error": "connec ### Route cache - `InferenceContext` holds a `Router`, the pattern list, and an `Arc<RwLock<Vec<ResolvedRoute>>>` route cache. -- In cluster mode, `spawn_route_refresh()` polls `GetInferenceBundle` every 30 seconds (`ROUTE_REFRESH_INTERVAL_SECS`). On failure, stale routes are kept. +- In cluster mode, `spawn_route_refresh()` polls `GetInferenceBundle` every 5 seconds (`OPENSHELL_ROUTE_REFRESH_INTERVAL_SECS`). On failure, stale routes are kept. - In file mode (`--inference-routes`), routes load once at startup from YAML. No refresh task is spawned. - In cluster mode, an empty initial bundle still enables the inference context so the refresh task can pick up later configuration. @@ -209,9 +209,11 @@ File: `crates/openshell-router/src/mock.rs` Routes with `mock://` scheme endpoints return canned responses without making HTTP requests. Mock responses are protocol-aware (OpenAI chat completion, OpenAI completion, Anthropic messages, or generic JSON). Mock routes include an `x-openshell-mock: true` response header. -### HTTP client +### Per-request timeout -The router uses a `reqwest::Client` with a 60-second timeout. Timeouts and connection failures map to `RouterError::UpstreamUnavailable`. +Each `ResolvedRoute` carries a `timeout` field (`Duration`). 
The `reqwest::Client` has no global timeout; instead, each outgoing request applies `.timeout(route.timeout)` on the request builder. When `timeout_secs` is `0` in the proto message, the default of 60 seconds is used (defined as `DEFAULT_ROUTE_TIMEOUT` in `config.rs`). Timeouts and connection failures map to `RouterError::UpstreamUnavailable`. + +Timeout changes propagate dynamically to running sandboxes. The bundle revision hash includes `timeout_secs`, so when the timeout is updated via `openshell inference update --timeout`, the refresh loop detects the revision change and updates the route cache within one polling interval (5 seconds by default). ## Standalone Route File @@ -297,13 +299,16 @@ The system route is stored as a separate `InferenceRoute` record in the gateway Cluster inference commands: -- `openshell inference set --provider <name> --model <id>` -- configures user-facing cluster inference -- `openshell inference set --system --provider <name> --model <id>` -- configures system inference +- `openshell inference set --provider <name> --model <id> [--timeout <secs>]` -- configures user-facing cluster inference +- `openshell inference set --system --provider <name> --model <id> [--timeout <secs>]` -- configures system inference +- `openshell inference update [--provider <name>] [--model <id>] [--timeout <secs>]` -- updates individual fields without resetting others - `openshell inference get` -- displays both user and system inference configuration - `openshell inference get --system` -- displays only the system inference configuration The `--provider` flag references a provider record name (not a provider type). The provider must already exist in the cluster and have a supported inference type (`openai`, `anthropic`, or `nvidia`). +The `--timeout` flag sets the per-request timeout in seconds for upstream inference calls. When omitted or set to `0`, the default of 60 seconds applies. Timeout changes propagate to running sandboxes within the route refresh interval (5 seconds by default). + Inference writes verify by default. 
`--no-verify` is the explicit opt-out for endpoints that are not up yet. ## Provider Discovery diff --git a/docs/inference/configure.md b/docs/inference/configure.md index 07b991996..78065689e 100644 --- a/docs/inference/configure.md +++ b/docs/inference/configure.md @@ -28,12 +28,13 @@ content: This page covers the managed local inference endpoint (`https://inference.local`). External inference endpoints go through sandbox `network_policies`. Refer to [Policies](/sandboxes/policies.md) for details. -The configuration consists of two values: +The configuration consists of three values: | Value | Description | |---|---| | Provider record | The credential backend OpenShell uses to authenticate with the upstream model host. | | Model ID | The model to use for generation requests. | +| Timeout | Per-request timeout in seconds for upstream inference calls. Defaults to 60 seconds. | For a list of tested providers and their base URLs, refer to [Supported Inference Providers](../sandboxes/manage-providers.md#supported-inference-providers). @@ -111,6 +112,17 @@ $ openshell inference set \ --model nvidia/nemotron-3-nano-30b-a3b ``` +To override the default 60-second per-request timeout, add `--timeout`: + +```console +$ openshell inference set \ + --provider nvidia-prod \ + --model nvidia/nemotron-3-nano-30b-a3b \ + --timeout 300 +``` + +The value is in seconds. When `--timeout` is omitted (or set to `0`), the default of 60 seconds applies. 
+ ## Verify the Active Config Confirm that the provider and model are set correctly: @@ -121,6 +133,7 @@ Gateway inference: Provider: nvidia-prod Model: nvidia/nemotron-3-nano-30b-a3b + Timeout: 300s Version: 1 ``` @@ -138,6 +151,12 @@ Or switch providers without repeating the current model: $ openshell inference update --provider openai-prod ``` +Or change only the timeout: + +```console +$ openshell inference update --timeout 120 +``` + ## Use the Local Endpoint from a Sandbox After inference is configured, code inside any sandbox can call `https://inference.local` directly: @@ -182,7 +201,7 @@ A successful response confirms the privacy router can reach the configured backe - Gateway-scoped: Every sandbox using the active gateway sees the same `inference.local` backend. - HTTPS only: `inference.local` is intercepted only for HTTPS traffic. -- Hot reload: Provider and inference changes are picked up within about 5 seconds by default. +- Hot reload: Provider, model, and timeout changes are picked up by running sandboxes within about 5 seconds by default. No sandbox recreation is required. ## Next Steps