From 1618f4184ebda8afdcf96415852f85e4ca78da8f Mon Sep 17 00:00:00 2001 From: Patrick Riel Date: Thu, 26 Mar 2026 17:28:53 +0000 Subject: [PATCH] feat(cli): add generic sandbox device request flags Signed-off-by: Patrick Riel --- architecture/gateway-single-node.md | 2 +- crates/openshell-cli/src/main.rs | 49 +++++ crates/openshell-cli/src/run.rs | 207 +++++++++++++++++- .../sandbox_create_lifecycle_integration.rs | 17 +- docs/sandboxes/manage-sandboxes.md | 13 ++ 5 files changed, 275 insertions(+), 13 deletions(-) diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md index 57aebd3a..05a54984 100644 --- a/architecture/gateway-single-node.md +++ b/architecture/gateway-single-node.md @@ -381,7 +381,7 @@ When `openshell sandbox create` cannot connect to a gateway (connection refused, 1. `should_attempt_bootstrap()` in `crates/openshell-cli/src/bootstrap.rs` checks the error type. It returns `true` for connectivity errors and missing default TLS materials, but `false` for TLS handshake/auth errors. 2. If running in a terminal, the user is prompted to confirm. 3. `run_bootstrap()` deploys a gateway named `"openshell"`, sets it as active, and returns fresh `TlsOptions` pointing to the newly-written mTLS certs. -4. When `sandbox create` requests GPU explicitly (`--gpu`) or infers it from an image whose final name component contains `gpu` (such as `nvidia-gpu`), the bootstrap path enables gateway GPU support before retrying sandbox creation. +4. When `sandbox create` requests GPU explicitly (`--gpu`), requests `nvidia.com/gpu` through `--resource`, or infers GPU intent from an image whose final name component contains `gpu` (such as `nvidia-gpu`), the bootstrap path enables gateway GPU support before retrying sandbox creation. ## Container Environment Variables diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs index 5de31c79..f822f969 100644 --- a/crates/openshell-cli/src/main.rs +++ b/crates/openshell-cli/src/main.rs @@ -1109,6 +1109,21 @@ enum SandboxCommands { #[arg(long)] gpu: bool, + /// Request an additional sandbox resource limit. + /// + /// Accepts `=` and may be repeated. This is primarily + /// useful for device-plugin resources such as `nvidia.com/gpu=2` or + /// `vendor.com/vf=1`. + #[arg(long = "resource", value_name = "NAME=COUNT")] + resources: Vec, + + /// Override the sandbox pod runtime class. + /// + /// This is forwarded to `template.runtime_class_name` and is useful + /// for device runtimes that require an explicit runtime class. + #[arg(long = "runtime-class")] + runtime_class: Option, + /// SSH destination for remote bootstrap (e.g., user@hostname). /// Only used when no cluster exists yet; ignored if a cluster is /// already active. @@ -2073,6 +2088,8 @@ async fn main() -> Result<()> { no_keep, editor, gpu, + resources, + runtime_class, remote, ssh_key, providers, @@ -2154,6 +2171,8 @@ async fn main() -> Result<()> { upload_spec.as_ref(), keep, gpu, + &resources, + runtime_class.as_deref(), editor, remote.as_deref(), ssh_key.as_deref(), @@ -2176,6 +2195,8 @@ async fn main() -> Result<()> { upload_spec.as_ref(), keep, gpu, + &resources, + runtime_class.as_deref(), editor, remote.as_deref(), ssh_key.as_deref(), @@ -2872,6 +2893,34 @@ mod tests { assert_eq!(dest.get_value_hint(), ValueHint::AnyPath); } + #[test] + fn sandbox_create_accepts_resource_and_runtime_class_flags() { + let cli = Cli::try_parse_from([ + "openshell", + "sandbox", + "create", + "--resource", + "nvidia.com/gpu=2", + "--resource", + "vendor.com/vf=1", + "--runtime-class", + "nvidia", + ]) + .expect("sandbox create should parse generic resource flags"); + + assert!(matches!( + cli.command, + Some(Commands::Sandbox { + command: Some(SandboxCommands::Create { + resources, + runtime_class, + .. + }) + }) if resources == vec!["nvidia.com/gpu=2".to_string(), "vendor.com/vf=1".to_string()] + && runtime_class.as_deref() == Some("nvidia") + )); + } + #[test] fn parse_upload_spec_without_remote() { let (local, remote) = parse_upload_spec("./src"); diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs index e32eec2a..c6c51ea1 100644 --- a/crates/openshell-cli/src/run.rs +++ b/crates/openshell-cli/src/run.rs @@ -37,7 +37,8 @@ use openshell_providers::{ ProviderRegistry, detect_provider_from_command, normalize_provider_type, }; use owo_colors::OwoColorize; -use std::collections::{HashMap, HashSet, VecDeque}; +use prost_types::{Struct, Value, value::Kind}; +use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; use std::io::{IsTerminal, Write}; use std::path::{Path, PathBuf}; use std::process::Command; @@ -1845,6 +1846,8 @@ pub async fn sandbox_create_with_bootstrap( upload: Option<&(String, Option, bool)>, keep: bool, gpu: bool, + resources: &[String], + runtime_class_name: Option<&str>, editor: Option, remote: Option<&str>, ssh_key: Option<&str>, @@ -1856,6 +1859,16 @@ pub async fn sandbox_create_with_bootstrap( bootstrap_override: Option, auto_providers_override: Option, ) -> Result<()> { + if gpu && resource_specs_request_named_resource(resources, "nvidia.com/gpu") { + return Err(miette!( + "--gpu conflicts with --resource nvidia.com/gpu=; use only one GPU request path" + )); + } + if gpu && runtime_class_name.is_some() { + return Err(miette!( + "--runtime-class cannot be combined with --gpu because GPU sandboxes force the runtime class automatically" + )); + } if !crate::bootstrap::confirm_bootstrap(bootstrap_override)? { return Err(miette::miette!( "No active gateway.\n\ @@ -1863,7 +1876,9 @@ pub async fn sandbox_create_with_bootstrap( Or deploy a new gateway: openshell gateway start" )); } - let requested_gpu = gpu || from.is_some_and(source_requests_gpu); + let requested_gpu = gpu + || from.is_some_and(source_requests_gpu) + || resource_specs_request_named_resource(resources, "nvidia.com/gpu"); let (tls, server, gateway_name) = crate::bootstrap::run_bootstrap(remote, ssh_key, requested_gpu).await?; // Disable bootstrap inside sandbox_create so that a transient connection @@ -1876,6 +1891,8 @@ pub async fn sandbox_create_with_bootstrap( upload, keep, gpu, + resources, + runtime_class_name, editor, remote, ssh_key, @@ -1931,6 +1948,8 @@ pub async fn sandbox_create( upload: Option<&(String, Option, bool)>, keep: bool, gpu: bool, + resources: &[String], + runtime_class_name: Option<&str>, editor: Option, remote: Option<&str>, ssh_key: Option<&str>, @@ -1988,7 +2007,9 @@ pub async fn sandbox_create( eprintln!(); return Err(err); } - let requested_gpu = gpu || from.is_some_and(source_requests_gpu); + let requested_gpu = gpu + || from.is_some_and(source_requests_gpu) + || resource_specs_request_named_resource(resources, "nvidia.com/gpu"); let (new_tls, new_server, _) = crate::bootstrap::run_bootstrap(remote, ssh_key, requested_gpu).await?; let c = grpc_client(&new_server, &new_tls) @@ -2017,6 +2038,17 @@ pub async fn sandbox_create( None => None, }; let requested_gpu = gpu || image.as_deref().is_some_and(image_requests_gpu); + let requested_gpu_resource = resource_specs_request_named_resource(resources, "nvidia.com/gpu"); + if gpu && requested_gpu_resource { + return Err(miette!( + "--gpu conflicts with --resource nvidia.com/gpu=; use only one GPU request path" + )); + } + if gpu && runtime_class_name.is_some() { + return Err(miette!( + "--runtime-class cannot be combined with --gpu because GPU sandboxes force the runtime class automatically" + )); + } let inferred_types: Vec = inferred_provider_type(command).into_iter().collect(); let configured_providers = ensure_required_providers( @@ -2029,10 +2061,7 @@ pub async fn sandbox_create( let policy = load_sandbox_policy(policy)?; - let template = image.map(|img| SandboxTemplate { - image: img, - ..SandboxTemplate::default() - }); + let template = build_sandbox_template(image, runtime_class_name, resources)?; let request = CreateSandboxRequest { spec: Some(SandboxSpec { @@ -2547,6 +2576,87 @@ fn image_requests_gpu(image: &str) -> bool { image_name.contains("gpu") } +fn parse_resource_limits(resources: &[String]) -> Result> { + if resources.is_empty() { + return Ok(None); + } + + let mut limits = BTreeMap::new(); + for resource in resources { + let (name, count) = resource.split_once('=').ok_or_else(|| { + miette!("invalid --resource value '{resource}'; expected =") + })?; + if name.is_empty() { + return Err(miette!( + "invalid --resource value '{resource}'; resource name is empty" + )); + } + let count = count.parse::().map_err(|_| { + miette!("invalid --resource value '{resource}'; count must be a positive integer") + })?; + if count == 0 { + return Err(miette!( + "invalid --resource value '{resource}'; count must be greater than zero" + )); + } + if limits + .insert(name.to_string(), string_value(count.to_string())) + .is_some() + { + return Err(miette!( + "duplicate --resource entry for '{name}'; specify each resource only once" + )); + } + } + + Ok(Some(Struct { + fields: BTreeMap::from([("limits".to_string(), struct_value(limits))]), + })) +} + +fn resource_specs_request_named_resource(resources: &[String], resource_name: &str) -> bool { + resources + .iter() + .filter_map(|resource| resource.split_once('=')) + .any(|(name, _)| name == resource_name) +} + +fn build_sandbox_template( + image: Option, + runtime_class_name: Option<&str>, + resources: &[String], +) -> Result> { + let mut template = SandboxTemplate::default(); + let mut changed = false; + + if let Some(image) = image { + template.image = image; + changed = true; + } + if let Some(runtime_class_name) = runtime_class_name.filter(|value| !value.is_empty()) { + template.runtime_class_name = runtime_class_name.to_string(); + changed = true; + } + if let Some(resource_limits) = parse_resource_limits(resources)? { + template.resources = Some(resource_limits); + changed = true; + } + + Ok(changed.then_some(template)) +} + +fn string_value(value: impl Into) -> Value { + Value { + kind: Some(Kind::StringValue(value.into())), + } +} + +fn struct_value(fields: BTreeMap) -> Value { + Value { + kind: Some(Kind::StructValue(Struct { fields })), + } +} + /// Build a Dockerfile and push the resulting image into the gateway. /// /// Returns the image tag that was built so the caller can use it for sandbox @@ -4987,12 +5097,14 @@ fn format_timestamp_ms(ms: i64) -> String { #[cfg(test)] mod tests { use super::{ - GatewayControlTarget, TlsOptions, format_gateway_select_header, + GatewayControlTarget, TlsOptions, build_sandbox_template, format_gateway_select_header, format_gateway_select_items, gateway_auth_label, gateway_select_with, gateway_type_label, git_sync_files, http_health_check, image_requests_gpu, inferred_provider_type, - parse_cli_setting_value, parse_credential_pairs, provisioning_timeout_message, - ready_false_condition_message, resolve_gateway_control_target_from, sandbox_should_persist, - shell_escape, source_requests_gpu, validate_gateway_name, validate_ssh_host, + parse_cli_setting_value, parse_credential_pairs, parse_resource_limits, + provisioning_timeout_message, ready_false_condition_message, + resolve_gateway_control_target_from, resource_specs_request_named_resource, + sandbox_should_persist, shell_escape, source_requests_gpu, validate_gateway_name, + validate_ssh_host, }; use crate::TEST_ENV_LOCK; use hyper::StatusCode; @@ -5006,6 +5118,7 @@ mod tests { use openshell_bootstrap::GatewayMetadata; use openshell_core::proto::{SandboxCondition, SandboxStatus}; + use prost_types::value::Kind; struct EnvVarGuard { key: &'static str, @@ -5206,6 +5319,78 @@ mod tests { assert!(sandbox_should_persist(false, Some(&spec))); } + #[test] + fn parse_resource_limits_builds_limits_struct() { + let parsed = parse_resource_limits(&[ + "nvidia.com/gpu=2".to_string(), + "vendor.com/vf=1".to_string(), + ]) + .expect("parse resource limits") + .expect("resource limits should exist"); + + let limits = parsed + .fields + .get("limits") + .expect("limits field should exist"); + let Kind::StructValue(limits) = limits.kind.clone().expect("limits kind should exist") + else { + panic!("limits should be a struct"); + }; + let Kind::StringValue(gpu) = limits + .fields + .get("nvidia.com/gpu") + .and_then(|value| value.kind.clone()) + .expect("gpu limit should exist") + else { + panic!("gpu limit should be a string"); + }; + let Kind::StringValue(vf) = limits + .fields + .get("vendor.com/vf") + .and_then(|value| value.kind.clone()) + .expect("vf limit should exist") + else { + panic!("vf limit should be a string"); + }; + + assert_eq!(gpu, "2"); + assert_eq!(vf, "1"); + } + + #[test] + fn parse_resource_limits_rejects_invalid_values() { + let err = parse_resource_limits(&["nvidia.com/gpu=two".to_string()]) + .expect_err("invalid counts should fail"); + assert!(err.to_string().contains("count must be a positive integer")); + } + + #[test] + fn build_sandbox_template_includes_runtime_class_and_resources() { + let template = build_sandbox_template( + Some("example.com/test:latest".to_string()), + Some("nvidia"), + &["nvidia.com/gpu=2".to_string()], + ) + .expect("build template") + .expect("template should exist"); + + assert_eq!(template.image, "example.com/test:latest"); + assert_eq!(template.runtime_class_name, "nvidia"); + assert!(template.resources.is_some()); + } + + #[test] + fn resource_specs_request_named_resource_detects_gpu_requests() { + assert!(resource_specs_request_named_resource( + &["nvidia.com/gpu=2".to_string()], + "nvidia.com/gpu" + )); + assert!(!resource_specs_request_named_resource( + &["vendor.com/vf=1".to_string()], + "nvidia.com/gpu" + )); + } + #[test] fn image_requests_gpu_matches_known_gpu_image_names() { for image in [ diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs index d5d39f08..e82918dd 100644 --- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs +++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs @@ -541,6 +541,8 @@ async fn sandbox_create_keeps_command_sessions_by_default() { None, true, false, + &[], + None, None, None, None, @@ -581,6 +583,8 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() { None, false, false, + &[], + None, None, None, None, @@ -624,6 +628,8 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() { None, false, false, + &[], + None, None, None, None, @@ -667,6 +673,8 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() { None, true, false, + &[], + None, None, None, None, @@ -698,6 +706,11 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() { let _env = test_env(&fake_ssh_dir, &xdg_dir); let tls = test_tls(&server); install_fake_ssh(&fake_ssh_dir); + let forward_port = std::net::TcpListener::bind("127.0.0.1:0") + .unwrap() + .local_addr() + .unwrap() + .port(); run::sandbox_create( &server.endpoint, @@ -707,12 +720,14 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() { None, false, false, + &[], + None, None, None, None, &[], None, - Some(openshell_core::forward::ForwardSpec::new(8080)), + Some(openshell_core::forward::ForwardSpec::new(forward_port)), &["echo".to_string(), "OK".to_string()], Some(false), Some(false), diff --git a/docs/sandboxes/manage-sandboxes.md b/docs/sandboxes/manage-sandboxes.md index 5306120a..4bf9374b 100644 --- a/docs/sandboxes/manage-sandboxes.md +++ b/docs/sandboxes/manage-sandboxes.md @@ -57,6 +57,19 @@ To request GPU resources, add `--gpu`: $ openshell sandbox create --gpu -- claude ``` +For device-plugin resources beyond the one-GPU shorthand, use `--resource` and +optionally `--runtime-class`: + +```console +$ openshell sandbox create --resource nvidia.com/gpu=2 --runtime-class nvidia -- claude +$ openshell sandbox create --resource vendor.com/vf=1 --runtime-class kata -- my-tool +``` + +`--resource` accepts `=` and may be repeated. This is useful for +device-style resources exposed through Kubernetes, such as GPUs, VFs, or other +vendor accelerators. Keep using `--gpu` when you only need the existing +single-GPU shorthand. + ### Custom Containers Use `--from` to create a sandbox from a pre-built community package, a local directory, or a container image: