From 1618f4184ebda8afdcf96415852f85e4ca78da8f Mon Sep 17 00:00:00 2001
From: Patrick Riel <priel@nvidia.com>
Date: Thu, 26 Mar 2026 17:28:53 +0000
Subject: [PATCH] feat(cli): add generic sandbox device request flags

Signed-off-by: Patrick Riel <priel@nvidia.com>
---
 architecture/gateway-single-node.md           |   2 +-
 crates/openshell-cli/src/main.rs              |  49 +++++
 crates/openshell-cli/src/run.rs               | 207 +++++++++++++++++-
 .../sandbox_create_lifecycle_integration.rs   |  17 +-
 docs/sandboxes/manage-sandboxes.md            |  13 ++
 5 files changed, 275 insertions(+), 13 deletions(-)
diff --git a/architecture/gateway-single-node.md b/architecture/gateway-single-node.md
index 57aebd3a..05a54984 100644
--- a/architecture/gateway-single-node.md
+++ b/architecture/gateway-single-node.md
@@ -381,7 +381,7 @@ When `openshell sandbox create` cannot connect to a gateway (connection refused,
 1. `should_attempt_bootstrap()` in `crates/openshell-cli/src/bootstrap.rs` checks the error type. It returns `true` for connectivity errors and missing default TLS materials, but `false` for TLS handshake/auth errors.
 2. If running in a terminal, the user is prompted to confirm.
 3. `run_bootstrap()` deploys a gateway named `"openshell"`, sets it as active, and returns fresh `TlsOptions` pointing to the newly-written mTLS certs.
-4. When `sandbox create` requests GPU explicitly (`--gpu`) or infers it from an image whose final name component contains `gpu` (such as `nvidia-gpu`), the bootstrap path enables gateway GPU support before retrying sandbox creation.
+4. When `sandbox create` requests GPU explicitly (`--gpu`), requests `nvidia.com/gpu` through `--resource`, or infers GPU intent from an image whose final name component contains `gpu` (such as `nvidia-gpu`), the bootstrap path enables gateway GPU support before retrying sandbox creation.
 
 ## Container Environment Variables
 
diff --git a/crates/openshell-cli/src/main.rs b/crates/openshell-cli/src/main.rs
index 5de31c79..f822f969 100644
--- a/crates/openshell-cli/src/main.rs
+++ b/crates/openshell-cli/src/main.rs
@@ -1109,6 +1109,21 @@ enum SandboxCommands {
         #[arg(long)]
         gpu: bool,
 
+        /// Request an additional sandbox resource limit.
+        ///
+        /// Accepts `<NAME>=<COUNT>` and may be repeated. This is primarily
+        /// useful for device-plugin resources such as `nvidia.com/gpu=2` or
+        /// `vendor.com/vf=1`.
+        #[arg(long = "resource", value_name = "NAME=COUNT")]
+        resources: Vec<String>,
+
+        /// Override the sandbox pod runtime class.
+        ///
+        /// This is forwarded to `template.runtime_class_name` and is useful
+        /// for device runtimes that require an explicit runtime class.
+        #[arg(long = "runtime-class")]
+        runtime_class: Option<String>,
+
         /// SSH destination for remote bootstrap (e.g., user@hostname).
         /// Only used when no cluster exists yet; ignored if a cluster is
         /// already active.
@@ -2073,6 +2088,8 @@ async fn main() -> Result<()> {
                     no_keep,
                     editor,
                     gpu,
+                    resources,
+                    runtime_class,
                     remote,
                     ssh_key,
                     providers,
@@ -2154,6 +2171,8 @@ async fn main() -> Result<()> {
                                 upload_spec.as_ref(),
                                 keep,
                                 gpu,
+                                &resources,
+                                runtime_class.as_deref(),
                                 editor,
                                 remote.as_deref(),
                                 ssh_key.as_deref(),
@@ -2176,6 +2195,8 @@ async fn main() -> Result<()> {
                                 upload_spec.as_ref(),
                                 keep,
                                 gpu,
+                                &resources,
+                                runtime_class.as_deref(),
                                 editor,
                                 remote.as_deref(),
                                 ssh_key.as_deref(),
@@ -2872,6 +2893,34 @@ mod tests {
         assert_eq!(dest.get_value_hint(), ValueHint::AnyPath);
     }
 
+    #[test]
+    fn sandbox_create_accepts_resource_and_runtime_class_flags() {
+        let cli = Cli::try_parse_from([
+            "openshell",
+            "sandbox",
+            "create",
+            "--resource",
+            "nvidia.com/gpu=2",
+            "--resource",
+            "vendor.com/vf=1",
+            "--runtime-class",
+            "nvidia",
+        ])
+        .expect("sandbox create should parse generic resource flags");
+
+        assert!(matches!(
+            cli.command,
+            Some(Commands::Sandbox {
+                command: Some(SandboxCommands::Create {
+                    resources,
+                    runtime_class,
+                    ..
+                })
+            }) if resources == vec!["nvidia.com/gpu=2".to_string(), "vendor.com/vf=1".to_string()]
+                && runtime_class.as_deref() == Some("nvidia")
+        ));
+    }
+
     #[test]
     fn parse_upload_spec_without_remote() {
         let (local, remote) = parse_upload_spec("./src");
diff --git a/crates/openshell-cli/src/run.rs b/crates/openshell-cli/src/run.rs
index e32eec2a..c6c51ea1 100644
--- a/crates/openshell-cli/src/run.rs
+++ b/crates/openshell-cli/src/run.rs
@@ -37,7 +37,8 @@ use openshell_providers::{
     ProviderRegistry, detect_provider_from_command, normalize_provider_type,
 };
 use owo_colors::OwoColorize;
-use std::collections::{HashMap, HashSet, VecDeque};
+use prost_types::{Struct, Value, value::Kind};
+use std::collections::{BTreeMap, HashMap, HashSet, VecDeque};
 use std::io::{IsTerminal, Write};
 use std::path::{Path, PathBuf};
 use std::process::Command;
@@ -1845,6 +1846,8 @@ pub async fn sandbox_create_with_bootstrap(
     upload: Option<&(String, Option<String>, bool)>,
     keep: bool,
     gpu: bool,
+    resources: &[String],
+    runtime_class_name: Option<&str>,
     editor: Option<Editor>,
     remote: Option<&str>,
     ssh_key: Option<&str>,
@@ -1856,6 +1859,16 @@ pub async fn sandbox_create_with_bootstrap(
     bootstrap_override: Option<bool>,
     auto_providers_override: Option<bool>,
 ) -> Result<()> {
+    if gpu && resource_specs_request_named_resource(resources, "nvidia.com/gpu") {
+        return Err(miette!(
+            "--gpu conflicts with --resource nvidia.com/gpu=<COUNT>; use only one GPU request path"
+        ));
+    }
+    if gpu && runtime_class_name.is_some() {
+        return Err(miette!(
+            "--runtime-class cannot be combined with --gpu because GPU sandboxes force the runtime class automatically"
+        ));
+    }
     if !crate::bootstrap::confirm_bootstrap(bootstrap_override)? {
         return Err(miette::miette!(
             "No active gateway.\n\
@@ -1863,7 +1876,9 @@ pub async fn sandbox_create_with_bootstrap(
              Or deploy a new gateway: openshell gateway start"
         ));
     }
-    let requested_gpu = gpu || from.is_some_and(source_requests_gpu);
+    let requested_gpu = gpu
+        || from.is_some_and(source_requests_gpu)
+        || resource_specs_request_named_resource(resources, "nvidia.com/gpu");
     let (tls, server, gateway_name) =
         crate::bootstrap::run_bootstrap(remote, ssh_key, requested_gpu).await?;
     // Disable bootstrap inside sandbox_create so that a transient connection
@@ -1876,6 +1891,8 @@ pub async fn sandbox_create_with_bootstrap(
         upload,
         keep,
         gpu,
+        resources,
+        runtime_class_name,
         editor,
         remote,
         ssh_key,
@@ -1931,6 +1948,8 @@ pub async fn sandbox_create(
     upload: Option<&(String, Option<String>, bool)>,
     keep: bool,
     gpu: bool,
+    resources: &[String],
+    runtime_class_name: Option<&str>,
     editor: Option<Editor>,
     remote: Option<&str>,
     ssh_key: Option<&str>,
@@ -1988,7 +2007,9 @@ pub async fn sandbox_create(
                 eprintln!();
                 return Err(err);
             }
-            let requested_gpu = gpu || from.is_some_and(source_requests_gpu);
+            let requested_gpu = gpu
+                || from.is_some_and(source_requests_gpu)
+                || resource_specs_request_named_resource(resources, "nvidia.com/gpu");
             let (new_tls, new_server, _) =
                 crate::bootstrap::run_bootstrap(remote, ssh_key, requested_gpu).await?;
             let c = grpc_client(&new_server, &new_tls)
@@ -2017,6 +2038,17 @@ pub async fn sandbox_create(
         None => None,
     };
     let requested_gpu = gpu || image.as_deref().is_some_and(image_requests_gpu);
+    let requested_gpu_resource = resource_specs_request_named_resource(resources, "nvidia.com/gpu");
+    if gpu && requested_gpu_resource {
+        return Err(miette!(
+            "--gpu conflicts with --resource nvidia.com/gpu=<COUNT>; use only one GPU request path"
+        ));
+    }
+    if gpu && runtime_class_name.is_some() {
+        return Err(miette!(
+            "--runtime-class cannot be combined with --gpu because GPU sandboxes force the runtime class automatically"
+        ));
+    }
 
     let inferred_types: Vec<String> = inferred_provider_type(command).into_iter().collect();
     let configured_providers = ensure_required_providers(
@@ -2029,10 +2061,7 @@ pub async fn sandbox_create(
 
     let policy = load_sandbox_policy(policy)?;
 
-    let template = image.map(|img| SandboxTemplate {
-        image: img,
-        ..SandboxTemplate::default()
-    });
+    let template = build_sandbox_template(image, runtime_class_name, resources)?;
 
     let request = CreateSandboxRequest {
         spec: Some(SandboxSpec {
@@ -2547,6 +2576,87 @@ fn image_requests_gpu(image: &str) -> bool {
     image_name.contains("gpu")
 }
 
+fn parse_resource_limits(resources: &[String]) -> Result<Option<Struct>> {
+    if resources.is_empty() {
+        return Ok(None);
+    }
+
+    let mut limits = BTreeMap::new();
+    for resource in resources {
+        let (name, count) = resource.split_once('=').ok_or_else(|| {
+            miette!("invalid --resource value '{resource}'; expected <NAME>=<COUNT>")
+        })?;
+        if name.is_empty() {
+            return Err(miette!(
+                "invalid --resource value '{resource}'; resource name is empty"
+            ));
+        }
+        let count = count.parse::<u32>().map_err(|_| {
+            miette!("invalid --resource value '{resource}'; count must be a positive integer")
+        })?;
+        if count == 0 {
+            return Err(miette!(
+                "invalid --resource value '{resource}'; count must be greater than zero"
+            ));
+        }
+        if limits
+            .insert(name.to_string(), string_value(count.to_string()))
+            .is_some()
+        {
+            return Err(miette!(
+                "duplicate --resource entry for '{name}'; specify each resource only once"
+            ));
+        }
+    }
+
+    Ok(Some(Struct {
+        fields: BTreeMap::from([("limits".to_string(), struct_value(limits))]),
+    }))
+}
+
+fn resource_specs_request_named_resource(resources: &[String], resource_name: &str) -> bool {
+    resources
+        .iter()
+        .filter_map(|resource| resource.split_once('='))
+        .any(|(name, _)| name == resource_name)
+}
+
+fn build_sandbox_template(
+    image: Option<String>,
+    runtime_class_name: Option<&str>,
+    resources: &[String],
+) -> Result<Option<SandboxTemplate>> {
+    let mut template = SandboxTemplate::default();
+    let mut changed = false;
+
+    if let Some(image) = image {
+        template.image = image;
+        changed = true;
+    }
+    if let Some(runtime_class_name) = runtime_class_name.filter(|value| !value.is_empty()) {
+        template.runtime_class_name = runtime_class_name.to_string();
+        changed = true;
+    }
+    if let Some(resource_limits) = parse_resource_limits(resources)? {
+        template.resources = Some(resource_limits);
+        changed = true;
+    }
+
+    Ok(changed.then_some(template))
+}
+
+fn string_value(value: impl Into<String>) -> Value {
+    Value {
+        kind: Some(Kind::StringValue(value.into())),
+    }
+}
+
+fn struct_value(fields: BTreeMap<String, Value>) -> Value {
+    Value {
+        kind: Some(Kind::StructValue(Struct { fields })),
+    }
+}
+
 /// Build a Dockerfile and push the resulting image into the gateway.
 ///
 /// Returns the image tag that was built so the caller can use it for sandbox
@@ -4987,12 +5097,14 @@ fn format_timestamp_ms(ms: i64) -> String {
 #[cfg(test)]
 mod tests {
     use super::{
-        GatewayControlTarget, TlsOptions, format_gateway_select_header,
+        GatewayControlTarget, TlsOptions, build_sandbox_template, format_gateway_select_header,
         format_gateway_select_items, gateway_auth_label, gateway_select_with, gateway_type_label,
         git_sync_files, http_health_check, image_requests_gpu, inferred_provider_type,
-        parse_cli_setting_value, parse_credential_pairs, provisioning_timeout_message,
-        ready_false_condition_message, resolve_gateway_control_target_from, sandbox_should_persist,
-        shell_escape, source_requests_gpu, validate_gateway_name, validate_ssh_host,
+        parse_cli_setting_value, parse_credential_pairs, parse_resource_limits,
+        provisioning_timeout_message, ready_false_condition_message,
+        resolve_gateway_control_target_from, resource_specs_request_named_resource,
+        sandbox_should_persist, shell_escape, source_requests_gpu, validate_gateway_name,
+        validate_ssh_host,
     };
     use crate::TEST_ENV_LOCK;
     use hyper::StatusCode;
@@ -5006,6 +5118,7 @@ mod tests {
 
     use openshell_bootstrap::GatewayMetadata;
     use openshell_core::proto::{SandboxCondition, SandboxStatus};
+    use prost_types::value::Kind;
 
     struct EnvVarGuard {
         key: &'static str,
@@ -5206,6 +5319,78 @@ mod tests {
         assert!(sandbox_should_persist(false, Some(&spec)));
     }
 
+    #[test]
+    fn parse_resource_limits_builds_limits_struct() {
+        let parsed = parse_resource_limits(&[
+            "nvidia.com/gpu=2".to_string(),
+            "vendor.com/vf=1".to_string(),
+        ])
+        .expect("parse resource limits")
+        .expect("resource limits should exist");
+
+        let limits = parsed
+            .fields
+            .get("limits")
+            .expect("limits field should exist");
+        let Kind::StructValue(limits) = limits.kind.clone().expect("limits kind should exist")
+        else {
+            panic!("limits should be a struct");
+        };
+        let Kind::StringValue(gpu) = limits
+            .fields
+            .get("nvidia.com/gpu")
+            .and_then(|value| value.kind.clone())
+            .expect("gpu limit should exist")
+        else {
+            panic!("gpu limit should be a string");
+        };
+        let Kind::StringValue(vf) = limits
+            .fields
+            .get("vendor.com/vf")
+            .and_then(|value| value.kind.clone())
+            .expect("vf limit should exist")
+        else {
+            panic!("vf limit should be a string");
+        };
+
+        assert_eq!(gpu, "2");
+        assert_eq!(vf, "1");
+    }
+
+    #[test]
+    fn parse_resource_limits_rejects_invalid_values() {
+        let err = parse_resource_limits(&["nvidia.com/gpu=two".to_string()])
+            .expect_err("invalid counts should fail");
+        assert!(err.to_string().contains("count must be a positive integer"));
+    }
+
+    #[test]
+    fn build_sandbox_template_includes_runtime_class_and_resources() {
+        let template = build_sandbox_template(
+            Some("example.com/test:latest".to_string()),
+            Some("nvidia"),
+            &["nvidia.com/gpu=2".to_string()],
+        )
+        .expect("build template")
+        .expect("template should exist");
+
+        assert_eq!(template.image, "example.com/test:latest");
+        assert_eq!(template.runtime_class_name, "nvidia");
+        assert!(template.resources.is_some());
+    }
+
+    #[test]
+    fn resource_specs_request_named_resource_detects_gpu_requests() {
+        assert!(resource_specs_request_named_resource(
+            &["nvidia.com/gpu=2".to_string()],
+            "nvidia.com/gpu"
+        ));
+        assert!(!resource_specs_request_named_resource(
+            &["vendor.com/vf=1".to_string()],
+            "nvidia.com/gpu"
+        ));
+    }
+
     #[test]
     fn image_requests_gpu_matches_known_gpu_image_names() {
         for image in [
diff --git a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs
index d5d39f08..e82918dd 100644
--- a/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs
+++ b/crates/openshell-cli/tests/sandbox_create_lifecycle_integration.rs
@@ -541,6 +541,8 @@ async fn sandbox_create_keeps_command_sessions_by_default() {
         None,
         true,
         false,
+        &[],
+        None,
         None,
         None,
         None,
@@ -581,6 +583,8 @@ async fn sandbox_create_deletes_command_sessions_with_no_keep() {
         None,
         false,
         false,
+        &[],
+        None,
         None,
         None,
         None,
@@ -624,6 +628,8 @@ async fn sandbox_create_deletes_shell_sessions_with_no_keep() {
         None,
         false,
         false,
+        &[],
+        None,
         None,
         None,
         None,
@@ -667,6 +673,8 @@ async fn sandbox_create_keeps_sandbox_with_hidden_keep_flag() {
         None,
         true,
         false,
+        &[],
+        None,
         None,
         None,
         None,
@@ -698,6 +706,11 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() {
     let _env = test_env(&fake_ssh_dir, &xdg_dir);
     let tls = test_tls(&server);
     install_fake_ssh(&fake_ssh_dir);
+    let forward_port = std::net::TcpListener::bind("127.0.0.1:0")
+        .unwrap()
+        .local_addr()
+        .unwrap()
+        .port();
 
     run::sandbox_create(
         &server.endpoint,
@@ -707,12 +720,14 @@ async fn sandbox_create_keeps_sandbox_with_forwarding() {
         None,
         false,
         false,
+        &[],
+        None,
         None,
         None,
         None,
         &[],
         None,
-        Some(openshell_core::forward::ForwardSpec::new(8080)),
+        Some(openshell_core::forward::ForwardSpec::new(forward_port)),
         &["echo".to_string(), "OK".to_string()],
         Some(false),
         Some(false),
diff --git a/docs/sandboxes/manage-sandboxes.md b/docs/sandboxes/manage-sandboxes.md
index 5306120a..4bf9374b 100644
--- a/docs/sandboxes/manage-sandboxes.md
+++ b/docs/sandboxes/manage-sandboxes.md
@@ -57,6 +57,19 @@ To request GPU resources, add `--gpu`:
 $ openshell sandbox create --gpu -- claude
 ```
 
+For device-plugin resources beyond the one-GPU shorthand, use `--resource` and
+optionally `--runtime-class`:
+
+```console
+$ openshell sandbox create --resource nvidia.com/gpu=2 --runtime-class nvidia -- claude
+$ openshell sandbox create --resource vendor.com/vf=1 --runtime-class kata -- my-tool
+```
+
+`--resource` accepts `<NAME>=<COUNT>` and may be repeated. This is useful for
+device-style resources exposed through Kubernetes, such as GPUs, VFs, or other
+vendor accelerators. Keep using `--gpu` when you only need the existing
+single-GPU shorthand.
+
 ### Custom Containers
 
 Use `--from` to create a sandbox from a pre-built community package, a local directory, or a container image: