From 22c5b3d7c726f273455ec486dac71a24be0884bb Mon Sep 17 00:00:00 2001 From: Antonio Jimeno Yepes Date: Thu, 26 Mar 2026 14:05:06 -0700 Subject: [PATCH 1/9] feat: integrate Tensorlake Firecracker microVM sandbox for isolated shell execution Adds a Tensorlake sandbox backend as an alternative to local shell execution. When enabled via --tensorlake , all shell tool calls are routed through an isolated Firecracker microVM instead of running directly on the host machine. Changes: - crates/forge_infra/src/tensorlake.rs: new TensorlakeCommandExecutor implementing CommandInfra; lazy sandbox provisioning via Tensorlake REST API; best-effort cleanup on Drop; unit tests for config defaults and executor creation - crates/forge_infra/src/forge_infra.rs: CommandExecutor enum dispatches between Local and Tensorlake backends; ForgeInfra::new_with_tensorlake() constructor - crates/forge_infra/src/lib.rs: export tensorlake module and TensorlakeConfig - crates/forge_api/src/forge_api.rs: ForgeAPI::init_with_tensorlake() factory - crates/forge_api/src/lib.rs: re-export TensorlakeConfig for callers - crates/forge_main/src/cli.rs: --tensorlake flag (also reads TENSORLAKE_API_KEY env var) - crates/forge_main/src/main.rs: wire CLI flag to init_with_tensorlake() Co-Authored-By: ForgeCode --- crates/forge_api/src/forge_api.rs | 12 +- crates/forge_api/src/lib.rs | 1 + crates/forge_infra/src/forge_infra.rs | 83 +++++++-- crates/forge_infra/src/lib.rs | 2 + crates/forge_infra/src/tensorlake.rs | 239 ++++++++++++++++++++++++++ crates/forge_main/src/cli.rs | 8 + crates/forge_main/src/main.rs | 15 +- 7 files changed, 347 insertions(+), 13 deletions(-) create mode 100644 crates/forge_infra/src/tensorlake.rs diff --git a/crates/forge_api/src/forge_api.rs b/crates/forge_api/src/forge_api.rs index a2dc7847a7..36e713648c 100644 --- a/crates/forge_api/src/forge_api.rs +++ b/crates/forge_api/src/forge_api.rs @@ -12,7 +12,7 @@ use forge_app::{ WorkspaceService, }; use forge_domain::{Agent, ConsoleWriter, InitAuth, LoginInfo, *}; -use forge_infra::ForgeInfra; +use forge_infra::{ForgeInfra, TensorlakeConfig}; use forge_repo::ForgeRepo; use forge_services::ForgeServices; use forge_stream::MpscStream; @@ -41,6 +41,7 @@ impl ForgeAPI { } impl ForgeAPI>, ForgeRepo> { + /// Initialises the API with the local command executor (default mode). pub fn init(restricted: bool, cwd: PathBuf) -> Self { let infra = Arc::new(ForgeInfra::new(restricted, cwd)); let repo = Arc::new(ForgeRepo::new(infra.clone())); @@ -48,6 +49,15 @@ impl ForgeAPI>, ForgeRepo> { ForgeAPI::new(app, repo) } + /// Initialises the API routing all shell commands through an isolated + /// Tensorlake Firecracker microVM sandbox. + pub fn init_with_tensorlake(restricted: bool, cwd: PathBuf, config: TensorlakeConfig) -> Self { + let infra = Arc::new(ForgeInfra::new_with_tensorlake(restricted, cwd, config)); + let repo = Arc::new(ForgeRepo::new(infra.clone())); + let app = Arc::new(ForgeServices::new(repo.clone())); + ForgeAPI::new(app, repo) + } + pub async fn get_skills_internal(&self) -> Result> { use forge_domain::SkillRepository; self.infra.load_skills().await diff --git a/crates/forge_api/src/lib.rs b/crates/forge_api/src/lib.rs index bf407ba049..5b686cf4cc 100644 --- a/crates/forge_api/src/lib.rs +++ b/crates/forge_api/src/lib.rs @@ -6,3 +6,4 @@ pub use forge_api::*; pub use forge_app::dto::*; pub use forge_app::{Plan, UsageInfo, UserUsage}; pub use forge_domain::{Agent, *}; +pub use forge_infra::TensorlakeConfig; diff --git a/crates/forge_infra/src/forge_infra.rs b/crates/forge_infra/src/forge_infra.rs index 1a79373409..59d81407b4 100644 --- a/crates/forge_infra/src/forge_infra.rs +++ b/crates/forge_infra/src/forge_infra.rs @@ -32,8 +32,20 @@ use crate::http::ForgeHttpInfra; use crate::inquire::ForgeInquire; use crate::mcp_client::ForgeMcpClient; use crate::mcp_server::ForgeMcpServer; +use crate::tensorlake::{TensorlakeCommandExecutor, TensorlakeConfig}; use crate::walker::ForgeWalkerService; +/// Abstraction over the available command execution backends. +/// +/// Commands are either executed locally on the host machine via +/// `ForgeCommandExecutorService`, or inside an isolated Tensorlake +/// Firecracker microVM via `TensorlakeCommandExecutor`. +#[derive(Clone)] +enum CommandExecutor { + Local(Arc), + Tensorlake(Arc), +} + #[derive(Clone)] pub struct ForgeInfra { // TODO: Drop the "Service" suffix. Use names like ForgeFileReader, ForgeFileWriter, @@ -45,7 +57,7 @@ pub struct ForgeInfra { file_meta_service: Arc, create_dirs_service: Arc, directory_reader_service: Arc, - command_executor_service: Arc, + command_executor: CommandExecutor, inquire_service: Arc, mcp_server: ForgeMcpServer, walker_service: Arc, @@ -56,6 +68,8 @@ pub struct ForgeInfra { } impl ForgeInfra { + /// Creates a `ForgeInfra` instance that executes shell commands locally on + /// the host machine. pub fn new(restricted: bool, cwd: PathBuf) -> Self { let environment_service = Arc::new(ForgeEnvironmentInfra::new(restricted, cwd)); let env = environment_service.get_environment(); @@ -77,9 +91,48 @@ impl ForgeInfra { file_meta_service, create_dirs_service: Arc::new(ForgeCreateDirsService), directory_reader_service, - command_executor_service: Arc::new(ForgeCommandExecutorService::new( - env.clone(), - output_printer.clone(), + command_executor: CommandExecutor::Local(Arc::new( + ForgeCommandExecutorService::new(env.clone(), output_printer.clone()), + )), + inquire_service: Arc::new(ForgeInquire::new()), + mcp_server: ForgeMcpServer, + walker_service: Arc::new(ForgeWalkerService::new()), + strategy_factory: Arc::new(ForgeAuthStrategyFactory::new()), + http_service, + grpc_client, + output_printer, + } + } + + /// Creates a `ForgeInfra` instance that executes shell commands inside an + /// isolated Tensorlake Firecracker microVM sandbox. + /// + /// A single sandbox is provisioned lazily on the first command execution and + /// reused for the entire session. The sandbox is terminated when the returned + /// `ForgeInfra` is dropped. + pub fn new_with_tensorlake(restricted: bool, cwd: PathBuf, config: TensorlakeConfig) -> Self { + let environment_service = Arc::new(ForgeEnvironmentInfra::new(restricted, cwd)); + let env = environment_service.get_environment(); + + let file_write_service = Arc::new(ForgeFileWriteService::new()); + let http_service = Arc::new(ForgeHttpInfra::new(env.clone(), file_write_service.clone())); + let file_read_service = Arc::new(ForgeFileReadService::new()); + let file_meta_service = Arc::new(ForgeFileMetaService); + let directory_reader_service = + Arc::new(ForgeDirectoryReaderService::new(env.parallel_file_reads)); + let grpc_client = Arc::new(ForgeGrpcClient::new(env.workspace_server_url.clone())); + let output_printer = Arc::new(StdConsoleWriter::default()); + + Self { + file_read_service, + file_write_service, + file_remove_service: Arc::new(ForgeFileRemoveService::new()), + environment_service, + file_meta_service, + create_dirs_service: Arc::new(ForgeCreateDirsService), + directory_reader_service, + command_executor: CommandExecutor::Tensorlake(Arc::new( + TensorlakeCommandExecutor::new(config), )), inquire_service: Arc::new(ForgeInquire::new()), mcp_server: ForgeMcpServer, @@ -194,9 +247,14 @@ impl CommandInfra for ForgeInfra { silent: bool, env_vars: Option>, ) -> anyhow::Result { - self.command_executor_service - .execute_command(command, working_dir, silent, env_vars) - .await + match &self.command_executor { + CommandExecutor::Local(svc) => { + svc.execute_command(command, working_dir, silent, env_vars).await + } + CommandExecutor::Tensorlake(svc) => { + svc.execute_command(command, working_dir, silent, env_vars).await + } + } } async fn execute_command_raw( @@ -205,9 +263,14 @@ impl CommandInfra for ForgeInfra { working_dir: PathBuf, env_vars: Option>, ) -> anyhow::Result { - self.command_executor_service - .execute_command_raw(command, working_dir, env_vars) - .await + match &self.command_executor { + CommandExecutor::Local(svc) => { + svc.execute_command_raw(command, working_dir, env_vars).await + } + CommandExecutor::Tensorlake(svc) => { + svc.execute_command_raw(command, working_dir, env_vars).await + } + } } } diff --git a/crates/forge_infra/src/lib.rs b/crates/forge_infra/src/lib.rs index 4823e80e62..e8cedfc342 100644 --- a/crates/forge_infra/src/lib.rs +++ b/crates/forge_infra/src/lib.rs @@ -1,5 +1,6 @@ mod console; pub mod executor; +pub mod tensorlake; mod auth; mod env; @@ -23,3 +24,4 @@ pub use console::StdConsoleWriter; pub use executor::ForgeCommandExecutorService; pub use forge_infra::*; pub use kv_storage::CacacheStorage; +pub use tensorlake::{TensorlakeCommandExecutor, TensorlakeConfig}; diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs new file mode 100644 index 0000000000..aaea6e283c --- /dev/null +++ b/crates/forge_infra/src/tensorlake.rs @@ -0,0 +1,239 @@ +use std::path::PathBuf; +use std::sync::Arc; + +use anyhow::{Context, anyhow}; +use async_trait::async_trait; +use forge_app::CommandInfra; +use forge_domain::CommandOutput; +use serde::{Deserialize, Serialize}; +use tokio::sync::Mutex; + +const TENSORLAKE_API_BASE: &str = "https://api.tensorlake.ai"; + +/// Configuration for a Tensorlake sandbox session. +#[derive(Debug, Clone)] +pub struct TensorlakeConfig { + /// Tensorlake API key used for all requests. + pub api_key: String, + /// Number of vCPUs to allocate for the sandbox (default: 2.0). + pub cpus: f64, + /// Memory in megabytes to allocate for the sandbox (default: 4096). + pub memory_mb: u64, + /// Inactivity timeout in seconds before the sandbox auto-suspends (default: 3600). + pub timeout_secs: u64, +} + +impl TensorlakeConfig { + /// Creates a new `TensorlakeConfig` with the given API key and sensible defaults. + pub fn new(api_key: String) -> Self { + Self { api_key, cpus: 2.0, memory_mb: 4096, timeout_secs: 3600 } + } +} + +/// Response returned by the Tensorlake sandboxes create endpoint. +#[derive(Debug, Deserialize)] +struct CreateSandboxResponse { + sandbox_id: String, +} + +/// Response returned by the Tensorlake sandbox command execution endpoint. +#[derive(Debug, Deserialize)] +struct RunCommandResponse { + stdout: String, + stderr: String, + exit_code: Option, +} + +/// Request body for executing a command inside a sandbox. +#[derive(Debug, Serialize)] +struct RunCommandRequest<'a> { + cmd: &'a str, + args: Vec<&'a str>, + cwd: String, +} + +/// Infrastructure implementation that executes shell commands inside an isolated +/// Tensorlake Firecracker microVM sandbox. +/// +/// A single sandbox is created lazily on the first command execution and reused +/// for the lifetime of the `TensorlakeCommandExecutor` instance. The sandbox is +/// terminated when the executor is dropped. +#[derive(Clone)] +pub struct TensorlakeCommandExecutor { + config: TensorlakeConfig, + client: reqwest::Client, + /// Lazily initialized sandbox ID, shared across clones via `Arc>`. + sandbox_id: Arc>>, +} + +impl TensorlakeCommandExecutor { + /// Creates a new executor with the provided Tensorlake configuration. + pub fn new(config: TensorlakeConfig) -> Self { + Self { + config, + client: reqwest::Client::new(), + sandbox_id: Arc::new(Mutex::new(None)), + } + } + + /// Returns the sandbox ID, creating a new sandbox if one has not been + /// provisioned yet for this session. + async fn ensure_sandbox(&self) -> anyhow::Result { + let mut guard = self.sandbox_id.lock().await; + if let Some(id) = guard.as_deref() { + return Ok(id.to_string()); + } + + let id = self.create_sandbox().await?; + tracing::info!(sandbox_id = %id, "Tensorlake sandbox created"); + *guard = Some(id.clone()); + Ok(id) + } + + /// Provisions a new Tensorlake sandbox and returns its ID. + async fn create_sandbox(&self) -> anyhow::Result { + let url = format!("{}/v2/sandboxes", TENSORLAKE_API_BASE); + let body = serde_json::json!({ + "cpus": self.config.cpus, + "memory_mb": self.config.memory_mb, + "timeout_secs": self.config.timeout_secs, + }); + + let response = self + .client + .post(&url) + .bearer_auth(&self.config.api_key) + .json(&body) + .send() + .await + .context("Failed to send create sandbox request to Tensorlake")?; + + if !response.status().is_success() { + let status = response.status(); + let text = response.text().await.unwrap_or_default(); + return Err(anyhow!( + "Tensorlake sandbox creation failed with status {status}: {text}" + )); + } + + let parsed: CreateSandboxResponse = response + .json() + .await + .context("Failed to parse Tensorlake create sandbox response")?; + + Ok(parsed.sandbox_id) + } +} + +impl Drop for TensorlakeCommandExecutor { + /// Schedules sandbox termination when the executor is dropped. + /// + /// A best-effort background task is spawned so that the `Drop` impl + /// remains synchronous while still cleaning up remote resources. + fn drop(&mut self) { + let sandbox_id = self.sandbox_id.clone(); + let client = self.client.clone(); + let api_key = self.config.api_key.clone(); + + tokio::spawn(async move { + let guard = sandbox_id.lock().await; + if let Some(id) = guard.as_deref() { + let url = format!("{}/v2/sandboxes/{}", TENSORLAKE_API_BASE, id); + let _ = client.delete(&url).bearer_auth(&api_key).send().await; + tracing::debug!(sandbox_id = %id, "Tensorlake sandbox cleanup on drop"); + } + }); + } +} + +#[async_trait] +impl CommandInfra for TensorlakeCommandExecutor { + /// Executes a shell command inside the Tensorlake sandbox and returns the captured output. + async fn execute_command( + &self, + command: String, + working_dir: PathBuf, + _silent: bool, + _env_vars: Option>, + ) -> anyhow::Result { + let sandbox_id = self.ensure_sandbox().await?; + let cwd = working_dir.to_string_lossy().to_string(); + + let url = format!("{}/v2/sandboxes/{}/commands", TENSORLAKE_API_BASE, sandbox_id); + let request = RunCommandRequest { cmd: "sh", args: vec!["-c", &command], cwd }; + + tracing::info!(command = %command, sandbox_id = %sandbox_id, "Executing command in Tensorlake sandbox"); + + let response = self + .client + .post(&url) + .bearer_auth(&self.config.api_key) + .json(&request) + .send() + .await + .context("Failed to send command execution request to Tensorlake sandbox")?; + + if !response.status().is_success() { + let status = response.status(); + let text = response.text().await.unwrap_or_default(); + return Err(anyhow!( + "Tensorlake command execution failed with status {status}: {text}" + )); + } + + let result: RunCommandResponse = response + .json() + .await + .context("Failed to parse Tensorlake command execution response")?; + + Ok(CommandOutput { + stdout: result.stdout, + stderr: result.stderr, + exit_code: result.exit_code.map(|c| c as i32), + command, + }) + } + + /// Interactive (raw) commands are not supported in Tensorlake sandbox mode. + /// + /// Raw command execution requires an attached TTY which is not available over + /// the Tensorlake HTTP API. This method always returns an error directing the + /// caller to use `execute_command` instead. + async fn execute_command_raw( + &self, + _command: &str, + _working_dir: PathBuf, + _env_vars: Option>, + ) -> anyhow::Result { + Err(anyhow!( + "Interactive (raw) command execution is not supported in Tensorlake sandbox mode. \ + Use non-interactive commands instead." + )) + } +} + +#[cfg(test)] +mod tests { + use pretty_assertions::assert_eq; + + use super::*; + + #[test] + fn test_tensorlake_config_defaults() { + let fixture = TensorlakeConfig::new("test-api-key".to_string()); + + assert_eq!(fixture.api_key, "test-api-key"); + assert_eq!(fixture.cpus, 2.0); + assert_eq!(fixture.memory_mb, 4096); + assert_eq!(fixture.timeout_secs, 3600); + } + + #[test] + fn test_tensorlake_executor_creation() { + let config = TensorlakeConfig::new("test-api-key".to_string()); + let executor = TensorlakeCommandExecutor::new(config.clone()); + + assert_eq!(executor.config.api_key, config.api_key); + assert_eq!(executor.config.cpus, config.cpus); + } +} diff --git a/crates/forge_main/src/cli.rs b/crates/forge_main/src/cli.rs index 13d6845afc..952b0cf3b4 100644 --- a/crates/forge_main/src/cli.rs +++ b/crates/forge_main/src/cli.rs @@ -50,6 +50,14 @@ pub struct Cli { #[arg(long)] pub sandbox: Option, + /// Run all shell commands inside an isolated Tensorlake Firecracker microVM sandbox. + /// + /// Requires a valid Tensorlake API key. Can also be supplied via the + /// TENSORLAKE_API_KEY environment variable. When set, every shell tool call + /// is routed through a remote Tensorlake sandbox instead of the local host. + #[arg(long, env = "TENSORLAKE_API_KEY")] + pub tensorlake: Option, + /// Enable verbose logging output. #[arg(long, default_value_t = false)] pub verbose: bool, diff --git a/crates/forge_main/src/main.rs b/crates/forge_main/src/main.rs index 0c54498fe2..9025377892 100644 --- a/crates/forge_main/src/main.rs +++ b/crates/forge_main/src/main.rs @@ -4,7 +4,7 @@ use std::path::PathBuf; use anyhow::Result; use clap::Parser; -use forge_api::ForgeAPI; +use forge_api::{ForgeAPI, TensorlakeConfig}; use forge_domain::TitleFormat; use forge_main::{Cli, Sandbox, TitleDisplayExt, UI, tracker}; @@ -64,7 +64,18 @@ async fn main() -> Result<()> { // Initialize the ForgeAPI with the restricted mode if specified let restricted = cli.restricted; - let mut ui = UI::init(cli, move || ForgeAPI::init(restricted, cwd.clone()))?; + let tensorlake_key = cli.tensorlake.clone(); + let mut ui = UI::init(cli, move || { + if let Some(api_key) = tensorlake_key { + ForgeAPI::init_with_tensorlake( + restricted, + cwd.clone(), + TensorlakeConfig::new(api_key), + ) + } else { + ForgeAPI::init(restricted, cwd.clone()) + } + })?; ui.run().await; Ok(()) From 18e8f002bee74e71529e557acd28766b804d0f2b Mon Sep 17 00:00:00 2001 From: Antonio Jimeno Yepes Date: Thu, 26 Mar 2026 14:11:13 -0700 Subject: [PATCH 2/9] fix(tensorlake): use #[tokio::test] for executor creation test The Drop impl calls tokio::spawn, which requires a Tokio runtime context. Switching to #[tokio::test] provides that runtime during the test. Co-Authored-By: ForgeCode --- crates/forge_infra/src/tensorlake.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs index aaea6e283c..cd1ec65424 100644 --- a/crates/forge_infra/src/tensorlake.rs +++ b/crates/forge_infra/src/tensorlake.rs @@ -228,8 +228,8 @@ mod tests { assert_eq!(fixture.timeout_secs, 3600); } - #[test] - fn test_tensorlake_executor_creation() { + #[tokio::test] + async fn test_tensorlake_executor_creation() { let config = TensorlakeConfig::new("test-api-key".to_string()); let executor = TensorlakeCommandExecutor::new(config.clone()); From 08a6853c9e585f6ed6f5fe6bbe7ba8c7f07d7d77 Mon Sep 17 00:00:00 2001 From: Antonio Jimeno Yepes Date: Thu, 26 Mar 2026 15:22:58 -0700 Subject: [PATCH 3/9] fix: remap host working dirs and fix clap env feature for tensorlake - Remap macOS/home paths to /tmp when executing commands in the remote Linux microVM so process spawn never fails with ENOENT - Fix test_sandbox_proxy_url to use #[tokio::test] since Drop calls tokio::spawn and panics outside a runtime - Add 'env' feature to clap so TENSORLAKE_API_KEY env var is respected - Fix borrow issue in tensorlake key closure in main.rs Co-Authored-By: ForgeCode --- Cargo.toml | 2 +- crates/forge_infra/src/tensorlake.rs | 190 +++++++++++++++++++++++---- crates/forge_main/src/main.rs | 4 +- 3 files changed, 165 insertions(+), 31 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 9a9f43786a..e79b4065ee 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,7 @@ aws-smithy-runtime = { version = "1.10", features = ["connector-hyper-0-14-x", " base64 = "0.22.1" bytes = "1.11.1" chrono = { version = "0.4.44", features = ["serde"] } -clap = { version = "4.6.0", features = ["derive"] } +clap = { version = "4.6.0", features = ["derive", "env"] } clap_complete = "4.6.0" colored = "3.1.1" console = "0.16.3" diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs index cd1ec65424..ab81e567d1 100644 --- a/crates/forge_infra/src/tensorlake.rs +++ b/crates/forge_infra/src/tensorlake.rs @@ -36,20 +36,24 @@ struct CreateSandboxResponse { sandbox_id: String, } -/// Response returned by the Tensorlake sandbox command execution endpoint. +/// Response returned by the per-sandbox process execution endpoint. #[derive(Debug, Deserialize)] -struct RunCommandResponse { - stdout: String, - stderr: String, - exit_code: Option, +struct StartProcessResponse { + pid: u64, } -/// Request body for executing a command inside a sandbox. +/// Combined output response from the per-sandbox output endpoint. +#[derive(Debug, Deserialize)] +struct ProcessOutputResponse { + lines: Vec, +} + +/// Request body for starting a process inside a sandbox. #[derive(Debug, Serialize)] -struct RunCommandRequest<'a> { - cmd: &'a str, +struct StartProcessRequest<'a> { + command: &'a str, args: Vec<&'a str>, - cwd: String, + working_dir: String, } /// Infrastructure implementation that executes shell commands inside an isolated @@ -92,10 +96,12 @@ impl TensorlakeCommandExecutor { /// Provisions a new Tensorlake sandbox and returns its ID. async fn create_sandbox(&self) -> anyhow::Result { - let url = format!("{}/v2/sandboxes", TENSORLAKE_API_BASE); + let url = format!("{}/sandboxes", TENSORLAKE_API_BASE); let body = serde_json::json!({ - "cpus": self.config.cpus, - "memory_mb": self.config.memory_mb, + "resources": { + "cpus": self.config.cpus, + "memory_mb": self.config.memory_mb, + }, "timeout_secs": self.config.timeout_secs, }); @@ -121,8 +127,49 @@ impl TensorlakeCommandExecutor { .await .context("Failed to parse Tensorlake create sandbox response")?; + // Poll until the sandbox is running (it starts as "pending") + self.wait_for_running(&parsed.sandbox_id).await?; + Ok(parsed.sandbox_id) } + + /// Polls the sandbox status endpoint until the sandbox reaches the "running" state. + async fn wait_for_running(&self, sandbox_id: &str) -> anyhow::Result<()> { + let url = format!("{}/sandboxes/{}", TENSORLAKE_API_BASE, sandbox_id); + for attempt in 0..60 { + tokio::time::sleep(std::time::Duration::from_secs(if attempt == 0 { 1 } else { 2 })) + .await; + + let resp = self + .client + .get(&url) + .bearer_auth(&self.config.api_key) + .send() + .await + .context("Failed to poll sandbox status")?; + + if !resp.status().is_success() { + continue; + } + + let body: serde_json::Value = + resp.json().await.context("Failed to parse sandbox status")?; + + match body.get("status").and_then(|s| s.as_str()) { + Some("running") => return Ok(()), + Some("terminated") => { + return Err(anyhow!("Tensorlake sandbox terminated unexpectedly")) + } + _ => continue, + } + } + Err(anyhow!("Timed out waiting for Tensorlake sandbox to become running")) + } + + /// Returns the per-sandbox proxy base URL for API calls. + fn sandbox_proxy_url(&self, sandbox_id: &str) -> String { + format!("https://{}.sandbox.tensorlake.ai/api/v1", sandbox_id) + } } impl Drop for TensorlakeCommandExecutor { @@ -138,7 +185,7 @@ impl Drop for TensorlakeCommandExecutor { tokio::spawn(async move { let guard = sandbox_id.lock().await; if let Some(id) = guard.as_deref() { - let url = format!("{}/v2/sandboxes/{}", TENSORLAKE_API_BASE, id); + let url = format!("{}/sandboxes/{}", TENSORLAKE_API_BASE, id); let _ = client.delete(&url).bearer_auth(&api_key).send().await; tracing::debug!(sandbox_id = %id, "Tensorlake sandbox cleanup on drop"); } @@ -157,41 +204,58 @@ impl CommandInfra for TensorlakeCommandExecutor { _env_vars: Option>, ) -> anyhow::Result { let sandbox_id = self.ensure_sandbox().await?; - let cwd = working_dir.to_string_lossy().to_string(); + let proxy = self.sandbox_proxy_url(&sandbox_id); + + // The sandbox is a remote Linux microVM. Host-specific paths (e.g. macOS + // `/Users/…`) do not exist inside the sandbox. Fall back to `/tmp` so + // that process spawn never fails with "No such file or directory". + let cwd = { + let host_path = working_dir.to_string_lossy(); + if host_path.starts_with("/Users/") || host_path.starts_with("/home/") { + "/tmp".to_string() + } else { + host_path.into_owned() + } + }; - let url = format!("{}/v2/sandboxes/{}/commands", TENSORLAKE_API_BASE, sandbox_id); - let request = RunCommandRequest { cmd: "sh", args: vec!["-c", &command], cwd }; + // Use `sh -c ` so pipes and redirects work correctly. + let start_url = format!("{}/processes", proxy); + let request = StartProcessRequest { command: "sh", args: vec!["-c", &command], working_dir: cwd }; tracing::info!(command = %command, sandbox_id = %sandbox_id, "Executing command in Tensorlake sandbox"); let response = self .client - .post(&url) + .post(&start_url) .bearer_auth(&self.config.api_key) .json(&request) .send() .await - .context("Failed to send command execution request to Tensorlake sandbox")?; + .context("Failed to start process in Tensorlake sandbox")?; if !response.status().is_success() { let status = response.status(); let text = response.text().await.unwrap_or_default(); return Err(anyhow!( - "Tensorlake command execution failed with status {status}: {text}" + "Tensorlake process start failed with status {status}: {text}" )); } - let result: RunCommandResponse = response + let started: StartProcessResponse = response .json() .await - .context("Failed to parse Tensorlake command execution response")?; - - Ok(CommandOutput { - stdout: result.stdout, - stderr: result.stderr, - exit_code: result.exit_code.map(|c| c as i32), - command, - }) + .context("Failed to parse Tensorlake process start response")?; + + let pid = started.pid; + + // Poll until the process exits. + let exit_code = self.wait_for_process_exit(&proxy, pid).await?; + + // Collect stdout and stderr. + let stdout = self.get_process_output(&proxy, pid, "stdout").await?; + let stderr = self.get_process_output(&proxy, pid, "stderr").await?; + + Ok(CommandOutput { stdout, stderr, exit_code: Some(exit_code), command }) } /// Interactive (raw) commands are not supported in Tensorlake sandbox mode. @@ -212,6 +276,68 @@ impl CommandInfra for TensorlakeCommandExecutor { } } +impl TensorlakeCommandExecutor { + /// Polls the process status until it exits, returning the exit code. + async fn wait_for_process_exit(&self, proxy: &str, pid: u64) -> anyhow::Result { + let url = format!("{}/processes/{}", proxy, pid); + for _ in 0..300 { + tokio::time::sleep(std::time::Duration::from_millis(500)).await; + + let resp = self + .client + .get(&url) + .bearer_auth(&self.config.api_key) + .send() + .await + .context("Failed to poll process status")?; + + if !resp.status().is_success() { + continue; + } + + let body: serde_json::Value = + resp.json().await.context("Failed to parse process status")?; + + if body.get("status").and_then(|s| s.as_str()) == Some("exited") { + let code = body + .get("exit_code") + .and_then(|c| c.as_i64()) + .unwrap_or(0) as i32; + return Ok(code); + } + } + Err(anyhow!("Timed out waiting for Tensorlake process {} to exit", pid)) + } + + /// Fetches the captured output lines (stdout or stderr) for a completed process. + async fn get_process_output( + &self, + proxy: &str, + pid: u64, + stream: &str, + ) -> anyhow::Result { + let url = format!("{}/processes/{}/{}", proxy, pid, stream); + let resp = self + .client + .get(&url) + .bearer_auth(&self.config.api_key) + .send() + .await + .with_context(|| format!("Failed to fetch process {stream} for pid {pid}"))?; + + if !resp.status().is_success() { + return Ok(String::new()); + } + + let output: ProcessOutputResponse = resp + .json() + .await + .with_context(|| format!("Failed to parse process {stream} response"))?; + + Ok(output.lines.join("\n")) + } +} + #[cfg(test)] mod tests { use pretty_assertions::assert_eq; @@ -236,4 +362,12 @@ mod tests { assert_eq!(executor.config.api_key, config.api_key); assert_eq!(executor.config.cpus, config.cpus); } + + #[tokio::test] + async fn test_sandbox_proxy_url() { + let config = TensorlakeConfig::new("key".to_string()); + let executor = TensorlakeCommandExecutor::new(config); + let url = executor.sandbox_proxy_url("abc123"); + assert_eq!(url, "https://abc123.sandbox.tensorlake.ai/api/v1"); + } } diff --git a/crates/forge_main/src/main.rs b/crates/forge_main/src/main.rs index 9025377892..1cbb94d777 100644 --- a/crates/forge_main/src/main.rs +++ b/crates/forge_main/src/main.rs @@ -66,11 +66,11 @@ async fn main() -> Result<()> { let restricted = cli.restricted; let tensorlake_key = cli.tensorlake.clone(); let mut ui = UI::init(cli, move || { - if let Some(api_key) = tensorlake_key { + if let Some(api_key) = &tensorlake_key { ForgeAPI::init_with_tensorlake( restricted, cwd.clone(), - TensorlakeConfig::new(api_key), + TensorlakeConfig::new(api_key.to_string()), ) } else { ForgeAPI::init(restricted, cwd.clone()) From 29bc6695d6c019dbd1e7dcbd86639f5bedec8b59 Mon Sep 17 00:00:00 2001 From: Antonio Jimeno Yepes Date: Thu, 26 Mar 2026 15:34:38 -0700 Subject: [PATCH 4/9] fix: address Graphite review feedback on tensorlake executor - Add AtomicBool cleanup_scheduled guard to Drop so that clones sharing the same sandbox_id Arc only schedule one DELETE request, not one per clone - Parse env_vars (Vec of KEY=VALUE) into a HashMap and forward to the Tensorlake process API which accepts {"KEY": "value"} dicts - Add env field to StartProcessRequest with skip_serializing_if so requests without env vars are unaffected - Add tests for both: clone-dedup guard and env var parsing Co-Authored-By: ForgeCode --- crates/forge_infra/src/tensorlake.rs | 79 ++++++++++++++++++++++++++-- 1 file changed, 75 insertions(+), 4 deletions(-) diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs index ab81e567d1..f3b483dc3a 100644 --- a/crates/forge_infra/src/tensorlake.rs +++ b/crates/forge_infra/src/tensorlake.rs @@ -1,5 +1,7 @@ +use std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; +use std::sync::atomic::{AtomicBool, Ordering}; use anyhow::{Context, anyhow}; use async_trait::async_trait; @@ -54,6 +56,10 @@ struct StartProcessRequest<'a> { command: &'a str, args: Vec<&'a str>, working_dir: String, + /// Optional environment variables to inject, as `KEY=VALUE` pairs parsed + /// into a map. The Tensorlake API accepts `{"KEY": "VALUE"}` format. + #[serde(skip_serializing_if = "Option::is_none")] + env: Option>, } /// Infrastructure implementation that executes shell commands inside an isolated @@ -68,6 +74,9 @@ pub struct TensorlakeCommandExecutor { client: reqwest::Client, /// Lazily initialized sandbox ID, shared across clones via `Arc>`. sandbox_id: Arc>>, + /// Guards against duplicate DELETE requests when multiple clones are dropped. + /// Only the first clone to set this flag will schedule sandbox termination. + cleanup_scheduled: Arc, } impl TensorlakeCommandExecutor { @@ -77,6 +86,7 @@ impl TensorlakeCommandExecutor { config, client: reqwest::Client::new(), sandbox_id: Arc::new(Mutex::new(None)), + cleanup_scheduled: Arc::new(AtomicBool::new(false)), } } @@ -175,9 +185,15 @@ impl TensorlakeCommandExecutor { impl Drop for TensorlakeCommandExecutor { /// Schedules sandbox termination when the executor is dropped. /// - /// A best-effort background task is spawned so that the `Drop` impl - /// remains synchronous while still cleaning up remote resources. + /// Uses an `AtomicBool` flag so that only the first clone to be dropped + /// schedules the DELETE request — preventing duplicate API calls when + /// multiple clones of the executor share the same `sandbox_id` Arc. fn drop(&mut self) { + // Only the first dropper proceeds; all subsequent clones are no-ops. + if self.cleanup_scheduled.swap(true, Ordering::SeqCst) { + return; + } + let sandbox_id = self.sandbox_id.clone(); let client = self.client.clone(); let api_key = self.config.api_key.clone(); @@ -201,7 +217,7 @@ impl CommandInfra for TensorlakeCommandExecutor { command: String, working_dir: PathBuf, _silent: bool, - _env_vars: Option>, + env_vars: Option>, ) -> anyhow::Result { let sandbox_id = self.ensure_sandbox().await?; let proxy = self.sandbox_proxy_url(&sandbox_id); @@ -218,9 +234,26 @@ impl CommandInfra for TensorlakeCommandExecutor { } }; + // Parse `KEY=VALUE` strings into the dict format the Tensorlake API expects. + let env = env_vars.map(|vars| { + vars.into_iter() + .filter_map(|kv| { + let mut parts = kv.splitn(2, '='); + let key = parts.next()?.to_string(); + let value = parts.next().unwrap_or("").to_string(); + Some((key, value)) + }) + .collect::>() + }); + // Use `sh -c ` so pipes and redirects work correctly. let start_url = format!("{}/processes", proxy); - let request = StartProcessRequest { command: "sh", args: vec!["-c", &command], working_dir: cwd }; + let request = StartProcessRequest { + command: "sh", + args: vec!["-c", &command], + working_dir: cwd, + env, + }; tracing::info!(command = %command, sandbox_id = %sandbox_id, "Executing command in Tensorlake sandbox"); @@ -370,4 +403,42 @@ mod tests { let url = executor.sandbox_proxy_url("abc123"); assert_eq!(url, "https://abc123.sandbox.tensorlake.ai/api/v1"); } + + #[tokio::test] + async fn test_clone_does_not_duplicate_cleanup() { + let config = TensorlakeConfig::new("key".to_string()); + let executor = TensorlakeCommandExecutor::new(config); + let clone = executor.clone(); + + // First drop sets the flag and would schedule cleanup. + assert!(!executor.cleanup_scheduled.swap(true, Ordering::SeqCst)); + // Second drop (the clone) sees the flag already set and is a no-op. + assert!(clone.cleanup_scheduled.swap(true, Ordering::SeqCst)); + } + + #[test] + fn test_env_vars_parsed_into_map() { + let vars = vec![ + "FOO=bar".to_string(), + "BAZ=qux=with=equals".to_string(), + "EMPTY=".to_string(), + ]; + + let actual: HashMap = vars + .into_iter() + .filter_map(|kv| { + let mut parts = kv.splitn(2, '='); + let key = parts.next()?.to_string(); + let value = parts.next().unwrap_or("").to_string(); + Some((key, value)) + }) + .collect(); + + let mut expected = HashMap::new(); + expected.insert("FOO".to_string(), "bar".to_string()); + expected.insert("BAZ".to_string(), "qux=with=equals".to_string()); + expected.insert("EMPTY".to_string(), "".to_string()); + + assert_eq!(actual, expected); + } } From 6235e17d94e92e5e2f023f8f42447560d4572725 Mon Sep 17 00:00:00 2001 From: Antonio Jimeno Yepes Date: Thu, 26 Mar 2026 15:48:41 -0700 Subject: [PATCH 5/9] fix: terminate Tensorlake sandbox on last clone dropped, not first MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The AtomicBool cleanup_scheduled fired the DELETE on the first clone to be dropped, while other clones sharing the same Arc> could still be alive and using the sandbox. Fix by introducing a SandboxGuard inner struct that owns the sandbox state and implements Drop. Wrapping it in Arc inside TensorlakeCommandExecutor means the DELETE is issued exactly once — when Arc::strong_count reaches zero (last clone dropped). Co-Authored-By: ForgeCode --- crates/forge_infra/src/tensorlake.rs | 101 +++++++++++++++------------ 1 file changed, 58 insertions(+), 43 deletions(-) diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs index f3b483dc3a..c2d359d748 100644 --- a/crates/forge_infra/src/tensorlake.rs +++ b/crates/forge_infra/src/tensorlake.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::path::PathBuf; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, Ordering}; use anyhow::{Context, anyhow}; use async_trait::async_trait; @@ -62,32 +61,69 @@ struct StartProcessRequest<'a> { env: Option>, } +/// Owns the sandbox lifetime and issues the DELETE on drop. +/// +/// Wrapped in `Arc` inside `TensorlakeCommandExecutor` so that the sandbox is +/// terminated exactly once — when the last clone of the executor is dropped and +/// the `Arc` reference count reaches zero. +struct SandboxGuard { + sandbox_id: Arc>>, + client: reqwest::Client, + api_key: String, +} + +impl Drop for SandboxGuard { + /// Spawns a best-effort DELETE request to terminate the sandbox. + /// + /// Because `Drop` is synchronous the HTTP call is dispatched as a detached + /// Tokio task. This is sufficient for normal program exit where the runtime + /// continues to run briefly after `main` returns; the `timeout_secs` + /// configured on the sandbox acts as the ultimate safety net for abrupt + /// termination (e.g. SIGKILL). + fn drop(&mut self) { + let sandbox_id = self.sandbox_id.clone(); + let client = self.client.clone(); + let api_key = self.api_key.clone(); + + tokio::spawn(async move { + let guard = sandbox_id.lock().await; + if let Some(id) = guard.as_deref() { + let url = format!("{}/sandboxes/{}", TENSORLAKE_API_BASE, id); + let _ = client.delete(&url).bearer_auth(&api_key).send().await; + tracing::debug!(sandbox_id = %id, "Tensorlake sandbox terminated"); + } + }); + } +} + /// Infrastructure implementation that executes shell commands inside an isolated /// Tensorlake Firecracker microVM sandbox. /// /// A single sandbox is created lazily on the first command execution and reused /// for the lifetime of the `TensorlakeCommandExecutor` instance. The sandbox is -/// terminated when the executor is dropped. +/// terminated when the last clone of the executor is dropped. #[derive(Clone)] pub struct TensorlakeCommandExecutor { config: TensorlakeConfig, client: reqwest::Client, /// Lazily initialized sandbox ID, shared across clones via `Arc>`. sandbox_id: Arc>>, - /// Guards against duplicate DELETE requests when multiple clones are dropped. - /// Only the first clone to set this flag will schedule sandbox termination. - cleanup_scheduled: Arc, + /// Dropping the last clone of this `Arc` triggers `SandboxGuard::drop`, + /// which issues the sandbox DELETE exactly once. + _guard: Arc, } impl TensorlakeCommandExecutor { /// Creates a new executor with the provided Tensorlake configuration. pub fn new(config: TensorlakeConfig) -> Self { - Self { - config, - client: reqwest::Client::new(), - sandbox_id: Arc::new(Mutex::new(None)), - cleanup_scheduled: Arc::new(AtomicBool::new(false)), - } + let client = reqwest::Client::new(); + let sandbox_id = Arc::new(Mutex::new(None)); + let guard = Arc::new(SandboxGuard { + sandbox_id: sandbox_id.clone(), + client: client.clone(), + api_key: config.api_key.clone(), + }); + Self { config, client, sandbox_id, _guard: guard } } /// Returns the sandbox ID, creating a new sandbox if one has not been @@ -182,33 +218,6 @@ impl TensorlakeCommandExecutor { } } -impl Drop for TensorlakeCommandExecutor { - /// Schedules sandbox termination when the executor is dropped. - /// - /// Uses an `AtomicBool` flag so that only the first clone to be dropped - /// schedules the DELETE request — preventing duplicate API calls when - /// multiple clones of the executor share the same `sandbox_id` Arc. - fn drop(&mut self) { - // Only the first dropper proceeds; all subsequent clones are no-ops. - if self.cleanup_scheduled.swap(true, Ordering::SeqCst) { - return; - } - - let sandbox_id = self.sandbox_id.clone(); - let client = self.client.clone(); - let api_key = self.config.api_key.clone(); - - tokio::spawn(async move { - let guard = sandbox_id.lock().await; - if let Some(id) = guard.as_deref() { - let url = format!("{}/sandboxes/{}", TENSORLAKE_API_BASE, id); - let _ = client.delete(&url).bearer_auth(&api_key).send().await; - tracing::debug!(sandbox_id = %id, "Tensorlake sandbox cleanup on drop"); - } - }); - } -} - #[async_trait] impl CommandInfra for TensorlakeCommandExecutor { /// Executes a shell command inside the Tensorlake sandbox and returns the captured output. @@ -405,15 +414,21 @@ mod tests { } #[tokio::test] - async fn test_clone_does_not_duplicate_cleanup() { + async fn test_cleanup_fires_on_last_clone_dropped() { let config = TensorlakeConfig::new("key".to_string()); let executor = TensorlakeCommandExecutor::new(config); let clone = executor.clone(); - // First drop sets the flag and would schedule cleanup. - assert!(!executor.cleanup_scheduled.swap(true, Ordering::SeqCst)); - // Second drop (the clone) sees the flag already set and is a no-op. - assert!(clone.cleanup_scheduled.swap(true, Ordering::SeqCst)); + // Both the original and the clone share the same Arc. + assert_eq!(Arc::strong_count(&executor._guard), 2); + + // Dropping the clone reduces the count to 1 — cleanup not yet triggered. + drop(clone); + assert_eq!(Arc::strong_count(&executor._guard), 1); + + // Dropping the original reduces the count to 0 — cleanup fires here. + // (No sandbox was provisioned so the spawned task is a no-op.) + drop(executor); } #[test] From d619bff57ad8f36ca6a4d742ac51fe67431a460c Mon Sep 17 00:00:00 2001 From: Antonio Jimeno Yepes Date: Thu, 26 Mar 2026 16:10:29 -0700 Subject: [PATCH 6/9] Updated tensorlake sandboxes termination --- Cargo.lock | 1 + crates/forge_infra/Cargo.toml | 1 + crates/forge_infra/src/tensorlake.rs | 122 ++++++++++++++++++++++----- crates/forge_main/src/main.rs | 7 +- 4 files changed, 108 insertions(+), 23 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 623065ff20..46a5388726 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2072,6 +2072,7 @@ dependencies = [ "google-cloud-auth", "http 1.4.0", "libsqlite3-sys", + "mockito", "oauth2", "pretty_assertions", "reqwest 0.12.28", diff --git a/crates/forge_infra/Cargo.toml b/crates/forge_infra/Cargo.toml index 222279bd41..0495f9357a 100644 --- a/crates/forge_infra/Cargo.toml +++ b/crates/forge_infra/Cargo.toml @@ -53,3 +53,4 @@ serial_test = "3.4" fake = { version = "5.1.0", features = ["derive"] } pretty_assertions.workspace = true forge_domain = { path = "../forge_domain" } +mockito = { workspace = true } diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs index c2d359d748..0af96924f3 100644 --- a/crates/forge_infra/src/tensorlake.rs +++ b/crates/forge_infra/src/tensorlake.rs @@ -22,12 +22,21 @@ pub struct TensorlakeConfig { pub memory_mb: u64, /// Inactivity timeout in seconds before the sandbox auto-suspends (default: 3600). pub timeout_secs: u64, + /// Base URL for the Tensorlake API (default: `https://api.tensorlake.ai`). + /// Overridable in tests to point at a local mock server. + pub base_url: String, } impl TensorlakeConfig { /// Creates a new `TensorlakeConfig` with the given API key and sensible defaults. pub fn new(api_key: String) -> Self { - Self { api_key, cpus: 2.0, memory_mb: 4096, timeout_secs: 3600 } + Self { + api_key, + cpus: 2.0, + memory_mb: 4096, + timeout_secs: 3600, + base_url: TENSORLAKE_API_BASE.to_string(), + } } } @@ -70,29 +79,34 @@ struct SandboxGuard { sandbox_id: Arc>>, client: reqwest::Client, api_key: String, + base_url: String, } impl Drop for SandboxGuard { - /// Spawns a best-effort DELETE request to terminate the sandbox. + /// Synchronously terminates the sandbox by sending a DELETE request. /// - /// Because `Drop` is synchronous the HTTP call is dispatched as a detached - /// Tokio task. This is sufficient for normal program exit where the runtime - /// continues to run briefly after `main` returns; the `timeout_secs` - /// configured on the sandbox acts as the ultimate safety net for abrupt - /// termination (e.g. SIGKILL). + /// Uses `block_in_place` + `block_on` so the HTTP call completes before + /// `Drop` returns, ensuring the sandbox is always cleaned up on normal + /// exit, Ctrl-C, and panics that unwind the stack. SIGKILL remains + /// unhandled by design; `timeout_secs` acts as the safety net there. fn drop(&mut self) { let sandbox_id = self.sandbox_id.clone(); let client = self.client.clone(); let api_key = self.api_key.clone(); - - tokio::spawn(async move { - let guard = sandbox_id.lock().await; - if let Some(id) = guard.as_deref() { - let url = format!("{}/sandboxes/{}", TENSORLAKE_API_BASE, id); - let _ = client.delete(&url).bearer_auth(&api_key).send().await; - tracing::debug!(sandbox_id = %id, "Tensorlake sandbox terminated"); - } - }); + let base_url = self.base_url.clone(); + + if let Ok(handle) = tokio::runtime::Handle::try_current() { + tokio::task::block_in_place(|| { + handle.block_on(async move { + let guard = sandbox_id.lock().await; + if let Some(id) = guard.as_deref() { + let url = format!("{}/sandboxes/{}", base_url, id); + let _ = client.delete(&url).bearer_auth(&api_key).send().await; + tracing::debug!(sandbox_id = %id, "Tensorlake sandbox terminated"); + } + }); + }); + } } } @@ -122,6 +136,7 @@ impl TensorlakeCommandExecutor { sandbox_id: sandbox_id.clone(), client: client.clone(), api_key: config.api_key.clone(), + base_url: config.base_url.clone(), }); Self { config, client, sandbox_id, _guard: guard } } @@ -142,7 +157,7 @@ impl TensorlakeCommandExecutor { /// Provisions a new Tensorlake sandbox and returns its ID. async fn create_sandbox(&self) -> anyhow::Result { - let url = format!("{}/sandboxes", TENSORLAKE_API_BASE); + let url = format!("{}/sandboxes", self.config.base_url); let body = serde_json::json!({ "resources": { "cpus": self.config.cpus, @@ -181,7 +196,7 @@ impl TensorlakeCommandExecutor { /// Polls the sandbox status endpoint until the sandbox reaches the "running" state. async fn wait_for_running(&self, sandbox_id: &str) -> anyhow::Result<()> { - let url = format!("{}/sandboxes/{}", TENSORLAKE_API_BASE, sandbox_id); + let url = format!("{}/sandboxes/{}", self.config.base_url, sandbox_id); for attempt in 0..60 { tokio::time::sleep(std::time::Duration::from_secs(if attempt == 0 { 1 } else { 2 })) .await; @@ -394,9 +409,10 @@ mod tests { assert_eq!(fixture.cpus, 2.0); assert_eq!(fixture.memory_mb, 4096); assert_eq!(fixture.timeout_secs, 3600); + assert_eq!(fixture.base_url, "https://api.tensorlake.ai"); } - #[tokio::test] + #[tokio::test(flavor = "multi_thread")] async fn test_tensorlake_executor_creation() { let config = TensorlakeConfig::new("test-api-key".to_string()); let executor = TensorlakeCommandExecutor::new(config.clone()); @@ -405,7 +421,7 @@ mod tests { assert_eq!(executor.config.cpus, config.cpus); } - #[tokio::test] + #[tokio::test(flavor = "multi_thread")] async fn test_sandbox_proxy_url() { let config = TensorlakeConfig::new("key".to_string()); let executor = TensorlakeCommandExecutor::new(config); @@ -413,7 +429,7 @@ mod tests { assert_eq!(url, "https://abc123.sandbox.tensorlake.ai/api/v1"); } - #[tokio::test] + #[tokio::test(flavor = "multi_thread")] async fn test_cleanup_fires_on_last_clone_dropped() { let config = TensorlakeConfig::new("key".to_string()); let executor = TensorlakeCommandExecutor::new(config); @@ -431,6 +447,70 @@ mod tests { drop(executor); } + /// Verifies that dropping the executor sends a DELETE /sandboxes/{id} request + /// synchronously — i.e. the request completes before `drop` returns, so the + /// mock server receives it before `assert_hits` is checked. + #[tokio::test(flavor = "multi_thread")] + async fn test_sandbox_terminated_on_drop() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("DELETE", "/sandboxes/test-sandbox-id") + .with_status(200) + .expect(1) + .create_async() + .await; + + let mut config = TensorlakeConfig::new("test-key".to_string()); + config.base_url = server.url(); + + let executor = TensorlakeCommandExecutor::new(config); + + // Manually plant a sandbox ID so the guard has something to DELETE. + { + let mut guard = executor.sandbox_id.lock().await; + *guard = Some("test-sandbox-id".to_string()); + } + + // Drop the executor — SandboxGuard::drop must complete the DELETE before + // returning, so the mock expectation is satisfied synchronously. + drop(executor); + + mock.assert_async().await; + } + + /// Verifies that a clone and the original share the guard, and the DELETE is + /// sent exactly once — only when the last clone is dropped. + #[tokio::test(flavor = "multi_thread")] + async fn test_sandbox_terminated_exactly_once_across_clones() { + let mut server = mockito::Server::new_async().await; + + let mock = server + .mock("DELETE", "/sandboxes/shared-sandbox") + .with_status(200) + .expect(1) // must fire exactly once, not twice + .create_async() + .await; + + let mut config = TensorlakeConfig::new("test-key".to_string()); + config.base_url = server.url(); + + let executor = TensorlakeCommandExecutor::new(config); + let clone = executor.clone(); + + { + let mut guard = executor.sandbox_id.lock().await; + *guard = Some("shared-sandbox".to_string()); + } + + // Dropping the clone must NOT trigger the DELETE yet. + drop(clone); + // Dropping the original must trigger the DELETE exactly once. + drop(executor); + + mock.assert_async().await; + } + #[test] fn test_env_vars_parsed_into_map() { let vars = vec![ diff --git a/crates/forge_main/src/main.rs b/crates/forge_main/src/main.rs index 1cbb94d777..59feccbe0a 100644 --- a/crates/forge_main/src/main.rs +++ b/crates/forge_main/src/main.rs @@ -19,7 +19,11 @@ async fn main() -> Result<()> { // available let _ = rustls::crypto::ring::default_provider().install_default(); - // Set up panic hook for better error display + // Set up panic hook for better error display. + // Important: do NOT call `std::process::exit` here. Exiting inside the hook + // skips stack unwinding, which prevents `Drop` implementations (such as the + // Tensorlake `SandboxGuard`) from running. Returning from the hook lets Rust + // unwind the stack normally so all destructors fire before the process exits. panic::set_hook(Box::new(|panic_info| { let message = if let Some(s) = panic_info.payload().downcast_ref::<&str>() { s.to_string() @@ -31,7 +35,6 @@ async fn main() -> Result<()> { println!("{}", TitleFormat::error(message.to_string()).display()); tracker::error_blocking(message); - std::process::exit(1); })); // Initialize and run the UI From ff65b950979b2a83711cda5f4cef89874c812187 Mon Sep 17 00:00:00 2001 From: Antonio Jimeno Yepes Date: Thu, 26 Mar 2026 16:19:13 -0700 Subject: [PATCH 7/9] fix(tensorlake): propagate HTTP errors instead of silently swallowing them - wait_for_process_exit: return Err on non-success HTTP status instead of continuing to poll, so API failures fail fast rather than spinning for 150 s before timing out - get_process_output: return Err on non-success HTTP status instead of returning an empty string, preventing silent output loss Addresses Graphite review comments on PR #2707. Co-Authored-By: ForgeCode --- crates/forge_infra/src/tensorlake.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs index 0af96924f3..87f443963b 100644 --- a/crates/forge_infra/src/tensorlake.rs +++ b/crates/forge_infra/src/tensorlake.rs @@ -349,7 +349,8 @@ impl TensorlakeCommandExecutor { .context("Failed to poll process status")?; if !resp.status().is_success() { - continue; + let status = resp.status(); + return Err(anyhow!("Failed to poll process {pid} status: HTTP {status}")); } let body: serde_json::Value = @@ -383,7 +384,8 @@ impl TensorlakeCommandExecutor { .with_context(|| format!("Failed to fetch process {stream} for pid {pid}"))?; if !resp.status().is_success() { - return Ok(String::new()); + let status = resp.status(); + return Err(anyhow!("Failed to fetch {stream} for pid {pid}: HTTP {status}")); } let output: ProcessOutputResponse = resp From 254eaebe6e65b5bd87118590159f332d326cb8f0 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 27 Mar 2026 04:15:53 +0000 Subject: [PATCH 8/9] [autofix.ci] apply automated fixes --- crates/forge_infra/src/forge_infra.rs | 25 +++++---- crates/forge_infra/src/tensorlake.rs | 80 +++++++++++++++++---------- crates/forge_main/src/cli.rs | 3 +- 3 files changed, 67 insertions(+), 41 deletions(-) diff --git a/crates/forge_infra/src/forge_infra.rs b/crates/forge_infra/src/forge_infra.rs index 59d81407b4..102c4598e4 100644 --- a/crates/forge_infra/src/forge_infra.rs +++ b/crates/forge_infra/src/forge_infra.rs @@ -91,9 +91,10 @@ impl ForgeInfra { file_meta_service, create_dirs_service: Arc::new(ForgeCreateDirsService), directory_reader_service, - command_executor: CommandExecutor::Local(Arc::new( - ForgeCommandExecutorService::new(env.clone(), output_printer.clone()), - )), + command_executor: CommandExecutor::Local(Arc::new(ForgeCommandExecutorService::new( + env.clone(), + output_printer.clone(), + ))), inquire_service: Arc::new(ForgeInquire::new()), mcp_server: ForgeMcpServer, walker_service: Arc::new(ForgeWalkerService::new()), @@ -107,9 +108,9 @@ impl ForgeInfra { /// Creates a `ForgeInfra` instance that executes shell commands inside an /// isolated Tensorlake Firecracker microVM sandbox. /// - /// A single sandbox is provisioned lazily on the first command execution and - /// reused for the entire session. The sandbox is terminated when the returned - /// `ForgeInfra` is dropped. + /// A single sandbox is provisioned lazily on the first command execution + /// and reused for the entire session. The sandbox is terminated when + /// the returned `ForgeInfra` is dropped. pub fn new_with_tensorlake(restricted: bool, cwd: PathBuf, config: TensorlakeConfig) -> Self { let environment_service = Arc::new(ForgeEnvironmentInfra::new(restricted, cwd)); let env = environment_service.get_environment(); @@ -249,10 +250,12 @@ impl CommandInfra for ForgeInfra { ) -> anyhow::Result { match &self.command_executor { CommandExecutor::Local(svc) => { - svc.execute_command(command, working_dir, silent, env_vars).await + svc.execute_command(command, working_dir, silent, env_vars) + .await } CommandExecutor::Tensorlake(svc) => { - svc.execute_command(command, working_dir, silent, env_vars).await + svc.execute_command(command, working_dir, silent, env_vars) + .await } } } @@ -265,10 +268,12 @@ impl CommandInfra for ForgeInfra { ) -> anyhow::Result { match &self.command_executor { CommandExecutor::Local(svc) => { - svc.execute_command_raw(command, working_dir, env_vars).await + svc.execute_command_raw(command, working_dir, env_vars) + .await } CommandExecutor::Tensorlake(svc) => { - svc.execute_command_raw(command, working_dir, env_vars).await + svc.execute_command_raw(command, working_dir, env_vars) + .await } } } diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs index 87f443963b..7e2071927c 100644 --- a/crates/forge_infra/src/tensorlake.rs +++ b/crates/forge_infra/src/tensorlake.rs @@ -20,7 +20,8 @@ pub struct TensorlakeConfig { pub cpus: f64, /// Memory in megabytes to allocate for the sandbox (default: 4096). pub memory_mb: u64, - /// Inactivity timeout in seconds before the sandbox auto-suspends (default: 3600). + /// Inactivity timeout in seconds before the sandbox auto-suspends (default: + /// 3600). pub timeout_secs: u64, /// Base URL for the Tensorlake API (default: `https://api.tensorlake.ai`). /// Overridable in tests to point at a local mock server. @@ -28,7 +29,8 @@ pub struct TensorlakeConfig { } impl TensorlakeConfig { - /// Creates a new `TensorlakeConfig` with the given API key and sensible defaults. + /// Creates a new `TensorlakeConfig` with the given API key and sensible + /// defaults. pub fn new(api_key: String) -> Self { Self { api_key, @@ -110,8 +112,8 @@ impl Drop for SandboxGuard { } } -/// Infrastructure implementation that executes shell commands inside an isolated -/// Tensorlake Firecracker microVM sandbox. +/// Infrastructure implementation that executes shell commands inside an +/// isolated Tensorlake Firecracker microVM sandbox. /// /// A single sandbox is created lazily on the first command execution and reused /// for the lifetime of the `TensorlakeCommandExecutor` instance. The sandbox is @@ -194,12 +196,17 @@ impl TensorlakeCommandExecutor { Ok(parsed.sandbox_id) } - /// Polls the sandbox status endpoint until the sandbox reaches the "running" state. + /// Polls the sandbox status endpoint until the sandbox reaches the + /// "running" state. async fn wait_for_running(&self, sandbox_id: &str) -> anyhow::Result<()> { let url = format!("{}/sandboxes/{}", self.config.base_url, sandbox_id); for attempt in 0..60 { - tokio::time::sleep(std::time::Duration::from_secs(if attempt == 0 { 1 } else { 2 })) - .await; + tokio::time::sleep(std::time::Duration::from_secs(if attempt == 0 { + 1 + } else { + 2 + })) + .await; let resp = self .client @@ -213,18 +220,22 @@ impl TensorlakeCommandExecutor { continue; } - let body: serde_json::Value = - resp.json().await.context("Failed to parse sandbox status")?; + let body: serde_json::Value = resp + .json() + .await + .context("Failed to parse sandbox status")?; match body.get("status").and_then(|s| s.as_str()) { Some("running") => return Ok(()), Some("terminated") => { - return Err(anyhow!("Tensorlake sandbox terminated unexpectedly")) + return Err(anyhow!("Tensorlake sandbox terminated unexpectedly")); } _ => continue, } } - Err(anyhow!("Timed out waiting for Tensorlake sandbox to become running")) + Err(anyhow!( + "Timed out waiting for Tensorlake sandbox to become running" + )) } /// Returns the per-sandbox proxy base URL for API calls. @@ -235,7 +246,8 @@ impl TensorlakeCommandExecutor { #[async_trait] impl CommandInfra for TensorlakeCommandExecutor { - /// Executes a shell command inside the Tensorlake sandbox and returns the captured output. + /// Executes a shell command inside the Tensorlake sandbox and returns the + /// captured output. async fn execute_command( &self, command: String, @@ -317,9 +329,9 @@ impl CommandInfra for TensorlakeCommandExecutor { /// Interactive (raw) commands are not supported in Tensorlake sandbox mode. /// - /// Raw command execution requires an attached TTY which is not available over - /// the Tensorlake HTTP API. This method always returns an error directing the - /// caller to use `execute_command` instead. + /// Raw command execution requires an attached TTY which is not available + /// over the Tensorlake HTTP API. This method always returns an error + /// directing the caller to use `execute_command` instead. async fn execute_command_raw( &self, _command: &str, @@ -350,24 +362,29 @@ impl TensorlakeCommandExecutor { if !resp.status().is_success() { let status = resp.status(); - return Err(anyhow!("Failed to poll process {pid} status: HTTP {status}")); + return Err(anyhow!( + "Failed to poll process {pid} status: HTTP {status}" + )); } - let body: serde_json::Value = - resp.json().await.context("Failed to parse process status")?; + let body: serde_json::Value = resp + .json() + .await + .context("Failed to parse process status")?; if body.get("status").and_then(|s| s.as_str()) == Some("exited") { - let code = body - .get("exit_code") - .and_then(|c| c.as_i64()) - .unwrap_or(0) as i32; + let code = body.get("exit_code").and_then(|c| c.as_i64()).unwrap_or(0) as i32; return Ok(code); } } - Err(anyhow!("Timed out waiting for Tensorlake process {} to exit", pid)) + Err(anyhow!( + "Timed out waiting for Tensorlake process {} to exit", + pid + )) } - /// Fetches the captured output lines (stdout or stderr) for a completed process. + /// Fetches the captured output lines (stdout or stderr) for a completed + /// process. async fn get_process_output( &self, proxy: &str, @@ -385,7 +402,9 @@ impl TensorlakeCommandExecutor { if !resp.status().is_success() { let status = resp.status(); - return Err(anyhow!("Failed to fetch {stream} for pid {pid}: HTTP {status}")); + return Err(anyhow!( + "Failed to fetch {stream} for pid {pid}: HTTP {status}" + )); } let output: ProcessOutputResponse = resp @@ -449,9 +468,10 @@ mod tests { drop(executor); } - /// Verifies that dropping the executor sends a DELETE /sandboxes/{id} request - /// synchronously — i.e. the request completes before `drop` returns, so the - /// mock server receives it before `assert_hits` is checked. + /// Verifies that dropping the executor sends a DELETE /sandboxes/{id} + /// request synchronously — i.e. the request completes before `drop` + /// returns, so the mock server receives it before `assert_hits` is + /// checked. #[tokio::test(flavor = "multi_thread")] async fn test_sandbox_terminated_on_drop() { let mut server = mockito::Server::new_async().await; @@ -481,8 +501,8 @@ mod tests { mock.assert_async().await; } - /// Verifies that a clone and the original share the guard, and the DELETE is - /// sent exactly once — only when the last clone is dropped. + /// Verifies that a clone and the original share the guard, and the DELETE + /// is sent exactly once — only when the last clone is dropped. #[tokio::test(flavor = "multi_thread")] async fn test_sandbox_terminated_exactly_once_across_clones() { let mut server = mockito::Server::new_async().await; diff --git a/crates/forge_main/src/cli.rs b/crates/forge_main/src/cli.rs index 952b0cf3b4..51b2c6019a 100644 --- a/crates/forge_main/src/cli.rs +++ b/crates/forge_main/src/cli.rs @@ -50,7 +50,8 @@ pub struct Cli { #[arg(long)] pub sandbox: Option, - /// Run all shell commands inside an isolated Tensorlake Firecracker microVM sandbox. + /// Run all shell commands inside an isolated Tensorlake Firecracker microVM + /// sandbox. /// /// Requires a valid Tensorlake API key. Can also be supplied via the /// TENSORLAKE_API_KEY environment variable. When set, every shell tool call From f825e12f0d76b5b30fc37ae37c8a712e1d7a4eea Mon Sep 17 00:00:00 2001 From: Antonio Jose Jimeno Yepes Date: Fri, 27 Mar 2026 09:19:55 -0700 Subject: [PATCH 9/9] Update crates/forge_infra/src/tensorlake.rs Co-authored-by: graphite-app[bot] <96075541+graphite-app[bot]@users.noreply.github.com> --- crates/forge_infra/src/tensorlake.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/forge_infra/src/tensorlake.rs b/crates/forge_infra/src/tensorlake.rs index 7e2071927c..44f31c11a5 100644 --- a/crates/forge_infra/src/tensorlake.rs +++ b/crates/forge_infra/src/tensorlake.rs @@ -261,15 +261,18 @@ impl CommandInfra for TensorlakeCommandExecutor { // The sandbox is a remote Linux microVM. Host-specific paths (e.g. macOS // `/Users/…`) do not exist inside the sandbox. Fall back to `/tmp` so // that process spawn never fails with "No such file or directory". + let cwd = { let host_path = working_dir.to_string_lossy(); - if host_path.starts_with("/Users/") || host_path.starts_with("/home/") { + // Handle both Unix and Windows paths + if host_path.starts_with("/Users/") || host_path.starts_with("/home/") || host_path.contains(":\\") { "/tmp".to_string() } else { host_path.into_owned() } }; + // Parse `KEY=VALUE` strings into the dict format the Tensorlake API expects. let env = env_vars.map(|vars| { vars.into_iter()