From 6f02b3eb03a03b2b63c02b507a889cb5bfcc5772 Mon Sep 17 00:00:00 2001 From: yan5xu Date: Mon, 9 Mar 2026 19:32:28 +0800 Subject: [PATCH 1/2] =?UTF-8?q?feat(cli):=20add=20`boxlite=20serve`=20?= =?UTF-8?q?=E2=80=94=20long-running=20REST=20API=20server?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Holds a single BoxliteRuntime and exposes box lifecycle + exec operations over HTTP. Solves runtime lock contention when multiple callers need concurrent access (e.g. Pinix Server executing nested clip-to-clip calls). Endpoints (9, MVP): - GET /v1/config - POST /v1/local/boxes (create) - GET /v1/local/boxes (list) - GET /v1/local/boxes/{id} (info) - POST /v1/local/boxes/{id}/start - POST /v1/local/boxes/{id}/stop - POST /v1/local/boxes/{id}/exec (returns execution_id, supports stdin) - GET /v1/local/boxes/{id}/executions/{eid}/output (SSE stream) - DELETE /v1/local/boxes/{id} SSE exec protocol: event: stdout, data: {"data":""} event: stderr, data: {"data":""} event: exit, data: {"exit_code":0,"duration_ms":1234} Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 93 ++++- boxlite-cli/Cargo.toml | 6 + boxlite-cli/src/cli.rs | 3 + boxlite-cli/src/commands/mod.rs | 1 + boxlite-cli/src/commands/serve.rs | 602 ++++++++++++++++++++++++++++++ boxlite-cli/src/main.rs | 1 + 6 files changed, 703 insertions(+), 3 deletions(-) create mode 100644 boxlite-cli/src/commands/serve.rs diff --git a/Cargo.lock b/Cargo.lock index 12cfaeb8..f2874651 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -198,14 +198,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" dependencies = [ "async-trait", - "axum-core", + "axum-core 0.4.5", "bytes", "futures-util", "http", "http-body", "http-body-util", "itoa", - "matchit", + "matchit 0.7.3", "memchr", "mime", "percent-encoding", @@ -218,6 +218,40 @@ dependencies = [ "tower-service", ] +[[package]] +name = "axum" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" +dependencies = [ + "axum-core 0.5.6", + "axum-macros", + "bytes", + "form_urlencoded", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit 0.8.4", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower 0.5.3", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "axum-core" version = "0.4.5" @@ -238,6 +272,36 @@ dependencies = [ "tower-service", ] +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-macros" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + [[package]] name = "base64" version = "0.13.1" @@ -396,6 +460,9 @@ version = "0.2.0" dependencies = [ "anyhow", "assert_cmd", + "async-stream", + "axum 0.8.8", + "base64 0.22.1", "boxlite", "boxlite-test-utils", "chrono", @@ -417,6 +484,7 @@ dependencies = [ "tempfile", "term_size", "tokio", + "tower-http", "tracing", "tracing-subscriber", "ulid", @@ -2051,6 +2119,12 @@ version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + [[package]] name = "memchr" version = "2.7.6" @@ -3597,6 +3671,17 @@ dependencies = [ "zmij", ] +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + [[package]] name = "serde_spanned" version = "1.0.4" @@ -4176,7 +4261,7 @@ checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ "async-stream", "async-trait", - "axum", + "axum 0.7.9", "base64 0.22.1", "bytes", "h2", @@ -4245,6 +4330,7 @@ dependencies = [ "tokio", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -4263,6 +4349,7 @@ dependencies = [ "tower 0.5.3", "tower-layer", "tower-service", + "tracing", ] [[package]] diff --git a/boxlite-cli/Cargo.toml b/boxlite-cli/Cargo.toml index d853eacb..e1db7de1 100644 --- a/boxlite-cli/Cargo.toml +++ b/boxlite-cli/Cargo.toml @@ -24,6 +24,12 @@ anyhow = "1.0" tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["fmt", "env-filter"] } +# HTTP server (serve subcommand) +async-stream = "0.3" +axum = { version = "0.8", features = ["macros"] } +base64 = "0.22" +tower-http = { version = "0.6", features = ["cors", "trace"] } + # Serialization (aligned with core boxlite) serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" diff --git a/boxlite-cli/src/cli.rs b/boxlite-cli/src/cli.rs index 45085665..6810691a 100644 --- a/boxlite-cli/src/cli.rs +++ b/boxlite-cli/src/cli.rs @@ -93,6 +93,9 @@ pub enum Commands { /// Display resource usage statistics for a box Stats(crate::commands::stats::StatsArgs), + /// Start a long-running REST API server + Serve(crate::commands::serve::ServeArgs), + /// Generate shell completion script (hidden from help) #[command(hide = true)] Completion(CompletionArgs), diff --git a/boxlite-cli/src/commands/mod.rs b/boxlite-cli/src/commands/mod.rs index 7d15fc35..5c2ba1e0 100644 --- a/boxlite-cli/src/commands/mod.rs +++ b/boxlite-cli/src/commands/mod.rs @@ -10,6 +10,7 @@ pub mod pull; pub mod restart; pub mod rm; pub mod run; +pub mod serve; pub mod start; pub mod stats; pub mod stop; diff --git a/boxlite-cli/src/commands/serve.rs b/boxlite-cli/src/commands/serve.rs new file mode 100644 index 00000000..0fe2e746 --- /dev/null +++ b/boxlite-cli/src/commands/serve.rs @@ -0,0 +1,602 @@ +//! `boxlite serve` — long-running REST API server. +//! +//! Holds a single BoxliteRuntime and exposes box lifecycle + exec operations +//! over HTTP so that multiple callers share the same runtime (no lock contention). + +use std::collections::HashMap; +use std::sync::Arc; +use std::time::Instant; + +use axum::extract::{Path, State}; +use axum::http::StatusCode; +use axum::response::sse::{Event, KeepAlive, Sse}; +use axum::response::{IntoResponse, Response}; +use axum::routing::{delete, get, post}; +use axum::{Json, Router}; +use base64::Engine; +use clap::Args; +use futures::StreamExt; +use serde::{Deserialize, Serialize}; +use tokio::sync::RwLock; + +use boxlite::{BoxCommand, BoxInfo, BoxOptions, BoxliteRuntime, Execution, LiteBox, RootfsSpec}; + +use crate::cli::GlobalFlags; + +// ============================================================================ +// CLI Args +// ============================================================================ + +#[derive(Args, Debug)] +pub struct ServeArgs { + /// Port to listen on + #[arg(long, default_value = "8100")] + pub port: u16, + + /// Host/address to bind to + #[arg(long, default_value = "0.0.0.0")] + pub host: String, +} + +// ============================================================================ +// Shared State +// ============================================================================ + +struct AppState { + runtime: BoxliteRuntime, + /// Cached box handles (box_id -> Arc). + boxes: RwLock>>, + /// Active executions (execution_id -> ActiveExecution). + executions: RwLock>, +} + +struct ActiveExecution { + execution: Execution, + started_at: Instant, +} + +/// Internal message type for multiplexing stdout/stderr SSE events. +enum SseItem { + Event(Event), + StreamDone, +} + +// ============================================================================ +// Wire Types (request/response JSON) +// ============================================================================ + +#[derive(Deserialize)] +struct CreateBoxRequest { + #[serde(default)] + name: Option, + #[serde(default)] + image: Option, + #[serde(default)] + rootfs_path: Option, + #[serde(default)] + cpus: Option, + #[serde(default)] + memory_mib: Option, + #[serde(default)] + disk_size_gb: Option, + #[serde(default)] + working_dir: Option, + #[serde(default)] + env: Option>, + #[serde(default)] + entrypoint: Option>, + #[serde(default)] + cmd: Option>, + #[serde(default)] + user: Option, + #[serde(default)] + auto_remove: Option, + #[serde(default)] + detach: Option, +} + +#[derive(Serialize)] +struct BoxResponse { + box_id: String, + name: Option, + status: String, + created_at: String, + updated_at: String, + pid: Option, + image: String, + cpus: u8, + memory_mib: u32, + labels: HashMap, +} + +#[derive(Serialize)] +struct ListBoxesResponse { + boxes: Vec, +} + +#[derive(Deserialize)] +struct ExecRequest { + command: String, + #[serde(default)] + args: Vec, + #[serde(default)] + stdin: Option, + #[serde(default)] + env: Option>, + #[serde(default)] + timeout_seconds: Option, + #[serde(default)] + working_dir: Option, + #[serde(default)] + tty: bool, +} + +#[derive(Serialize)] +struct ExecResponse { + execution_id: String, +} + +#[derive(Serialize)] +struct ErrorBody { + error: ErrorDetail, +} + +#[derive(Serialize)] +struct ErrorDetail { + message: String, + #[serde(rename = "type")] + error_type: String, + code: u16, +} + +#[derive(Serialize)] +struct ConfigResponse { + defaults: ConfigDefaults, +} + +#[derive(Serialize)] +struct ConfigDefaults { + cpus: u8, + memory_mib: u32, + disk_size_gb: u64, +} + +// ============================================================================ +// Conversions +// ============================================================================ + +fn box_info_to_response(info: &BoxInfo) -> BoxResponse { + BoxResponse { + box_id: info.id.to_string(), + name: info.name.clone(), + status: info.status.as_str().to_string(), + created_at: info.created_at.to_rfc3339(), + updated_at: info.last_updated.to_rfc3339(), + pid: info.pid, + image: info.image.clone(), + cpus: info.cpus, + memory_mib: info.memory_mib, + labels: info.labels.clone(), + } +} + +fn build_box_options(req: &CreateBoxRequest) -> BoxOptions { + let rootfs = if let Some(ref path) = req.rootfs_path { + RootfsSpec::RootfsPath(path.clone()) + } else { + RootfsSpec::Image(req.image.clone().unwrap_or_else(|| "alpine:latest".into())) + }; + + let env: Vec<(String, String)> = req + .env + .as_ref() + .map(|m| m.iter().map(|(k, v)| (k.clone(), v.clone())).collect()) + .unwrap_or_default(); + + BoxOptions { + rootfs, + cpus: req.cpus, + memory_mib: req.memory_mib, + disk_size_gb: req.disk_size_gb, + working_dir: req.working_dir.clone(), + env, + entrypoint: req.entrypoint.clone(), + cmd: req.cmd.clone(), + user: req.user.clone(), + auto_remove: req.auto_remove.unwrap_or(false), + detach: req.detach.unwrap_or(true), + ..Default::default() + } +} + +fn build_box_command(req: &ExecRequest) -> BoxCommand { + let mut cmd = BoxCommand::new(&req.command).args(req.args.iter().map(String::as_str)); + + if let Some(ref env_map) = req.env { + for (k, v) in env_map { + cmd = cmd.env(k, v); + } + } + if let Some(ref wd) = req.working_dir { + cmd = cmd.working_dir(wd); + } + if req.tty { + cmd = cmd.tty(true); + } + if let Some(secs) = req.timeout_seconds { + cmd = cmd.timeout(std::time::Duration::from_secs_f64(secs)); + } + cmd +} + +// ============================================================================ +// Error Helpers +// ============================================================================ + +fn error_response(status: StatusCode, message: impl Into, error_type: &str) -> Response { + let body = ErrorBody { + error: ErrorDetail { + message: message.into(), + error_type: error_type.to_string(), + code: status.as_u16(), + }, + }; + (status, Json(body)).into_response() +} + +fn classify_boxlite_error(err: &boxlite::BoxliteError) -> (StatusCode, &'static str) { + let msg = err.to_string().to_lowercase(); + if msg.contains("not found") { + (StatusCode::NOT_FOUND, "NotFoundError") + } else if msg.contains("already") || msg.contains("conflict") { + (StatusCode::CONFLICT, "ConflictError") + } else if msg.contains("unsupported") { + (StatusCode::BAD_REQUEST, "UnsupportedError") + } else { + (StatusCode::INTERNAL_SERVER_ERROR, "InternalError") + } +} + +// ============================================================================ +// Handlers +// ============================================================================ + +async fn get_config() -> Json { + Json(ConfigResponse { + defaults: ConfigDefaults { + cpus: 2, + memory_mib: 512, + disk_size_gb: 10, + }, + }) +} + +async fn create_box( + State(state): State>, + Json(req): Json, +) -> Response { + let name = req.name.clone(); + let options = build_box_options(&req); + + let litebox = match state.runtime.create(options, name).await { + Ok(b) => b, + Err(e) => { + let (status, etype) = classify_boxlite_error(&e); + return error_response(status, e.to_string(), etype); + } + }; + + let info = litebox.info(); + let box_id = info.id.to_string(); + let resp = box_info_to_response(&info); + + state.boxes.write().await.insert(box_id, Arc::new(litebox)); + + (StatusCode::CREATED, Json(resp)).into_response() +} + +async fn list_boxes(State(state): State>) -> Response { + match state.runtime.list_info().await { + Ok(infos) => { + let boxes = infos.iter().map(box_info_to_response).collect(); + Json(ListBoxesResponse { boxes }).into_response() + } + Err(e) => { + let (status, etype) = classify_boxlite_error(&e); + error_response(status, e.to_string(), etype) + } + } +} + +async fn get_box(State(state): State>, Path(box_id): Path) -> Response { + match state.runtime.get_info(&box_id).await { + Ok(Some(info)) => Json(box_info_to_response(&info)).into_response(), + Ok(None) => error_response( + StatusCode::NOT_FOUND, + format!("box not found: {box_id}"), + "NotFoundError", + ), + Err(e) => { + let (status, etype) = classify_boxlite_error(&e); + error_response(status, e.to_string(), etype) + } + } +} + +async fn start_box(State(state): State>, Path(box_id): Path) -> Response { + let litebox = match get_or_fetch_box(&state, &box_id).await { + Ok(b) => b, + Err(resp) => return resp, + }; + + if let Err(e) = litebox.start().await { + let (status, etype) = classify_boxlite_error(&e); + return error_response(status, e.to_string(), etype); + } + + let info = litebox.info(); + Json(box_info_to_response(&info)).into_response() +} + +async fn stop_box(State(state): State>, Path(box_id): Path) -> Response { + let litebox = match get_or_fetch_box(&state, &box_id).await { + Ok(b) => b, + Err(resp) => return resp, + }; + + if let Err(e) = litebox.stop().await { + let (status, etype) = classify_boxlite_error(&e); + return error_response(status, e.to_string(), etype); + } + + let info = litebox.info(); + Json(box_info_to_response(&info)).into_response() +} + +async fn remove_box(State(state): State>, Path(box_id): Path) -> Response { + // Evict from cache first + state.boxes.write().await.remove(&box_id); + + match state.runtime.remove(&box_id, true).await { + Ok(()) => StatusCode::NO_CONTENT.into_response(), + Err(e) => { + let (status, etype) = classify_boxlite_error(&e); + error_response(status, e.to_string(), etype) + } + } +} + +async fn start_execution( + State(state): State>, + Path(box_id): Path, + Json(req): Json, +) -> Response { + let litebox = match get_or_fetch_box(&state, &box_id).await { + Ok(b) => b, + Err(resp) => return resp, + }; + + let stdin_data = req.stdin.clone(); + let cmd = build_box_command(&req); + + let mut execution = match litebox.exec(cmd).await { + Ok(e) => e, + Err(e) => { + let (status, etype) = classify_boxlite_error(&e); + return error_response(status, e.to_string(), etype); + } + }; + + // Write stdin and close it before storing the execution + if let Some(ref data) = stdin_data { + if let Some(mut stdin) = execution.stdin() { + let _ = stdin.write_all(data.as_bytes()).await; + stdin.close(); + } + } + + let exec_id = execution.id().clone(); + + state.executions.write().await.insert( + exec_id.clone(), + ActiveExecution { + execution, + started_at: Instant::now(), + }, + ); + + ( + StatusCode::CREATED, + Json(ExecResponse { + execution_id: exec_id, + }), + ) + .into_response() +} + +async fn stream_execution_output( + State(state): State>, + Path((_box_id, exec_id)): Path<(String, String)>, +) -> Response { + // Take the execution out of the map (streams can only be consumed once) + let active = match state.executions.write().await.remove(&exec_id) { + Some(a) => a, + None => { + return error_response( + StatusCode::NOT_FOUND, + format!("execution not found: {exec_id}"), + "NotFoundError", + ); + } + }; + + let started_at = active.started_at; + let mut execution = active.execution; + + // Take stdout/stderr streams (can only be called once) + let stdout = execution.stdout(); + let stderr = execution.stderr(); + + let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::(); + + // Spawn producer tasks for stdout and stderr + let mut stream_count = 0u32; + if let Some(mut out) = stdout { + stream_count += 1; + let tx_out = tx.clone(); + tokio::spawn(async move { + let b64 = base64::engine::general_purpose::STANDARD; + while let Some(line) = out.next().await { + let encoded = b64.encode(line.as_bytes()); + let data = serde_json::json!({"data": encoded}).to_string(); + let event = Event::default().event("stdout").data(data); + if tx_out.send(SseItem::Event(event)).is_err() { + break; + } + } + let _ = tx_out.send(SseItem::StreamDone); + }); + } + + if let Some(mut err_stream) = stderr { + stream_count += 1; + let tx_err = tx.clone(); + tokio::spawn(async move { + let b64 = base64::engine::general_purpose::STANDARD; + while let Some(line) = err_stream.next().await { + let encoded = b64.encode(line.as_bytes()); + let data = serde_json::json!({"data": encoded}).to_string(); + let event = Event::default().event("stderr").data(data); + if tx_err.send(SseItem::Event(event)).is_err() { + break; + } + } + let _ = tx_err.send(SseItem::StreamDone); + }); + } + + // Drop original sender so rx closes when all producers finish + drop(tx); + + let stream = async_stream::stream! { + // Multiplex stdout/stderr events + let mut done = 0u32; + while done < stream_count { + match rx.recv().await { + Some(SseItem::Event(event)) => { + yield Ok::<_, std::convert::Infallible>(event); + } + Some(SseItem::StreamDone) => { + done += 1; + } + None => break, + } + } + + // Wait for exit + let result = execution.wait().await; + let elapsed_ms = started_at.elapsed().as_millis() as u64; + + let (exit_code, _error_message) = match result { + Ok(r) => (r.exit_code, r.error_message), + Err(e) => (-1, Some(e.to_string())), + }; + + let exit_data = serde_json::json!({ + "exit_code": exit_code, + "duration_ms": elapsed_ms, + }) + .to_string(); + + yield Ok(Event::default().event("exit").data(exit_data)); + }; + + Sse::new(stream) + .keep_alive(KeepAlive::default()) + .into_response() +} + +// ============================================================================ +// Box Handle Cache Helper +// ============================================================================ + +async fn get_or_fetch_box(state: &AppState, box_id: &str) -> Result, Response> { + // Check cache first + if let Some(b) = state.boxes.read().await.get(box_id) { + return Ok(Arc::clone(b)); + } + + // Fetch from runtime + match state.runtime.get(box_id).await { + Ok(Some(b)) => { + let id = b.info().id.to_string(); + let arc = Arc::new(b); + state.boxes.write().await.insert(id, Arc::clone(&arc)); + Ok(arc) + } + Ok(None) => Err(error_response( + StatusCode::NOT_FOUND, + format!("box not found: {box_id}"), + "NotFoundError", + )), + Err(e) => { + let (status, etype) = classify_boxlite_error(&e); + Err(error_response(status, e.to_string(), etype)) + } + } +} + +// ============================================================================ +// Router +// ============================================================================ + +fn build_router(state: Arc) -> Router { + Router::new() + .route("/v1/config", get(get_config)) + .route("/v1/local/boxes", post(create_box)) + .route("/v1/local/boxes", get(list_boxes)) + .route("/v1/local/boxes/{box_id}", get(get_box)) + .route("/v1/local/boxes/{box_id}", delete(remove_box)) + .route("/v1/local/boxes/{box_id}/start", post(start_box)) + .route("/v1/local/boxes/{box_id}/stop", post(stop_box)) + .route("/v1/local/boxes/{box_id}/exec", post(start_execution)) + .route( + "/v1/local/boxes/{box_id}/executions/{exec_id}/output", + get(stream_execution_output), + ) + .with_state(state) +} + +// ============================================================================ +// Entry Point +// ============================================================================ + +pub async fn execute(args: ServeArgs, global: &GlobalFlags) -> anyhow::Result<()> { + let runtime = global.create_runtime()?; + + let state = Arc::new(AppState { + runtime, + boxes: RwLock::new(HashMap::new()), + executions: RwLock::new(HashMap::new()), + }); + + let app = build_router(state.clone()); + let addr = format!("{}:{}", args.host, args.port); + let listener = tokio::net::TcpListener::bind(&addr).await?; + + tracing::info!("boxlite serve listening on {}", addr); + eprintln!("BoxLite REST API server listening on http://{addr}"); + + // Graceful shutdown on ctrl-c + let shutdown_state = state.clone(); + axum::serve(listener, app) + .with_graceful_shutdown(async move { + let _ = tokio::signal::ctrl_c().await; + tracing::info!("shutting down..."); + eprintln!("\nShutting down..."); + let _ = shutdown_state.runtime.shutdown(Some(10)).await; + }) + .await?; + + Ok(()) +} diff --git a/boxlite-cli/src/main.rs b/boxlite-cli/src/main.rs index 4e3a9be0..aa2b9870 100644 --- a/boxlite-cli/src/main.rs +++ b/boxlite-cli/src/main.rs @@ -60,6 +60,7 @@ async fn run_cli(cli: Cli) -> anyhow::Result<()> { cli::Commands::Info(args) => commands::info::execute(args, &global).await, cli::Commands::Logs(args) => commands::logs::execute(args, &global).await, cli::Commands::Stats(args) => commands::stats::execute(args, &global).await, + cli::Commands::Serve(args) => commands::serve::execute(args, &global).await, // Handled in main() before tokio; never reaches run_cli cli::Commands::Completion(_) => { unreachable!("completion subcommand is handled before tokio in main()") From f11f2000a0010a98450b0578ad0e9c0f7096a2b3 Mon Sep 17 00:00:00 2001 From: yan5xu Date: Mon, 9 Mar 2026 20:20:29 +0800 Subject: [PATCH 2/2] fix(guest): mount devtmpfs at /dev so block device nodes exist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The guest init process mounts tmpfs on /tmp, /var/tmp, and /run but never mounts devtmpfs at /dev. Without devtmpfs the kernel cannot auto-populate block device nodes, so /dev/vdb (the 10 GB container disk) is missing. Container.Init then fails to mount vdb and all container writes fall back to the 256 MB guest rootfs on vda. Mount devtmpfs at /dev early in boot — before tmpfs mounts and before the gRPC server starts — so that all virtio-blk devices are visible. Also generalise the is_tmpfs() helper into is_mounted(path, fstype) and rename mount_essential_tmpfs → mount_essential_filesystems to reflect the broader scope. Closes #358 Co-Authored-By: Claude Opus 4.6 (1M context) --- guest/src/main.rs | 12 +++++--- guest/src/mounts.rs | 69 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 65 insertions(+), 16 deletions(-) diff --git a/guest/src/main.rs b/guest/src/main.rs index e7988add..5de979b5 100644 --- a/guest/src/main.rs +++ b/guest/src/main.rs @@ -98,10 +98,14 @@ async fn main() -> BoxliteResult<()> { info!("BoxLite Guest Agent starting"); - // Mount essential tmpfs directories early - // Needed because virtio-fs doesn't support open-unlink-fstat pattern - mounts::mount_essential_tmpfs()?; - eprintln!("[guest] T+{}ms: tmpfs mounted", boot_elapsed_ms()); + // Mount essential filesystems early (devtmpfs + tmpfs). + // devtmpfs populates /dev with block device nodes (vda, vdb, …). + // tmpfs is needed because virtio-fs doesn't support open-unlink-fstat. + mounts::mount_essential_filesystems()?; + eprintln!( + "[guest] T+{}ms: essential filesystems mounted", + boot_elapsed_ms() + ); // Parse command-line arguments with clap let args = GuestArgs::parse(); diff --git a/guest/src/mounts.rs b/guest/src/mounts.rs index 3fa6ec5b..a67d80ec 100644 --- a/guest/src/mounts.rs +++ b/guest/src/mounts.rs @@ -1,7 +1,9 @@ -//! Essential tmpfs mounts for guest filesystem +//! Essential filesystem mounts for guest init. //! -//! Mounts tmpfs on directories that require local filesystem semantics -//! (e.g., open-unlink-fstat pattern) which virtio-fs doesn't support. +//! Mounts devtmpfs at /dev so the kernel auto-populates block device nodes +//! (e.g., /dev/vda, /dev/vdb), and mounts tmpfs on directories that require +//! local filesystem semantics (open-unlink-fstat pattern) which virtio-fs +//! doesn't support. use boxlite_shared::errors::{BoxliteError, BoxliteResult}; use nix::mount::{mount, MsFlags}; @@ -31,13 +33,18 @@ const TMPFS_MOUNTS: &[TmpfsMount] = &[ }, ]; -/// Mount essential tmpfs directories +/// Mount essential filesystems for guest boot. /// -/// Called early in guest startup, before gRPC server starts. -/// These mounts are needed because virtio-fs doesn't support the -/// open-unlink-fstat pattern used by apt and other tools. -pub fn mount_essential_tmpfs() -> BoxliteResult<()> { - tracing::info!("Mounting essential tmpfs directories"); +/// Called early in guest startup, before the gRPC server starts. +/// +/// 1. **devtmpfs at /dev** — the kernel auto-populates block device nodes +/// (vda, vdb, …) so that later volume-mount RPCs can find them. +/// 2. **tmpfs on /tmp, /var/tmp, /run** — needed because virtio-fs doesn't +/// support the open-unlink-fstat pattern used by apt and other tools. +pub fn mount_essential_filesystems() -> BoxliteResult<()> { + tracing::info!("Mounting essential guest filesystems"); + + mount_devtmpfs()?; for mount_cfg in TMPFS_MOUNTS { mount_tmpfs(mount_cfg)?; @@ -46,11 +53,48 @@ pub fn mount_essential_tmpfs() -> BoxliteResult<()> { Ok(()) } +/// Mount devtmpfs at /dev so the kernel populates block device nodes. +/// +/// Without this, /dev/vdb (container disk) doesn't exist even though the +/// kernel sees the disk in /proc/partitions. Container.Init then fails to +/// mount the 10 GB container disk and writes fall back to the tiny 256 MB +/// guest rootfs on /dev/vda. +fn mount_devtmpfs() -> BoxliteResult<()> { + let dev_path = Path::new("/dev"); + + // Skip if /dev is already a devtmpfs mount + if is_mounted(dev_path, "devtmpfs")? { + tracing::debug!("/dev is already devtmpfs, skipping"); + return Ok(()); + } + + if !dev_path.exists() { + fs::create_dir_all(dev_path) + .map_err(|e| BoxliteError::Internal(format!("Failed to create /dev: {}", e)))?; + } + + tracing::debug!("Mounting devtmpfs on /dev"); + mount( + Some("devtmpfs"), + dev_path, + Some("devtmpfs"), + MsFlags::MS_NOSUID, + Some("mode=0755"), + ) + .map_err(|e| { + tracing::error!("Failed to mount devtmpfs on /dev: {} (errno: {:?})", e, e); + BoxliteError::Internal(format!("Failed to mount devtmpfs on /dev: {}", e)) + })?; + + tracing::info!("Mounted devtmpfs on /dev"); + Ok(()) +} + fn mount_tmpfs(cfg: &TmpfsMount) -> BoxliteResult<()> { let path = Path::new(cfg.path); // Skip if already mounted as tmpfs - if is_tmpfs(path)? { + if is_mounted(path, "tmpfs")? { tracing::debug!("{} is already tmpfs, skipping", cfg.path); return Ok(()); } @@ -95,7 +139,8 @@ fn mount_tmpfs(cfg: &TmpfsMount) -> BoxliteResult<()> { Ok(()) } -fn is_tmpfs(path: &Path) -> BoxliteResult { +/// Check whether `path` is already mounted with the given filesystem type. +fn is_mounted(path: &Path, fstype: &str) -> BoxliteResult { let mounts = match fs::read_to_string("/proc/mounts") { Ok(content) => content, Err(_) => return Ok(false), // /proc may not be mounted yet @@ -105,7 +150,7 @@ fn is_tmpfs(path: &Path) -> BoxliteResult { for line in mounts.lines() { let parts: Vec<&str> = line.split_whitespace().collect(); - if parts.len() >= 3 && parts[1] == path_str && parts[2] == "tmpfs" { + if parts.len() >= 3 && parts[1] == path_str && parts[2] == fstype { return Ok(true); } }