From 4a46c072644d88545cc10ffaca72858fcd1d4b6e Mon Sep 17 00:00:00 2001 From: lile Date: Fri, 27 Mar 2026 18:07:36 +0800 Subject: [PATCH 1/3] feat(litebox): add pause/resume API for zero-CPU VM freezing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add pause() and resume() across all API layers (Rust core, REST, Python SDK) to freeze/resume VMs via SIGSTOP/SIGCONT with guest filesystem quiesce. Core implementation: - pause(): FIFREEZE guest I/O → SIGSTOP shim (quiesce-then-freeze) - resume(): SIGCONT shim → FITHAW guest I/O (resume-then-thaw) - Both operations are idempotent (pause on Paused = no-op, etc.) - State machine: Running ↔ Paused, Paused → Stopped Safety and correctness: - stop() sends SIGCONT before guest shutdown RPC on Paused boxes (prevents 10s gRPC timeout on SIGSTOP'd process) - exec/copy_into/copy_out reject Paused boxes with InvalidState (shim can't handle gRPC while SIGSTOP'd) - Health check skips gRPC pings during Paused state but verifies process alive via kill(pid, 0) to detect death while paused - with_quiesce_async preserves user-initiated Paused state (clone/export/snapshot don't auto-resume user-paused boxes) - Fix pre-existing deadlock: health check save_box used state.read() while holding state.write() (parking_lot RwLock is not reentrant) API surface: - EventListener: on_box_paused/on_box_resumed callbacks - AuditEventKind: BoxPaused/BoxResumed variants - BoxStatus: can_pause()/can_resume()/is_paused() methods - REST: POST /v1/default/boxes/{id}/pause and /resume - Python SDK: box.pause() and box.resume() async methods Tests: 12 new pause/resume unit tests, 2 new integration tests Co-Authored-By: Claude Opus 4.6 --- .../src/commands/serve/handlers/boxes.rs | 36 ++ boxlite-cli/src/commands/serve/mod.rs | 8 + .../event_listener/audit_event_listener.rs | 48 +++ boxlite/src/event_listener/event.rs | 6 + boxlite/src/event_listener/listener.rs | 6 + boxlite/src/litebox/box_impl.rs | 324 ++++++++++++++++-- boxlite/src/litebox/mod.rs | 21 ++ boxlite/src/litebox/state.rs | 98 ++++++ boxlite/src/rest/litebox.rs | 18 + boxlite/src/runtime/backend.rs | 4 + boxlite/tests/audit.rs | 33 ++ sdks/python/src/box_handle.rs | 20 ++ 12 files changed, 587 insertions(+), 35 deletions(-) diff --git a/boxlite-cli/src/commands/serve/handlers/boxes.rs b/boxlite-cli/src/commands/serve/handlers/boxes.rs index 057c8104..e1ecf44b 100644 --- a/boxlite-cli/src/commands/serve/handlers/boxes.rs +++ b/boxlite-cli/src/commands/serve/handlers/boxes.rs @@ -118,6 +118,42 @@ pub(in crate::commands::serve) async fn stop_box( Json(box_info_to_response(&info)).into_response() } +pub(in crate::commands::serve) async fn pause_box( + State(state): State>, + Path(box_id): Path, +) -> Response { + let litebox = match get_or_fetch_box(&state, &box_id).await { + Ok(b) => b, + Err(resp) => return resp, + }; + + if let Err(e) = litebox.pause().await { + let (status, etype) = classify_boxlite_error(&e); + return error_response(status, e.to_string(), etype); + } + + let info = litebox.info(); + Json(box_info_to_response(&info)).into_response() +} + +pub(in crate::commands::serve) async fn resume_box( + State(state): State>, + Path(box_id): Path, +) -> Response { + let litebox = match get_or_fetch_box(&state, &box_id).await { + Ok(b) => b, + Err(resp) => return resp, + }; + + if let Err(e) = litebox.resume().await { + let (status, etype) = classify_boxlite_error(&e); + return error_response(status, e.to_string(), etype); + } + + let info = litebox.info(); + Json(box_info_to_response(&info)).into_response() +} + pub(in crate::commands::serve) async fn remove_box( State(state): State>, Path(box_id): Path, diff --git a/boxlite-cli/src/commands/serve/mod.rs b/boxlite-cli/src/commands/serve/mod.rs index 35d58105..5949a408 100644 --- a/boxlite-cli/src/commands/serve/mod.rs +++ b/boxlite-cli/src/commands/serve/mod.rs @@ -241,6 +241,14 @@ fn build_router(state: Arc) -> Router { "/v1/default/boxes/{box_id}/stop", post(boxes::stop_box), ) + .route( + "/v1/default/boxes/{box_id}/pause", + post(boxes::pause_box), + ) + .route( + "/v1/default/boxes/{box_id}/resume", + post(boxes::resume_box), + ) // Box metrics .route( "/v1/default/boxes/{box_id}/metrics", diff --git a/boxlite/src/event_listener/audit_event_listener.rs b/boxlite/src/event_listener/audit_event_listener.rs index 9fadf3f0..4e9489e8 100644 --- a/boxlite/src/event_listener/audit_event_listener.rs +++ b/boxlite/src/event_listener/audit_event_listener.rs @@ -116,6 +116,14 @@ impl EventListener for AuditEventListener { )); } + fn on_box_paused(&self, box_id: &BoxID) { + self.record(AuditEvent::now(box_id.clone(), AuditEventKind::BoxPaused)); + } + + fn on_box_resumed(&self, box_id: &BoxID) { + self.record(AuditEvent::now(box_id.clone(), AuditEventKind::BoxResumed)); + } + fn on_box_removed(&self, box_id: &BoxID) { self.record(AuditEvent::now(box_id.clone(), AuditEventKind::BoxRemoved)); } @@ -229,6 +237,46 @@ mod tests { assert_eq!(listener.len(), 1000); } + #[test] + fn records_pause_resume_events() { + let listener = AuditEventListener::new(); + let id = test_box_id(); + + listener.on_box_started(&id); + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + listener.on_box_stopped(&id, None); + + let events = listener.events(); + assert_eq!(events.len(), 4); + assert!(matches!(events[0].kind, AuditEventKind::BoxStarted)); + assert!(matches!(events[1].kind, AuditEventKind::BoxPaused)); + assert!(matches!(events[2].kind, AuditEventKind::BoxResumed)); + assert!(matches!(events[3].kind, AuditEventKind::BoxStopped { .. })); + } + + #[test] + fn records_multiple_pause_resume_cycles() { + let listener = AuditEventListener::new(); + let id = test_box_id(); + + listener.on_box_started(&id); + // Cycle 1 + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + // Cycle 2 + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + listener.on_box_stopped(&id, Some(0)); + + let events = listener.events(); + assert_eq!(events.len(), 6); + // Verify all events have the same box_id + for event in &events { + assert_eq!(event.box_id, id); + } + } + #[test] fn events_since_filters() { let listener = AuditEventListener::new(); diff --git a/boxlite/src/event_listener/event.rs b/boxlite/src/event_listener/event.rs index 71e23377..9895aff3 100644 --- a/boxlite/src/event_listener/event.rs +++ b/boxlite/src/event_listener/event.rs @@ -42,6 +42,12 @@ pub enum AuditEventKind { /// Box VM stopped. BoxStopped { exit_code: Option }, + /// Box VM paused (SIGSTOP). + BoxPaused, + + /// Box VM resumed from pause (SIGCONT). + BoxResumed, + /// Box removed. BoxRemoved, diff --git a/boxlite/src/event_listener/listener.rs b/boxlite/src/event_listener/listener.rs index 9fe9bfa3..45085310 100644 --- a/boxlite/src/event_listener/listener.rs +++ b/boxlite/src/event_listener/listener.rs @@ -44,6 +44,12 @@ pub trait EventListener: Send + Sync { /// Called after a box VM stops. fn on_box_stopped(&self, _box_id: &BoxID, _exit_code: Option) {} + /// Called after a box VM is paused (SIGSTOP). + fn on_box_paused(&self, _box_id: &BoxID) {} + + /// Called after a box VM is resumed from pause (SIGCONT). + fn on_box_resumed(&self, _box_id: &BoxID) {} + /// Called after a box is removed. fn on_box_removed(&self, _box_id: &BoxID) {} diff --git a/boxlite/src/litebox/box_impl.rs b/boxlite/src/litebox/box_impl.rs index d2c9a1bd..d5959873 100644 --- a/boxlite/src/litebox/box_impl.rs +++ b/boxlite/src/litebox/box_impl.rs @@ -227,6 +227,15 @@ impl BoxImpl { )); } + // Reject exec on paused boxes — shim can't handle gRPC requests while SIGSTOP'd. + let status = self.state.read().status; + if !status.can_exec() { + return Err(BoxliteError::InvalidState(format!( + "Cannot exec on box in {} state", + status + ))); + } + let live = self.live_state().await?; // Inject container ID into environment if not already set @@ -311,6 +320,155 @@ impl BoxImpl { )) } + /// Pause the box (freeze VM via SIGSTOP). + /// + /// Performs a clean quiesce: + /// 1. Guest filesystem quiesce (FIFREEZE — best-effort) + /// 2. SIGSTOP shim process (pauses all vCPUs and virtio backends) + /// + /// Idempotent: calling pause() on an already-Paused box is a no-op. + /// The box must be Running; other states return InvalidState. + pub(crate) async fn pause(&self) -> BoxliteResult<()> { + // Check if already shutdown + if self.shutdown_token.is_cancelled() { + return Err(BoxliteError::Stopped( + "Handle invalidated after stop(). Use runtime.get() to get a new handle.".into(), + )); + } + + let status = self.state.read().status; + + // Idempotent: already paused + if status.is_paused() { + return Ok(()); + } + + // Only Running boxes can be paused + if !status.can_pause() { + return Err(BoxliteError::InvalidState(format!( + "Cannot pause box in {} state", + status + ))); + } + + let pid = { + let state = self.state.read(); + state + .pid + .map(|p| p as i32) + .ok_or_else(|| BoxliteError::Internal("Box is running but has no PID".into()))? + }; + + // Phase 1: Freeze guest I/O (best-effort, 5s timeout) + let frozen = self.guest_quiesce().await; + + // Phase 2: SIGSTOP — pause vCPUs + // SAFETY: sending SIGSTOP to a known valid PID that we own (shim process). + let ret = unsafe { libc::kill(pid, libc::SIGSTOP) }; + if ret != 0 { + if frozen { + self.guest_thaw().await; + } + return Err(BoxliteError::Internal(format!( + "Failed to SIGSTOP shim process (pid={}): {}", + pid, + std::io::Error::last_os_error() + ))); + } + + // Update state + { + let mut state = self.state.write(); + state.force_status(BoxStatus::Paused); + let _ = self.runtime.box_manager.save_box(self.id(), &state); + } + + for listener in &self.event_listeners { + listener.on_box_paused(&self.config.id); + } + + tracing::info!(box_id = %self.config.id, frozen, "Box paused"); + Ok(()) + } + + /// Resume the box from paused state (SIGCONT + thaw). + /// + /// Performs: + /// 1. SIGCONT shim process (resumes vCPUs) + /// 2. Guest filesystem thaw (FITHAW — best-effort) + /// + /// Idempotent: calling resume() on a Running box is a no-op. + /// The box must be Paused; other states return InvalidState. + pub(crate) async fn resume(&self) -> BoxliteResult<()> { + // Check if already shutdown + if self.shutdown_token.is_cancelled() { + return Err(BoxliteError::Stopped( + "Handle invalidated after stop(). Use runtime.get() to get a new handle.".into(), + )); + } + + let status = self.state.read().status; + + // Idempotent: already running + if status.is_running() { + return Ok(()); + } + + // Only Paused boxes can be resumed + if !status.can_resume() { + return Err(BoxliteError::InvalidState(format!( + "Cannot resume box in {} state", + status + ))); + } + + let pid = { + let state = self.state.read(); + state + .pid + .map(|p| p as i32) + .ok_or_else(|| BoxliteError::Internal("Box is paused but has no PID".into()))? + }; + + // Phase 1: SIGCONT — resume vCPUs + // SAFETY: sending SIGCONT to a known valid PID that we own (shim process). + let ret = unsafe { libc::kill(pid, libc::SIGCONT) }; + if ret != 0 { + return Err(BoxliteError::Internal(format!( + "Failed to SIGCONT shim process (pid={}): {}", + pid, + std::io::Error::last_os_error() + ))); + } + + // Verify process is alive before transitioning state + if unsafe { libc::kill(pid, 0) } != 0 { + let mut state = self.state.write(); + state.mark_stop(); + let _ = self.runtime.box_manager.save_box(self.id(), &state); + return Err(BoxliteError::Internal( + "Shim process died while paused".into(), + )); + } + + // Update state + { + let mut state = self.state.write(); + state.force_status(BoxStatus::Running); + let _ = self.runtime.box_manager.save_box(self.id(), &state); + } + + // Phase 2: Thaw guest I/O (best-effort) + self.guest_thaw().await; + + for listener in &self.event_listeners { + listener.on_box_resumed(&self.config.id); + } + + tracing::info!(box_id = %self.config.id, "Box resumed"); + Ok(()) + } + pub(crate) async fn stop(&self) -> BoxliteResult<()> { let t0 = Instant::now(); @@ -340,6 +498,33 @@ impl BoxImpl { // Cancel the token - signals all in-flight operations to abort self.shutdown_token.cancel(); + // If the box is paused (SIGSTOP'd), we must SIGCONT before attempting + // guest shutdown RPC — a stopped process can't handle gRPC requests. + // Without this, we'd hit the 10s timeout then SIGKILL. + { + let state = self.state.read(); + if state.status == BoxStatus::Paused + && let Some(pid) = state.pid + { + // SAFETY: sending SIGCONT to our own shim process PID. + let ret = unsafe { libc::kill(pid as i32, libc::SIGCONT) }; + if ret != 0 { + tracing::debug!( + box_id = %self.config.id, + pid, + error = %std::io::Error::last_os_error(), + "SIGCONT failed (process may have exited while paused)" + ); + } else { + tracing::debug!( + box_id = %self.config.id, + pid, + "Sent SIGCONT to paused shim before guest shutdown" + ); + } + } + } + // Only try to stop VM if LiveState exists if let Some(live) = self.live.get() { // Gracefully shut down guest (with timeout to avoid hanging on unresponsive guests) @@ -468,6 +653,13 @@ impl BoxImpl { )); } + // Reject when paused — guest can't handle gRPC file upload while SIGSTOP'd. + if self.state.read().status.is_paused() { + return Err(BoxliteError::InvalidState( + "Cannot copy into box while paused".into(), + )); + } + // Ensure box is running let live = self.live_state().await?; @@ -543,6 +735,13 @@ impl BoxImpl { )); } + // Reject when paused — guest can't handle gRPC file download while SIGSTOP'd. + if self.state.read().status.is_paused() { + return Err(BoxliteError::InvalidState( + "Cannot copy from box while paused".into(), + )); + } + // Ensure box is running let live = self.live_state().await?; @@ -754,6 +953,38 @@ impl BoxImpl { } } + // Skip gRPC ping if box is paused — shim can't respond while SIGSTOP'd. + // But verify the process is still alive to detect death during pause. + if state.read().status.is_paused() { + let pid = state.read().pid; + if let Some(pid) = pid + && !crate::util::is_process_alive(pid) + { + tracing::error!( + box_id = %box_id, + pid, + "Shim process died while paused, marking box as Stopped" + ); + let mut state_guard = state.write(); + state_guard.force_status(crate::litebox::BoxStatus::Stopped); + state_guard.set_pid(None); + state_guard.health_status.state = crate::litebox::HealthState::Unhealthy; + if let Err(db_err) = runtime.box_manager.save_box(&box_id, &state_guard) { + tracing::error!( + box_id = %box_id, + error = %db_err, + "Failed to persist dead-while-paused state" + ); + } + break; + } + tracing::debug!( + box_id = %box_id, + "Box is paused, skipping gRPC health check" + ); + continue; + } + let elapsed = start_time.elapsed(); let result = if elapsed < start_period { tracing::debug!( @@ -860,7 +1091,7 @@ impl BoxImpl { let mut state_guard = state.write(); let became_unhealthy = state_guard.mark_health_check_failure(retries); - if let Err(db_err) = runtime.box_manager.save_box(&box_id, &state.read()) { + if let Err(db_err) = runtime.box_manager.save_box(&box_id, &state_guard) { tracing::error!( box_id = %box_id, error = %db_err, @@ -908,52 +1139,61 @@ impl BoxImpl { where Fut: std::future::Future>, { - let (pid, was_running) = { + let (pid, was_running, was_paused) = { let state = self.state.read(); let running = state.status.is_running(); - let pid = if running { + let paused = state.status.is_paused(); + let pid = if running || paused { state.pid.map(|p| p as i32) } else { None }; - (pid, running) + (pid, running, paused) }; let Some(pid) = pid else { - if was_running { + if was_running || was_paused { return Err(BoxliteError::Internal( - "Box is running but has no PID".to_string(), + "Box is active but has no PID".to_string(), )); } - // Not running — execute directly, no quiesce needed. + // Not active — execute directly, no quiesce needed. return fut.await; }; let t0 = Instant::now(); // Phase 1: Freeze guest I/O (best-effort, 5s timeout) + // Skip if already paused — guest I/O is already frozen from pause(). let t_quiesce = Instant::now(); - let frozen = self.guest_quiesce().await; + let frozen = if was_paused { + false // Already quiesced by user's pause() + } else { + self.guest_quiesce().await + }; let quiesce_ms = t_quiesce.elapsed().as_millis() as u64; // Phase 2: SIGSTOP — pause vCPUs - // SAFETY: sending SIGSTOP to a known valid PID that we own (shim process). - let ret = unsafe { libc::kill(pid, libc::SIGSTOP) }; - if ret != 0 { - // If SIGSTOP fails, thaw before returning error - if frozen { - self.guest_thaw().await; + // Skip if already paused by user — process is already stopped. + if !was_paused { + // SAFETY: sending SIGSTOP to a known valid PID that we own (shim process). + let ret = unsafe { libc::kill(pid, libc::SIGSTOP) }; + if ret != 0 { + // If SIGSTOP fails, thaw before returning error + if frozen { + self.guest_thaw().await; + } + return Err(BoxliteError::Internal(format!( + "Failed to SIGSTOP shim process (pid={}): {}", + pid, + std::io::Error::last_os_error() + ))); + } + { + let mut state = self.state.write(); + state.force_status(BoxStatus::Paused); + let _ = self.runtime.box_manager.save_box(self.id(), &state); } - return Err(BoxliteError::Internal(format!( - "Failed to SIGSTOP shim process (pid={}): {}", - pid, - std::io::Error::last_os_error() - ))); - } - { - let mut state = self.state.write(); - state.force_status(BoxStatus::Paused); - let _ = self.runtime.box_manager.save_box(self.id(), &state); } // Phase 3: Caller's operation @@ -962,20 +1202,25 @@ impl BoxImpl { let operation_ms = t_op.elapsed().as_millis() as u64; // Phase 4: SIGCONT — resume vCPUs (always, even if f() failed) - // SAFETY: Always send SIGCONT — harmless ESRCH if process already dead. - unsafe { - libc::kill(pid, libc::SIGCONT); - } - // Only transition to Running if process is still alive after resume. - if unsafe { libc::kill(pid, 0) } == 0 { - let mut state = self.state.write(); - state.force_status(BoxStatus::Running); - let _ = self.runtime.box_manager.save_box(self.id(), &state); + // If user had paused the box, leave in Paused state — user must call resume(). + if !was_paused { + // Bracket-initiated pause: resume as before. + // SAFETY: Always send SIGCONT — harmless ESRCH if process already dead. + unsafe { + libc::kill(pid, libc::SIGCONT); + } + // Only transition to Running if process is still alive after resume. + if unsafe { libc::kill(pid, 0) } == 0 { + let mut state = self.state.write(); + state.force_status(BoxStatus::Running); + let _ = self.runtime.box_manager.save_box(self.id(), &state); + } } - // Phase 5: Thaw guest I/O (always, best-effort) + // Phase 5: Thaw guest I/O (best-effort) + // Skip if user had paused — thaw will happen when user calls resume(). let t_thaw = Instant::now(); - if frozen { + if frozen && !was_paused { self.guest_thaw().await; } let thaw_ms = t_thaw.elapsed().as_millis() as u64; @@ -987,6 +1232,7 @@ impl BoxImpl { operation_ms, thaw_ms, frozen, + was_paused, "Quiesce bracket completed" ); @@ -1088,6 +1334,14 @@ impl crate::runtime::backend::BoxBackend for BoxImpl { self.stop().await } + async fn pause(&self) -> BoxliteResult<()> { + self.pause().await + } + + async fn resume(&self) -> BoxliteResult<()> { + self.resume().await + } + async fn copy_into( &self, host_src: &std::path::Path, diff --git a/boxlite/src/litebox/mod.rs b/boxlite/src/litebox/mod.rs index 414590e5..fac0261e 100644 --- a/boxlite/src/litebox/mod.rs +++ b/boxlite/src/litebox/mod.rs @@ -106,6 +106,27 @@ impl LiteBox { self.box_backend.stop().await } + /// Pause the box (freeze VM via SIGSTOP). + /// + /// Quiesces guest filesystems, then sends SIGSTOP to freeze all vCPUs. + /// The box keeps its memory and state but consumes zero CPU. + /// + /// This is idempotent - calling pause() on a Paused box is a no-op. + /// Use resume() to continue execution. + pub async fn pause(&self) -> BoxliteResult<()> { + self.box_backend.pause().await + } + + /// Resume a paused box (SIGCONT + thaw). + /// + /// Sends SIGCONT to resume vCPUs and thaws guest filesystems. + /// The box continues from exactly where it was paused. + /// + /// This is idempotent - calling resume() on a Running box is a no-op. + pub async fn resume(&self) -> BoxliteResult<()> { + self.box_backend.resume().await + } + /// Copy files/directories from host into the container rootfs. pub async fn copy_into( &self, diff --git a/boxlite/src/litebox/state.rs b/boxlite/src/litebox/state.rs index 4e90c546..c0fe7d91 100644 --- a/boxlite/src/litebox/state.rs +++ b/boxlite/src/litebox/state.rs @@ -95,6 +95,18 @@ impl BoxStatus { ) } + /// Check if pause() can be called from this state. + /// Only Running boxes can be paused. + pub fn can_pause(&self) -> bool { + matches!(self, BoxStatus::Running) + } + + /// Check if resume() can be called from this state. + /// Only Paused boxes can be resumed. + pub fn can_resume(&self) -> bool { + matches!(self, BoxStatus::Paused) + } + /// Check if exec() can be called from this state. /// Configured and Stopped will trigger implicit start(). pub fn can_exec(&self) -> bool { @@ -863,4 +875,90 @@ mod tests { assert_eq!(state.health_status.failures, 0); assert!(state.health_status.last_check.is_none()); } + + // ======================================================================== + // Pause/Resume State Tests + // ======================================================================== + + #[test] + fn test_status_can_pause() { + assert!(!BoxStatus::Configured.can_pause()); + assert!(BoxStatus::Running.can_pause()); + assert!(!BoxStatus::Stopping.can_pause()); + assert!(!BoxStatus::Stopped.can_pause()); + assert!(!BoxStatus::Paused.can_pause()); + assert!(!BoxStatus::Unknown.can_pause()); + } + + #[test] + fn test_status_can_resume() { + assert!(!BoxStatus::Configured.can_resume()); + assert!(!BoxStatus::Running.can_resume()); + assert!(!BoxStatus::Stopping.can_resume()); + assert!(!BoxStatus::Stopped.can_resume()); + assert!(BoxStatus::Paused.can_resume()); + assert!(!BoxStatus::Unknown.can_resume()); + } + + #[test] + fn test_pause_resume_cycle() { + let mut state = BoxState::new(); + + // Configured → Running + assert!(state.transition_to(BoxStatus::Running).is_ok()); + assert_eq!(state.status, BoxStatus::Running); + + // Running → Paused + assert!(state.transition_to(BoxStatus::Paused).is_ok()); + assert_eq!(state.status, BoxStatus::Paused); + + // Paused → Running (resume) + assert!(state.transition_to(BoxStatus::Running).is_ok()); + assert_eq!(state.status, BoxStatus::Running); + + // Running → Paused → Running (second cycle) + assert!(state.transition_to(BoxStatus::Paused).is_ok()); + assert!(state.transition_to(BoxStatus::Running).is_ok()); + assert_eq!(state.status, BoxStatus::Running); + } + + #[test] + fn test_paused_to_stopped() { + let mut state = BoxState::new(); + state.force_status(BoxStatus::Paused); + + // Paused → Stopped (stop while paused) + assert!(state.transition_to(BoxStatus::Stopped).is_ok()); + assert_eq!(state.status, BoxStatus::Stopped); + } + + #[test] + fn test_stopped_cannot_pause() { + // Stopped boxes cannot be paused — must start first + assert!(!BoxStatus::Stopped.can_transition_to(BoxStatus::Paused)); + } + + #[test] + fn test_configured_cannot_pause() { + // Configured boxes cannot be paused — must start first + assert!(!BoxStatus::Configured.can_transition_to(BoxStatus::Paused)); + } + + #[test] + fn test_paused_cannot_exec() { + // Exec is blocked while paused + assert!(!BoxStatus::Paused.can_exec()); + } + + #[test] + fn test_paused_can_stop() { + // Stop is allowed from Paused state + assert!(BoxStatus::Paused.can_stop()); + } + + #[test] + fn test_paused_cannot_start() { + // Start is not allowed from Paused state (use resume instead) + assert!(!BoxStatus::Paused.can_start()); + } } diff --git a/boxlite/src/rest/litebox.rs b/boxlite/src/rest/litebox.rs index 9d4681f1..58d3ee27 100644 --- a/boxlite/src/rest/litebox.rs +++ b/boxlite/src/rest/litebox.rs @@ -152,6 +152,24 @@ impl BoxBackend for RestBox { Ok(()) } + async fn pause(&self) -> BoxliteResult<()> { + let box_id = self.box_id_str(); + let path = format!("/boxes/{}/pause", box_id); + let resp: BoxResponse = self.client.post_empty(&path).await?; + let mut info = self.cached_info.write(); + *info = resp.to_box_info(); + Ok(()) + } + + async fn resume(&self) -> BoxliteResult<()> { + let box_id = self.box_id_str(); + let path = format!("/boxes/{}/resume", box_id); + let resp: BoxResponse = self.client.post_empty(&path).await?; + let mut info = self.cached_info.write(); + *info = resp.to_box_info(); + Ok(()) + } + async fn copy_into( &self, host_src: &Path, diff --git a/boxlite/src/runtime/backend.rs b/boxlite/src/runtime/backend.rs index 8c210f3f..9974eea5 100644 --- a/boxlite/src/runtime/backend.rs +++ b/boxlite/src/runtime/backend.rs @@ -82,6 +82,10 @@ pub(crate) trait BoxBackend: Send + Sync { async fn stop(&self) -> BoxliteResult<()>; + async fn pause(&self) -> BoxliteResult<()>; + + async fn resume(&self) -> BoxliteResult<()>; + async fn copy_into( &self, host_src: &Path, diff --git a/boxlite/tests/audit.rs b/boxlite/tests/audit.rs index 4ee95f4a..39401216 100644 --- a/boxlite/tests/audit.rs +++ b/boxlite/tests/audit.rs @@ -49,3 +49,36 @@ fn multiple_listeners_all_receive_events() { assert_eq!(l1.events().len(), 1); assert_eq!(l2.events().len(), 1); } + +#[test] +fn audit_event_listener_records_pause_resume() { + let listener = AuditEventListener::new(); + let id = BoxIDMint::mint(); + + listener.on_box_created(&id); + listener.on_box_started(&id); + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + listener.on_box_stopped(&id, Some(0)); + + let events = listener.events(); + assert_eq!(events.len(), 5); + assert!(matches!(events[0].kind, AuditEventKind::BoxCreated)); + assert!(matches!(events[1].kind, AuditEventKind::BoxStarted)); + assert!(matches!(events[2].kind, AuditEventKind::BoxPaused)); + assert!(matches!(events[3].kind, AuditEventKind::BoxResumed)); + assert!(matches!( + events[4].kind, + AuditEventKind::BoxStopped { exit_code: Some(0) } + )); +} + +#[test] +fn pause_resume_via_trait_object() { + let listener: Arc = Arc::new(AuditEventListener::new()); + let id = BoxIDMint::mint(); + + // Verify pause/resume work through dyn trait object + listener.on_box_paused(&id); + listener.on_box_resumed(&id); +} diff --git a/sdks/python/src/box_handle.rs b/sdks/python/src/box_handle.rs index a2c2ad20..32edca5d 100644 --- a/sdks/python/src/box_handle.rs +++ b/sdks/python/src/box_handle.rs @@ -106,6 +106,26 @@ impl PyBox { }) } + /// Pause the box (freeze VM, zero CPU, state preserved). + fn pause<'a>(&self, py: Python<'a>) -> PyResult> { + let handle = Arc::clone(&self.handle); + + pyo3_async_runtimes::tokio::future_into_py(py, async move { + handle.pause().await.map_err(map_err)?; + Ok(()) + }) + } + + /// Resume the box from paused state. + fn resume<'a>(&self, py: Python<'a>) -> PyResult> { + let handle = Arc::clone(&self.handle); + + pyo3_async_runtimes::tokio::future_into_py(py, async move { + handle.resume().await.map_err(map_err)?; + Ok(()) + }) + } + fn metrics<'a>(&self, py: Python<'a>) -> PyResult> { let handle = Arc::clone(&self.handle); From 98cdd3ae6100b251d0c70defc3a6cbcc4ea56d71 Mon Sep 17 00:00:00 2001 From: lile Date: Sat, 28 Mar 2026 15:37:26 +0800 Subject: [PATCH 2/3] feat(litebox): harden pause/resume with tests, Node.js SDK, and review fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address code review findings for the pause/resume API: - Fix TOCTOU: re-check shutdown_token after guest_quiesce() before SIGSTOP - Replace force_status() with transition_to() + fallback for safety - Log save_box failures with tracing::warn instead of silent discard - Combine double state.read() in health check into single lock acquisition - Add Paused → Stopping transition to state machine for completeness Add missing test coverage: - copy_into/copy_out rejected while paused (P1 gap) - resume on stopped box returns error (P1 gap) - Event listener multi-listener and box_id correctness tests - State machine: Paused→Stopping transition, Paused cannot remove Add Node.js SDK bindings: - pause()/resume() in napi-rs (box_handle.rs) - pause()/resume() in SimpleBox TypeScript wrapper with JSDoc Add integration tests and Python example: - 10 integration tests covering all pause/resume scenarios - Python example with 4 demo functions Co-Authored-By: Claude Opus 4.6 --- boxlite/src/litebox/box_impl.rs | 37 +- boxlite/src/litebox/state.rs | 19 +- boxlite/tests/audit.rs | 33 ++ boxlite/tests/pause_resume.rs | 350 ++++++++++++++++++ .../python/03_lifecycle/pause_and_resume.py | 192 ++++++++++ sdks/node/lib/simplebox.ts | 49 +++ sdks/node/src/box_handle.rs | 16 + 7 files changed, 688 insertions(+), 8 deletions(-) create mode 100644 boxlite/tests/pause_resume.rs create mode 100644 examples/python/03_lifecycle/pause_and_resume.py diff --git a/boxlite/src/litebox/box_impl.rs b/boxlite/src/litebox/box_impl.rs index d5959873..002b5582 100644 --- a/boxlite/src/litebox/box_impl.rs +++ b/boxlite/src/litebox/box_impl.rs @@ -362,6 +362,16 @@ impl BoxImpl { // Phase 1: Freeze guest I/O (best-effort, 5s timeout) let frozen = self.guest_quiesce().await; + // Re-check shutdown token after async quiesce — stop() may have raced. + if self.shutdown_token.is_cancelled() { + if frozen { + self.guest_thaw().await; + } + return Err(BoxliteError::Stopped( + "Handle invalidated after stop(). Use runtime.get() to get a new handle.".into(), + )); + } + // Phase 2: SIGSTOP — pause vCPUs // SAFETY: sending SIGSTOP to a known valid PID that we own (shim process). let ret = unsafe { libc::kill(pid, libc::SIGSTOP) }; @@ -379,8 +389,13 @@ impl BoxImpl { // Update state { let mut state = self.state.write(); - state.force_status(BoxStatus::Paused); - let _ = self.runtime.box_manager.save_box(self.id(), &state); + if let Err(e) = state.transition_to(BoxStatus::Paused) { + tracing::warn!(box_id = %self.config.id, error = %e, "State transition to Paused failed (race?)"); + state.force_status(BoxStatus::Paused); + } + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Paused state"); + } } for listener in &self.event_listeners { @@ -454,8 +469,13 @@ impl BoxImpl { // Update state { let mut state = self.state.write(); - state.force_status(BoxStatus::Running); - let _ = self.runtime.box_manager.save_box(self.id(), &state); + if let Err(e) = state.transition_to(BoxStatus::Running) { + tracing::warn!(box_id = %self.config.id, error = %e, "State transition to Running failed (race?)"); + state.force_status(BoxStatus::Running); + } + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Running state"); + } } // Phase 2: Thaw guest I/O (best-effort) @@ -955,9 +975,12 @@ impl BoxImpl { // Skip gRPC ping if box is paused — shim can't respond while SIGSTOP'd. // But verify the process is still alive to detect death during pause. - if state.read().status.is_paused() { - let pid = state.read().pid; - if let Some(pid) = pid + let (is_paused, paused_pid) = { + let s = state.read(); + (s.status.is_paused(), s.pid) + }; + if is_paused { + if let Some(pid) = paused_pid && !crate::util::is_process_alive(pid) { tracing::error!( diff --git a/boxlite/src/litebox/state.rs b/boxlite/src/litebox/state.rs index c0fe7d91..50ccb257 100644 --- a/boxlite/src/litebox/state.rs +++ b/boxlite/src/litebox/state.rs @@ -138,8 +138,9 @@ impl BoxStatus { // Stopped → Running (restart) (Stopped, Running) | (Stopped, Unknown) | - // Paused → Running (SIGCONT resume) or Stopped (killed while paused) + // Paused → Running (SIGCONT resume), Stopping (graceful stop), or Stopped (killed while paused) (Paused, Running) | + (Paused, Stopping) | (Paused, Stopped) | (Paused, Unknown) ) @@ -961,4 +962,20 @@ mod tests { // Start is not allowed from Paused state (use resume instead) assert!(!BoxStatus::Paused.can_start()); } + + #[test] + fn test_paused_to_stopping() { + // Paused → Stopping is valid (graceful stop from paused state) + assert!(BoxStatus::Paused.can_transition_to(BoxStatus::Stopping)); + let mut state = BoxState::new(); + state.force_status(BoxStatus::Paused); + assert!(state.transition_to(BoxStatus::Stopping).is_ok()); + assert_eq!(state.status, BoxStatus::Stopping); + } + + #[test] + fn test_paused_cannot_remove() { + // Paused boxes cannot be removed (must stop first) + assert!(!BoxStatus::Paused.can_remove()); + } } diff --git a/boxlite/tests/audit.rs b/boxlite/tests/audit.rs index 39401216..1bbe4eb6 100644 --- a/boxlite/tests/audit.rs +++ b/boxlite/tests/audit.rs @@ -82,3 +82,36 @@ fn pause_resume_via_trait_object() { listener.on_box_paused(&id); listener.on_box_resumed(&id); } + +#[test] +fn multiple_listeners_all_receive_pause_resume() { + let l1 = Arc::new(AuditEventListener::new()); + let l2 = Arc::new(AuditEventListener::new()); + let listeners: Vec> = vec![l1.clone(), l2.clone()]; + + let id = BoxIDMint::mint(); + for listener in &listeners { + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + } + + assert_eq!(l1.events().len(), 2); + assert_eq!(l2.events().len(), 2); + assert!(matches!(l1.events()[0].kind, AuditEventKind::BoxPaused)); + assert!(matches!(l1.events()[1].kind, AuditEventKind::BoxResumed)); +} + +#[test] +fn pause_resume_events_have_correct_box_id() { + let listener = AuditEventListener::new(); + let id1 = BoxIDMint::mint(); + let id2 = BoxIDMint::mint(); + + listener.on_box_paused(&id1); + listener.on_box_resumed(&id2); + + let events = listener.events(); + assert_eq!(events.len(), 2); + assert_eq!(events[0].box_id, id1); + assert_eq!(events[1].box_id, id2); +} diff --git a/boxlite/tests/pause_resume.rs b/boxlite/tests/pause_resume.rs new file mode 100644 index 00000000..90fe8659 --- /dev/null +++ b/boxlite/tests/pause_resume.rs @@ -0,0 +1,350 @@ +//! Integration tests for the pause/resume API. +//! +//! Tests the high-level `LiteBox::pause()` and `LiteBox::resume()` methods +//! with a real VM (alpine:latest). Validates state transitions, idempotency, +//! exec rejection while paused, and stop-from-paused. +//! +//! Requires a real VM runtime. Run with: +//! +//! ```sh +//! cargo test -p boxlite --test pause_resume +//! ``` + +mod common; + +use boxlite::runtime::options::BoxliteOptions; +use boxlite::runtime::types::BoxStatus; +use boxlite::{BoxCommand, BoxliteRuntime}; + +/// Helper: create a runtime with a per-test home directory. +fn test_runtime() -> (boxlite_test_utils::home::PerTestBoxHome, BoxliteRuntime) { + let home = boxlite_test_utils::home::PerTestBoxHome::new(); + let runtime = BoxliteRuntime::new(BoxliteOptions { + home_dir: home.path.clone(), + image_registries: common::test_registries(), + }) + .expect("create runtime"); + (home, runtime) +} + +#[tokio::test] +async fn pause_freezes_vm_and_resume_restores_it() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-test".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Verify box is responsive + let cmd = BoxCommand::new("echo").args(["before-pause"]); + let mut exec = litebox.exec(cmd).await.expect("exec before pause"); + let result = exec.wait().await.expect("wait before pause"); + assert_eq!(result.exit_code, 0); + + // Pause + litebox.pause().await.expect("pause box"); + assert_eq!(litebox.info().status, BoxStatus::Paused); + + // Resume + litebox.resume().await.expect("resume box"); + assert_eq!(litebox.info().status, BoxStatus::Running); + + // Verify box is still responsive after resume + let cmd = BoxCommand::new("echo").args(["after-resume"]); + let mut exec = litebox.exec(cmd).await.expect("exec after resume"); + let result = exec.wait().await.expect("wait after resume"); + assert_eq!(result.exit_code, 0); + + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn exec_rejected_while_paused() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-exec-test".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + litebox.pause().await.expect("pause box"); + + // Exec should fail with InvalidState + let cmd = BoxCommand::new("echo").args(["should-fail"]); + let err = match litebox.exec(cmd).await { + Err(e) => e, + Ok(_) => panic!("exec should fail while paused"), + }; + let msg = err.to_string(); + assert!( + msg.contains("Paused") || msg.contains("paused") || msg.contains("InvalidState"), + "Expected InvalidState/Paused error, got: {msg}" + ); + + // Resume and verify exec works again + litebox.resume().await.expect("resume box"); + let cmd = BoxCommand::new("echo").args(["works-again"]); + let mut exec = litebox.exec(cmd).await.expect("exec after resume"); + let result = exec.wait().await.expect("wait after resume"); + assert_eq!(result.exit_code, 0); + + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn pause_is_idempotent() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-idempotent".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Pause twice — second call should be a no-op + litebox.pause().await.expect("first pause"); + assert_eq!(litebox.info().status, BoxStatus::Paused); + litebox.pause().await.expect("second pause (idempotent)"); + assert_eq!(litebox.info().status, BoxStatus::Paused); + + litebox.resume().await.expect("resume"); + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn resume_is_idempotent() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("resume-idempotent".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Resume on a Running box should be a no-op + litebox + .resume() + .await + .expect("resume on running (idempotent)"); + assert_eq!(litebox.info().status, BoxStatus::Running); + + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn stop_from_paused_state() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-stop".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + litebox.pause().await.expect("pause box"); + assert_eq!(litebox.info().status, BoxStatus::Paused); + + // Stop directly from Paused should work + litebox.stop().await.expect("stop from paused"); + + let info = runtime + .get_info(litebox.id().as_str()) + .await + .expect("get info") + .expect("box should exist"); + assert_eq!(info.status, BoxStatus::Stopped); + + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn multiple_pause_resume_cycles() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-cycles".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + for i in 0..3 { + litebox + .pause() + .await + .unwrap_or_else(|e| panic!("pause cycle {i}: {e}")); + assert_eq!(litebox.info().status, BoxStatus::Paused); + + litebox + .resume() + .await + .unwrap_or_else(|e| panic!("resume cycle {i}: {e}")); + assert_eq!(litebox.info().status, BoxStatus::Running); + + // Verify VM is responsive after each cycle + let cmd = BoxCommand::new("echo").args([format!("cycle-{i}")]); + let mut exec = litebox + .exec(cmd) + .await + .unwrap_or_else(|e| panic!("exec cycle {i}: {e}")); + let result = exec + .wait() + .await + .unwrap_or_else(|e| panic!("wait cycle {i}: {e}")); + assert_eq!(result.exit_code, 0, "command failed in cycle {i}"); + } + + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn resume_on_stopped_box_returns_error() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("resume-stopped".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + litebox.stop().await.expect("stop box"); + + // Resume on a Stopped box should fail + let err = match litebox.resume().await { + Err(e) => e, + Ok(()) => panic!("resume should fail on stopped box"), + }; + let msg = err.to_string(); + assert!( + msg.contains("stop") || msg.contains("Stop") || msg.contains("invalidated"), + "Expected stopped/invalidated error, got: {msg}" + ); + + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn copy_into_rejected_while_paused() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-copy-in".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Create a temp file to copy + let tmp = std::env::temp_dir().join("boxlite-test-copy-pause"); + std::fs::write(&tmp, b"test").expect("write temp file"); + + litebox.pause().await.expect("pause box"); + + // copy_into should fail while paused + let err = match litebox + .copy_into(&tmp, "/tmp/test", Default::default()) + .await + { + Err(e) => e, + Ok(()) => panic!("copy_into should fail while paused"), + }; + let msg = err.to_string(); + assert!( + msg.contains("paused") || msg.contains("Paused"), + "Expected paused error, got: {msg}" + ); + + // Resume and verify copy works + litebox.resume().await.expect("resume box"); + litebox + .copy_into(&tmp, "/tmp/test", Default::default()) + .await + .expect("copy_into after resume"); + + let _ = std::fs::remove_file(&tmp); + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn copy_out_rejected_while_paused() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-copy-out".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Create a file inside the box to copy out + let cmd = BoxCommand::new("sh").args(["-c", "echo test > /tmp/testfile"]); + let mut exec = litebox.exec(cmd).await.expect("create file"); + exec.wait().await.expect("wait create file"); + + litebox.pause().await.expect("pause box"); + + let host_dst = std::env::temp_dir().join("boxlite-test-copy-out-pause"); + + // copy_out should fail while paused + let err = match litebox + .copy_out("/tmp/testfile", &host_dst, Default::default()) + .await + { + Err(e) => e, + Ok(()) => panic!("copy_out should fail while paused"), + }; + let msg = err.to_string(); + assert!( + msg.contains("paused") || msg.contains("Paused"), + "Expected paused error, got: {msg}" + ); + + // Resume and verify copy works + litebox.resume().await.expect("resume box"); + litebox + .copy_out("/tmp/testfile", &host_dst, Default::default()) + .await + .expect("copy_out after resume"); + + let _ = std::fs::remove_file(&host_dst); + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn pause_on_stopped_box_returns_error() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-stopped".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + litebox.stop().await.expect("stop box"); + + // Pause on a Stopped box should fail + let err = match litebox.pause().await { + Err(e) => e, + Ok(()) => panic!("pause should fail on stopped box"), + }; + let msg = err.to_string(); + assert!( + msg.contains("stop") || msg.contains("Stop") || msg.contains("invalidated"), + "Expected stopped/invalidated error, got: {msg}" + ); + + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} diff --git a/examples/python/03_lifecycle/pause_and_resume.py b/examples/python/03_lifecycle/pause_and_resume.py new file mode 100644 index 00000000..a844503e --- /dev/null +++ b/examples/python/03_lifecycle/pause_and_resume.py @@ -0,0 +1,192 @@ +#!/usr/bin/env python3 +""" +Pause and Resume Example - Zero-CPU VM Freezing + +Demonstrates the pause/resume API: +- pause(): Freezes VM (SIGSTOP) — zero CPU, memory preserved +- resume(): Thaws VM (SIGCONT) — continues from exact point +- Idempotent: pause on paused = no-op, resume on running = no-op +- Exec rejected while paused (InvalidState) +- Stop works directly from paused state +""" + +import asyncio + +import boxlite + + +async def basic_pause_resume(): + """Pause a box, then resume and verify it still works.""" + print("\n=== Basic Pause/Resume ===") + + runtime = boxlite.Boxlite.default() + + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + print(f"Created box: {box.id}") + + # Run a command to verify box is working + execution = await box.exec("echo", ["Box is running"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() + + info = box.info() + print(f"State: {info.state}") + + # Pause — VM frozen, zero CPU usage + print("\nPausing box...") + await box.pause() + info = box.info() + print(f"State after pause: {info.state}") + + # Resume — VM continues from exact point + print("\nResuming box...") + await box.resume() + info = box.info() + print(f"State after resume: {info.state}") + + # Verify box still works + execution = await box.exec("echo", ["Still alive after pause/resume!"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() + + await box.stop() + await runtime.remove(box.id, force=False) + print("\nBox stopped and removed") + + +async def exec_blocked_while_paused(): + """Show that exec is rejected while the box is paused.""" + print("\n\n=== Exec Blocked While Paused ===") + + runtime = boxlite.Boxlite.default() + + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + print(f"Created box: {box.id}") + + execution = await box.exec("echo", ["ready"]) + await execution.wait() + + await box.pause() + print("Box paused") + + # Attempt exec while paused + print("Attempting exec while paused...") + try: + await box.exec("echo", ["should fail"]) + print(" Unexpected: exec succeeded") + except Exception as e: + print(f" Expected error: {e}") + + # Resume and exec works again + await box.resume() + print("Box resumed") + + execution = await box.exec("echo", ["works again!"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() + + await box.stop() + await runtime.remove(box.id, force=False) + + +async def pause_resume_cycles(): + """Multiple pause/resume cycles without corruption.""" + print("\n\n=== Multiple Pause/Resume Cycles ===") + + runtime = boxlite.Boxlite.default() + + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + print(f"Created box: {box.id}") + + execution = await box.exec("echo", ["init"]) + await execution.wait() + + for i in range(3): + await box.pause() + info = box.info() + print(f" Cycle {i}: paused (state={info.state})") + + await box.resume() + execution = await box.exec("echo", [f"cycle-{i}"]) + stdout = execution.stdout() + async for line in stdout: + print(f" Cycle {i}: {line.strip()}") + await execution.wait() + + print("All cycles completed — VM integrity preserved") + + await box.stop() + await runtime.remove(box.id, force=False) + + +async def stop_from_paused(): + """Stop a paused box directly (no need to resume first).""" + print("\n\n=== Stop From Paused State ===") + + runtime = boxlite.Boxlite.default() + + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + execution = await box.exec("echo", ["running"]) + await execution.wait() + + await box.pause() + print(f"State: {box.info().state}") + + # Stop directly from Paused — no resume needed + print("Stopping directly from paused state...") + await box.stop() + + info = await runtime.get_info(box_id) + if info: + print(f"State after stop: {info.state}") + + await runtime.remove(box_id, force=False) + print("Box removed") + + +async def main(): + """Run all pause/resume demonstrations.""" + print("Pause/Resume API Demo") + print("=" * 60) + print("\nKey concepts:") + print(" - pause() freezes VM: zero CPU, memory preserved") + print(" - resume() thaws VM: continues from exact point") + print(" - exec/copy rejected while paused (InvalidState)") + print(" - stop() works directly from paused state") + + await basic_pause_resume() + await exec_blocked_while_paused() + await pause_resume_cycles() + await stop_from_paused() + + print("\n" + "=" * 60) + print("All demos completed!") + print("\nUse cases:") + print(" - Suspend idle AI agent sandboxes (save CPU, keep state)") + print(" - Point-in-time snapshots (pause → snapshot → resume)") + print(" - Resource management (pause low-priority boxes)") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdks/node/lib/simplebox.ts b/sdks/node/lib/simplebox.ts index 40637dfd..33d92782 100644 --- a/sdks/node/lib/simplebox.ts +++ b/sdks/node/lib/simplebox.ts @@ -637,6 +637,55 @@ export class SimpleBox { * console.log('Box stopped'); * ``` */ + /** + * Pause the box (freeze VM, zero CPU, state preserved). + * + * Quiesces guest filesystems, then sends SIGSTOP to freeze all vCPUs. + * The box keeps its memory and state but consumes zero CPU. + * + * Idempotent: calling pause() on a Paused box is a no-op. + * Use resume() to continue execution. + * + * Does nothing if the box was never created. + * + * @example + * ```typescript + * await box.pause(); + * // Box is frozen — zero CPU, memory preserved + * await box.resume(); + * ``` + */ + async pause(): Promise { + if (!this._box) { + return; + } + await this._box.pause(); + } + + /** + * Resume the box from paused state. + * + * Sends SIGCONT to resume vCPUs and thaws guest filesystems. + * The box continues from exactly where it was paused. + * + * Idempotent: calling resume() on a Running box is a no-op. + * + * Does nothing if the box was never created. + * + * @example + * ```typescript + * await box.pause(); + * // ... do something while box is frozen ... + * await box.resume(); + * ``` + */ + async resume(): Promise { + if (!this._box) { + return; + } + await this._box.resume(); + } + async stop(): Promise { if (!this._box) { // Box was never created, nothing to stop diff --git a/sdks/node/src/box_handle.rs b/sdks/node/src/box_handle.rs index d7fc4788..ac335669 100644 --- a/sdks/node/src/box_handle.rs +++ b/sdks/node/src/box_handle.rs @@ -149,6 +149,22 @@ impl JsBox { self.handle.stop().await.map_err(map_err) } + /// Pause the box (freeze VM, zero CPU, state preserved). + /// + /// Idempotent: calling pause() on a Paused box is a no-op. + #[napi] + pub async fn pause(&self) -> Result<()> { + self.handle.pause().await.map_err(map_err) + } + + /// Resume the box from paused state. + /// + /// Idempotent: calling resume() on a Running box is a no-op. + #[napi] + pub async fn resume(&self) -> Result<()> { + self.handle.resume().await.map_err(map_err) + } + /// Get box metrics. #[napi] pub async fn metrics(&self) -> Result { From a8b1ca137593b2a615e07e6153f2ae90bc57769e Mon Sep 17 00:00:00 2001 From: lile Date: Sun, 29 Mar 2026 21:31:30 +0800 Subject: [PATCH 3/3] =?UTF-8?q?fix(litebox):=20address=20review=20feedback?= =?UTF-8?q?=20=E2=80=94=20ESRCH=20races,=20quiesced=20tracking,=20typed=20?= =?UTF-8?q?error=20matching?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Handle ESRCH in SIGSTOP/SIGCONT: if shim dies mid-pause, transition to Stopped instead of returning a confusing Internal error - Handle stop() racing with pause(): if state transitions to Stopping/Stopped during pause, undo SIGSTOP and yield to stop() teardown - Track quiesced flag on BoxState so with_quiesce_async knows whether guest I/O was frozen during an earlier pause(); warn when degrading to crash-consistent (SIGSTOP-only) snapshots - CLI serve: pattern-match on BoxliteError variants instead of string matching for HTTP status classification - Node.js SDK: fix duplicate JSDoc block on stop() - Python SDK: add idempotency docstrings to pause()/resume() - Python example: add try/except cleanup, use info.state.status (not info.state) - Add unit tests for quiesced flag initialization and mark_stop clearing Co-Authored-By: Claude Opus 4.6 --- boxlite-cli/src/commands/serve/mod.rs | 20 +- boxlite/src/litebox/box_impl.rs | 119 +++++++-- boxlite/src/litebox/state.rs | 39 ++- .../python/03_lifecycle/pause_and_resume.py | 239 ++++++++++-------- sdks/node/lib/simplebox.ts | 28 +- sdks/python/src/box_handle.rs | 5 + 6 files changed, 300 insertions(+), 150 deletions(-) diff --git a/boxlite-cli/src/commands/serve/mod.rs b/boxlite-cli/src/commands/serve/mod.rs index 5949a408..871e8ff7 100644 --- a/boxlite-cli/src/commands/serve/mod.rs +++ b/boxlite-cli/src/commands/serve/mod.rs @@ -165,15 +165,17 @@ fn error_response(status: StatusCode, message: impl Into, error_type: &s } fn classify_boxlite_error(err: &boxlite::BoxliteError) -> (StatusCode, &'static str) { - let msg = err.to_string().to_lowercase(); - if msg.contains("not found") { - (StatusCode::NOT_FOUND, "NotFoundError") - } else if msg.contains("already") || msg.contains("conflict") { - (StatusCode::CONFLICT, "ConflictError") - } else if msg.contains("unsupported") { - (StatusCode::BAD_REQUEST, "UnsupportedError") - } else { - (StatusCode::INTERNAL_SERVER_ERROR, "InternalError") + use boxlite::BoxliteError; + match err { + BoxliteError::NotFound(_) => (StatusCode::NOT_FOUND, "NotFoundError"), + BoxliteError::AlreadyExists(_) => (StatusCode::CONFLICT, "ConflictError"), + BoxliteError::InvalidState(_) => (StatusCode::CONFLICT, "InvalidStateError"), + BoxliteError::InvalidArgument(_) => (StatusCode::BAD_REQUEST, "InvalidArgumentError"), + BoxliteError::Unsupported(_) | BoxliteError::UnsupportedEngine => { + (StatusCode::BAD_REQUEST, "UnsupportedError") + } + BoxliteError::Stopped(_) => (StatusCode::CONFLICT, "StoppedError"), + _ => (StatusCode::INTERNAL_SERVER_ERROR, "InternalError"), } } diff --git a/boxlite/src/litebox/box_impl.rs b/boxlite/src/litebox/box_impl.rs index 002b5582..f5aca758 100644 --- a/boxlite/src/litebox/box_impl.rs +++ b/boxlite/src/litebox/box_impl.rs @@ -376,23 +376,59 @@ impl BoxImpl { // SAFETY: sending SIGSTOP to a known valid PID that we own (shim process). let ret = unsafe { libc::kill(pid, libc::SIGSTOP) }; if ret != 0 { + let os_err = std::io::Error::last_os_error(); if frozen { self.guest_thaw().await; } + if os_err.raw_os_error() == Some(libc::ESRCH) { + // Process died between status check and SIGSTOP (stop() raced). + let mut state = self.state.write(); + state.mark_stop(); + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Stopped state after ESRCH"); + } + return Err(BoxliteError::Stopped( + "Shim process died during pause".into(), + )); + } + tracing::error!(box_id = %self.config.id, pid, error = %os_err, "SIGSTOP failed with unexpected error"); return Err(BoxliteError::Internal(format!( "Failed to SIGSTOP shim process (pid={}): {}", - pid, - std::io::Error::last_os_error() + pid, os_err ))); } // Update state - { + let stop_raced = { let mut state = self.state.write(); if let Err(e) = state.transition_to(BoxStatus::Paused) { - tracing::warn!(box_id = %self.config.id, error = %e, "State transition to Paused failed (race?)"); - state.force_status(BoxStatus::Paused); + // If stop() raced and state is already Stopping/Stopped, + // undo our SIGSTOP so stop() can proceed with shutdown. + if matches!(state.status, BoxStatus::Stopping | BoxStatus::Stopped) { + // SAFETY: undo SIGSTOP on the shim so stop() teardown can proceed. + unsafe { + libc::kill(pid, libc::SIGCONT); + } + true + } else { + tracing::warn!(box_id = %self.config.id, error = %e, "State transition to Paused failed (race?)"); + state.force_status(BoxStatus::Paused); + false + } + } else { + false } + }; + // Handle stop() race outside the lock (guest_thaw is async). + if stop_raced { + if frozen { + self.guest_thaw().await; + } + return Err(BoxliteError::Stopped("Box is being stopped".into())); + } + { + let mut state = self.state.write(); + state.quiesced = frozen; if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Paused state"); } @@ -449,10 +485,28 @@ impl BoxImpl { // SAFETY: sending SIGCONT to a known valid PID that we own (shim process). let ret = unsafe { libc::kill(pid, libc::SIGCONT) }; if ret != 0 { + let os_err = std::io::Error::last_os_error(); + // Process vanished while paused (ESRCH) — transition to Stopped + // so the box doesn't stay stuck in Paused forever. + if os_err.raw_os_error() == Some(libc::ESRCH) { + let mut state = self.state.write(); + state.mark_stop(); + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Stopped state after ESRCH"); + } + return Err(BoxliteError::Internal( + "Shim process died while paused".into(), + )); + } + tracing::error!( + box_id = %self.config.id, + pid, + error = %os_err, + "SIGCONT failed with unexpected error" + ); return Err(BoxliteError::Internal(format!( "Failed to SIGCONT shim process (pid={}): {}", - pid, - std::io::Error::last_os_error() + pid, os_err ))); } @@ -460,7 +514,9 @@ impl BoxImpl { if unsafe { libc::kill(pid, 0) } != 0 { let mut state = self.state.write(); state.mark_stop(); - let _ = self.runtime.box_manager.save_box(self.id(), &state); + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Stopped state"); + } return Err(BoxliteError::Internal( "Shim process died while paused".into(), )); @@ -470,9 +526,14 @@ impl BoxImpl { { let mut state = self.state.write(); if let Err(e) = state.transition_to(BoxStatus::Running) { + // If stop() raced and state is already Stopping/Stopped, don't override. + if matches!(state.status, BoxStatus::Stopping | BoxStatus::Stopped) { + return Err(BoxliteError::Stopped("Box is being stopped".into())); + } tracing::warn!(box_id = %self.config.id, error = %e, "State transition to Running failed (race?)"); state.force_status(BoxStatus::Running); } + state.quiesced = false; if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Running state"); } @@ -674,10 +735,12 @@ impl BoxImpl { } // Reject when paused — guest can't handle gRPC file upload while SIGSTOP'd. - if self.state.read().status.is_paused() { - return Err(BoxliteError::InvalidState( - "Cannot copy into box while paused".into(), - )); + let status = self.state.read().status; + if status.is_paused() { + return Err(BoxliteError::InvalidState(format!( + "Cannot copy into box in {} state", + status + ))); } // Ensure box is running @@ -756,10 +819,12 @@ impl BoxImpl { } // Reject when paused — guest can't handle gRPC file download while SIGSTOP'd. - if self.state.read().status.is_paused() { - return Err(BoxliteError::InvalidState( - "Cannot copy from box while paused".into(), - )); + let status = self.state.read().status; + if status.is_paused() { + return Err(BoxliteError::InvalidState(format!( + "Cannot copy from box in {} state", + status + ))); } // Ensure box is running @@ -989,8 +1054,7 @@ impl BoxImpl { "Shim process died while paused, marking box as Stopped" ); let mut state_guard = state.write(); - state_guard.force_status(crate::litebox::BoxStatus::Stopped); - state_guard.set_pid(None); + state_guard.mark_stop(); state_guard.health_status.state = crate::litebox::HealthState::Unhealthy; if let Err(db_err) = runtime.box_manager.save_box(&box_id, &state_guard) { tracing::error!( @@ -1162,16 +1226,17 @@ impl BoxImpl { where Fut: std::future::Future>, { - let (pid, was_running, was_paused) = { + let (pid, was_running, was_paused, was_quiesced) = { let state = self.state.read(); let running = state.status.is_running(); let paused = state.status.is_paused(); + let quiesced = state.quiesced; let pid = if running || paused { state.pid.map(|p| p as i32) } else { None }; - (pid, running, paused) + (pid, running, paused, quiesced) }; let Some(pid) = pid else { @@ -1187,10 +1252,20 @@ impl BoxImpl { let t0 = Instant::now(); // Phase 1: Freeze guest I/O (best-effort, 5s timeout) - // Skip if already paused — guest I/O is already frozen from pause(). + // If user paused and quiesce succeeded during pause(), skip (already frozen). + // If user paused but quiesce failed during pause(), log warning — we cannot + // retry because the process is SIGSTOP'd and cannot respond to gRPC. + // The operation degrades to crash-consistent (SIGSTOP-only). let t_quiesce = Instant::now(); let frozen = if was_paused { - false // Already quiesced by user's pause() + if !was_quiesced { + tracing::warn!( + box_id = %self.id(), + "Box was paused without successful guest quiesce; \ + snapshot/export will be crash-consistent only (SIGSTOP without FIFREEZE)" + ); + } + false } else { self.guest_quiesce().await }; diff --git a/boxlite/src/litebox/state.rs b/boxlite/src/litebox/state.rs index 50ccb257..8087a2cd 100644 --- a/boxlite/src/litebox/state.rs +++ b/boxlite/src/litebox/state.rs @@ -17,10 +17,14 @@ use serde::{Deserialize, Serialize}; /// ```text /// create() → Configured (persisted to DB, no VM) /// start() → Running (VM initialized) -/// SIGSTOP → Paused (VM frozen, used during export/snapshot) -/// SIGCONT → Running (VM resumed) +/// pause() → Paused (VM frozen via SIGSTOP — zero CPU, memory preserved) +/// resume() → Running (VM resumed via SIGCONT) /// stop() → Stopped (VM terminated, can restart) /// ``` +/// +/// The Paused state is used both by the user-facing `pause()`/`resume()` API +/// and internally by the quiesce bracket (`with_quiesce_async`) during +/// export/snapshot/clone operations. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum BoxStatus { @@ -42,7 +46,8 @@ pub enum BoxStatus { Stopped, /// Box VM is frozen via SIGSTOP (all vCPUs and virtio backends paused). - /// Used during export/snapshot for point-in-time consistency. + /// Used by user-facing `pause()`/`resume()` API and internally during + /// export/snapshot for point-in-time consistency. /// Equivalent to Docker's cgroup freezer pause. Paused, } @@ -205,6 +210,11 @@ pub struct BoxState { /// Health status. #[serde(default)] pub health_status: HealthStatus, + /// Whether guest I/O was successfully quiesced (FIFREEZE) during pause(). + /// Runtime-only: not persisted to DB. Used by `with_quiesce_async` to decide + /// whether to skip its own quiesce when the box is already paused. + #[serde(skip)] + pub quiesced: bool, } /// Health status of a box. @@ -302,6 +312,7 @@ impl BoxState { last_updated: Utc::now(), lock_id: None, health_status: HealthStatus::new(), + quiesced: false, } } @@ -352,6 +363,7 @@ impl BoxState { pub fn mark_stop(&mut self) { self.status = BoxStatus::Stopped; self.pid = None; + self.quiesced = false; self.last_updated = Utc::now(); } @@ -364,6 +376,7 @@ impl BoxState { self.status = BoxStatus::Stopped; } self.pid = None; + self.quiesced = false; self.last_updated = Utc::now(); } @@ -978,4 +991,24 @@ mod tests { // Paused boxes cannot be removed (must stop first) assert!(!BoxStatus::Paused.can_remove()); } + + #[test] + fn test_new_state_quiesced_is_false() { + let state = BoxState::new(); + assert!(!state.quiesced); + } + + #[test] + fn test_mark_stop_clears_quiesced() { + let mut state = BoxState::new(); + state.status = BoxStatus::Paused; + state.pid = Some(123); + state.quiesced = true; + + state.mark_stop(); + + assert!(!state.quiesced); + assert_eq!(state.status, BoxStatus::Stopped); + assert_eq!(state.pid, None); + } } diff --git a/examples/python/03_lifecycle/pause_and_resume.py b/examples/python/03_lifecycle/pause_and_resume.py index a844503e..5b69f520 100644 --- a/examples/python/03_lifecycle/pause_and_resume.py +++ b/examples/python/03_lifecycle/pause_and_resume.py @@ -20,45 +20,54 @@ async def basic_pause_resume(): print("\n=== Basic Pause/Resume ===") runtime = boxlite.Boxlite.default() + box = None - box = await runtime.create(boxlite.BoxOptions( - image="alpine:latest", - auto_remove=False, - )) - print(f"Created box: {box.id}") - - # Run a command to verify box is working - execution = await box.exec("echo", ["Box is running"]) - stdout = execution.stdout() - async for line in stdout: - print(f" {line.strip()}") - await execution.wait() - - info = box.info() - print(f"State: {info.state}") - - # Pause — VM frozen, zero CPU usage - print("\nPausing box...") - await box.pause() - info = box.info() - print(f"State after pause: {info.state}") - - # Resume — VM continues from exact point - print("\nResuming box...") - await box.resume() - info = box.info() - print(f"State after resume: {info.state}") - - # Verify box still works - execution = await box.exec("echo", ["Still alive after pause/resume!"]) - stdout = execution.stdout() - async for line in stdout: - print(f" {line.strip()}") - await execution.wait() - - await box.stop() - await runtime.remove(box.id, force=False) - print("\nBox stopped and removed") + try: + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + # Run a command to verify box is working + execution = await box.exec("echo", ["Box is running"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() + + info = box.info() + print(f"State: {info.state.status}") + + # Pause — VM frozen, zero CPU usage + print("\nPausing box...") + await box.pause() + info = box.info() + print(f"State after pause: {info.state.status}") + + # Resume — VM continues from exact point + print("\nResuming box...") + await box.resume() + info = box.info() + print(f"State after resume: {info.state.status}") + + # Verify box still works + execution = await box.exec("echo", ["Still alive after pause/resume!"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() + + await box.stop() + await runtime.remove(box_id, force=False) + print("\nBox stopped and removed") + + except Exception as e: + print(f"\nError: {e}") + if box is not None: + await box.stop() + await runtime.remove(box.id, force=True) async def exec_blocked_while_paused(): @@ -66,39 +75,48 @@ async def exec_blocked_while_paused(): print("\n\n=== Exec Blocked While Paused ===") runtime = boxlite.Boxlite.default() + box = None - box = await runtime.create(boxlite.BoxOptions( - image="alpine:latest", - auto_remove=False, - )) - print(f"Created box: {box.id}") + try: + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + execution = await box.exec("echo", ["ready"]) + await execution.wait() - execution = await box.exec("echo", ["ready"]) - await execution.wait() + await box.pause() + print("Box paused") - await box.pause() - print("Box paused") + # Attempt exec while paused + print("Attempting exec while paused...") + try: + await box.exec("echo", ["should fail"]) + print(" Unexpected: exec succeeded") + except Exception as e: + print(f" Expected error: {e}") - # Attempt exec while paused - print("Attempting exec while paused...") - try: - await box.exec("echo", ["should fail"]) - print(" Unexpected: exec succeeded") - except Exception as e: - print(f" Expected error: {e}") + # Resume and exec works again + await box.resume() + print("Box resumed") - # Resume and exec works again - await box.resume() - print("Box resumed") + execution = await box.exec("echo", ["works again!"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() - execution = await box.exec("echo", ["works again!"]) - stdout = execution.stdout() - async for line in stdout: - print(f" {line.strip()}") - await execution.wait() + await box.stop() + await runtime.remove(box_id, force=False) - await box.stop() - await runtime.remove(box.id, force=False) + except Exception as e: + print(f"\nError: {e}") + if box is not None: + await box.stop() + await runtime.remove(box.id, force=True) async def pause_resume_cycles(): @@ -106,32 +124,41 @@ async def pause_resume_cycles(): print("\n\n=== Multiple Pause/Resume Cycles ===") runtime = boxlite.Boxlite.default() + box = None - box = await runtime.create(boxlite.BoxOptions( - image="alpine:latest", - auto_remove=False, - )) - print(f"Created box: {box.id}") + try: + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + execution = await box.exec("echo", ["init"]) + await execution.wait() - execution = await box.exec("echo", ["init"]) - await execution.wait() + for i in range(3): + await box.pause() + info = box.info() + print(f" Cycle {i}: paused (status={info.state.status})") - for i in range(3): - await box.pause() - info = box.info() - print(f" Cycle {i}: paused (state={info.state})") + await box.resume() + execution = await box.exec("echo", [f"cycle-{i}"]) + stdout = execution.stdout() + async for line in stdout: + print(f" Cycle {i}: {line.strip()}") + await execution.wait() - await box.resume() - execution = await box.exec("echo", [f"cycle-{i}"]) - stdout = execution.stdout() - async for line in stdout: - print(f" Cycle {i}: {line.strip()}") - await execution.wait() + print("All cycles completed — VM integrity preserved") - print("All cycles completed — VM integrity preserved") + await box.stop() + await runtime.remove(box_id, force=False) - await box.stop() - await runtime.remove(box.id, force=False) + except Exception as e: + print(f"\nError: {e}") + if box is not None: + await box.stop() + await runtime.remove(box.id, force=True) async def stop_from_paused(): @@ -139,30 +166,38 @@ async def stop_from_paused(): print("\n\n=== Stop From Paused State ===") runtime = boxlite.Boxlite.default() + box = None - box = await runtime.create(boxlite.BoxOptions( - image="alpine:latest", - auto_remove=False, - )) - box_id = box.id - print(f"Created box: {box_id}") + try: + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + execution = await box.exec("echo", ["running"]) + await execution.wait() - execution = await box.exec("echo", ["running"]) - await execution.wait() + await box.pause() + print(f"State: {box.info().state.status}") - await box.pause() - print(f"State: {box.info().state}") + # Stop directly from Paused — no resume needed + print("Stopping directly from paused state...") + await box.stop() - # Stop directly from Paused — no resume needed - print("Stopping directly from paused state...") - await box.stop() + info = await runtime.get_info(box_id) + if info: + print(f"State after stop: {info.state.status}") - info = await runtime.get_info(box_id) - if info: - print(f"State after stop: {info.state}") + await runtime.remove(box_id, force=False) + print("Box removed") - await runtime.remove(box_id, force=False) - print("Box removed") + except Exception as e: + print(f"\nError: {e}") + if box is not None: + await box.stop() + await runtime.remove(box.id, force=True) async def main(): diff --git a/sdks/node/lib/simplebox.ts b/sdks/node/lib/simplebox.ts index 33d92782..4d4fe15a 100644 --- a/sdks/node/lib/simplebox.ts +++ b/sdks/node/lib/simplebox.ts @@ -623,20 +623,6 @@ export class SimpleBox { return box.metrics(); } - /** - * Stop the box. - * - * Sends a graceful shutdown signal to the VM. If `autoRemove` is true - * (default), the box files will be deleted after stopping. - * - * Does nothing if the box was never created. - * - * @example - * ```typescript - * await box.stop(); - * console.log('Box stopped'); - * ``` - */ /** * Pause the box (freeze VM, zero CPU, state preserved). * @@ -686,6 +672,20 @@ export class SimpleBox { await this._box.resume(); } + /** + * Stop the box. + * + * Sends a graceful shutdown signal to the VM. If `autoRemove` is true + * (default), the box files will be deleted after stopping. + * + * Does nothing if the box was never created. + * + * @example + * ```typescript + * await box.stop(); + * console.log('Box stopped'); + * ``` + */ async stop(): Promise { if (!this._box) { // Box was never created, nothing to stop diff --git a/sdks/python/src/box_handle.rs b/sdks/python/src/box_handle.rs index 32edca5d..9a551269 100644 --- a/sdks/python/src/box_handle.rs +++ b/sdks/python/src/box_handle.rs @@ -107,6 +107,8 @@ impl PyBox { } /// Pause the box (freeze VM, zero CPU, state preserved). + /// + /// Idempotent: calling pause() on a Paused box is a no-op. fn pause<'a>(&self, py: Python<'a>) -> PyResult> { let handle = Arc::clone(&self.handle); @@ -117,6 +119,9 @@ impl PyBox { } /// Resume the box from paused state. + /// + /// Sends SIGCONT to resume vCPUs and thaws guest filesystems. + /// Idempotent: calling resume() on a Running box is a no-op. fn resume<'a>(&self, py: Python<'a>) -> PyResult> { let handle = Arc::clone(&self.handle);