diff --git a/boxlite-cli/src/commands/serve/handlers/boxes.rs b/boxlite-cli/src/commands/serve/handlers/boxes.rs index 057c8104..e1ecf44b 100644 --- a/boxlite-cli/src/commands/serve/handlers/boxes.rs +++ b/boxlite-cli/src/commands/serve/handlers/boxes.rs @@ -118,6 +118,42 @@ pub(in crate::commands::serve) async fn stop_box( Json(box_info_to_response(&info)).into_response() } +pub(in crate::commands::serve) async fn pause_box( + State(state): State>, + Path(box_id): Path, +) -> Response { + let litebox = match get_or_fetch_box(&state, &box_id).await { + Ok(b) => b, + Err(resp) => return resp, + }; + + if let Err(e) = litebox.pause().await { + let (status, etype) = classify_boxlite_error(&e); + return error_response(status, e.to_string(), etype); + } + + let info = litebox.info(); + Json(box_info_to_response(&info)).into_response() +} + +pub(in crate::commands::serve) async fn resume_box( + State(state): State>, + Path(box_id): Path, +) -> Response { + let litebox = match get_or_fetch_box(&state, &box_id).await { + Ok(b) => b, + Err(resp) => return resp, + }; + + if let Err(e) = litebox.resume().await { + let (status, etype) = classify_boxlite_error(&e); + return error_response(status, e.to_string(), etype); + } + + let info = litebox.info(); + Json(box_info_to_response(&info)).into_response() +} + pub(in crate::commands::serve) async fn remove_box( State(state): State>, Path(box_id): Path, diff --git a/boxlite-cli/src/commands/serve/mod.rs b/boxlite-cli/src/commands/serve/mod.rs index 35d58105..871e8ff7 100644 --- a/boxlite-cli/src/commands/serve/mod.rs +++ b/boxlite-cli/src/commands/serve/mod.rs @@ -165,15 +165,17 @@ fn error_response(status: StatusCode, message: impl Into, error_type: &s } fn classify_boxlite_error(err: &boxlite::BoxliteError) -> (StatusCode, &'static str) { - let msg = err.to_string().to_lowercase(); - if msg.contains("not found") { - (StatusCode::NOT_FOUND, "NotFoundError") - } else if msg.contains("already") || msg.contains("conflict") { - (StatusCode::CONFLICT, "ConflictError") - } else if msg.contains("unsupported") { - (StatusCode::BAD_REQUEST, "UnsupportedError") - } else { - (StatusCode::INTERNAL_SERVER_ERROR, "InternalError") + use boxlite::BoxliteError; + match err { + BoxliteError::NotFound(_) => (StatusCode::NOT_FOUND, "NotFoundError"), + BoxliteError::AlreadyExists(_) => (StatusCode::CONFLICT, "ConflictError"), + BoxliteError::InvalidState(_) => (StatusCode::CONFLICT, "InvalidStateError"), + BoxliteError::InvalidArgument(_) => (StatusCode::BAD_REQUEST, "InvalidArgumentError"), + BoxliteError::Unsupported(_) | BoxliteError::UnsupportedEngine => { + (StatusCode::BAD_REQUEST, "UnsupportedError") + } + BoxliteError::Stopped(_) => (StatusCode::CONFLICT, "StoppedError"), + _ => (StatusCode::INTERNAL_SERVER_ERROR, "InternalError"), } } @@ -241,6 +243,14 @@ fn build_router(state: Arc) -> Router { "/v1/default/boxes/{box_id}/stop", post(boxes::stop_box), ) + .route( + "/v1/default/boxes/{box_id}/pause", + post(boxes::pause_box), + ) + .route( + "/v1/default/boxes/{box_id}/resume", + post(boxes::resume_box), + ) // Box metrics .route( "/v1/default/boxes/{box_id}/metrics", diff --git a/boxlite/src/event_listener/audit_event_listener.rs b/boxlite/src/event_listener/audit_event_listener.rs index 9fadf3f0..4e9489e8 100644 --- a/boxlite/src/event_listener/audit_event_listener.rs +++ b/boxlite/src/event_listener/audit_event_listener.rs @@ -116,6 +116,14 @@ impl EventListener for AuditEventListener { )); } + fn on_box_paused(&self, box_id: &BoxID) { + self.record(AuditEvent::now(box_id.clone(), AuditEventKind::BoxPaused)); + } + + fn on_box_resumed(&self, box_id: &BoxID) { + self.record(AuditEvent::now(box_id.clone(), AuditEventKind::BoxResumed)); + } + fn on_box_removed(&self, box_id: &BoxID) { self.record(AuditEvent::now(box_id.clone(), AuditEventKind::BoxRemoved)); } @@ -229,6 +237,46 @@ mod tests { assert_eq!(listener.len(), 1000); } + #[test] + fn records_pause_resume_events() { + let listener = AuditEventListener::new(); + let id = test_box_id(); + + listener.on_box_started(&id); + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + listener.on_box_stopped(&id, None); + + let events = listener.events(); + assert_eq!(events.len(), 4); + assert!(matches!(events[0].kind, AuditEventKind::BoxStarted)); + assert!(matches!(events[1].kind, AuditEventKind::BoxPaused)); + assert!(matches!(events[2].kind, AuditEventKind::BoxResumed)); + assert!(matches!(events[3].kind, AuditEventKind::BoxStopped { .. })); + } + + #[test] + fn records_multiple_pause_resume_cycles() { + let listener = AuditEventListener::new(); + let id = test_box_id(); + + listener.on_box_started(&id); + // Cycle 1 + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + // Cycle 2 + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + listener.on_box_stopped(&id, Some(0)); + + let events = listener.events(); + assert_eq!(events.len(), 6); + // Verify all events have the same box_id + for event in &events { + assert_eq!(event.box_id, id); + } + } + #[test] fn events_since_filters() { let listener = AuditEventListener::new(); diff --git a/boxlite/src/event_listener/event.rs b/boxlite/src/event_listener/event.rs index 71e23377..9895aff3 100644 --- a/boxlite/src/event_listener/event.rs +++ b/boxlite/src/event_listener/event.rs @@ -42,6 +42,12 @@ pub enum AuditEventKind { /// Box VM stopped. BoxStopped { exit_code: Option }, + /// Box VM paused (SIGSTOP). + BoxPaused, + + /// Box VM resumed from pause (SIGCONT). + BoxResumed, + /// Box removed. BoxRemoved, diff --git a/boxlite/src/event_listener/listener.rs b/boxlite/src/event_listener/listener.rs index 9fe9bfa3..45085310 100644 --- a/boxlite/src/event_listener/listener.rs +++ b/boxlite/src/event_listener/listener.rs @@ -44,6 +44,12 @@ pub trait EventListener: Send + Sync { /// Called after a box VM stops. fn on_box_stopped(&self, _box_id: &BoxID, _exit_code: Option) {} + /// Called after a box VM is paused (SIGSTOP). + fn on_box_paused(&self, _box_id: &BoxID) {} + + /// Called after a box VM is resumed from pause (SIGCONT). + fn on_box_resumed(&self, _box_id: &BoxID) {} + /// Called after a box is removed. fn on_box_removed(&self, _box_id: &BoxID) {} diff --git a/boxlite/src/litebox/box_impl.rs b/boxlite/src/litebox/box_impl.rs index d2c9a1bd..f5aca758 100644 --- a/boxlite/src/litebox/box_impl.rs +++ b/boxlite/src/litebox/box_impl.rs @@ -227,6 +227,15 @@ impl BoxImpl { )); } + // Reject exec on paused boxes — shim can't handle gRPC requests while SIGSTOP'd. + let status = self.state.read().status; + if !status.can_exec() { + return Err(BoxliteError::InvalidState(format!( + "Cannot exec on box in {} state", + status + ))); + } + let live = self.live_state().await?; // Inject container ID into environment if not already set @@ -311,6 +320,236 @@ impl BoxImpl { )) } + /// Pause the box (freeze VM via SIGSTOP). + /// + /// Performs a clean quiesce: + /// 1. Guest filesystem quiesce (FIFREEZE — best-effort) + /// 2. SIGSTOP shim process (pauses all vCPUs and virtio backends) + /// + /// Idempotent: calling pause() on an already-Paused box is a no-op. + /// The box must be Running; other states return InvalidState. + pub(crate) async fn pause(&self) -> BoxliteResult<()> { + // Check if already shutdown + if self.shutdown_token.is_cancelled() { + return Err(BoxliteError::Stopped( + "Handle invalidated after stop(). Use runtime.get() to get a new handle.".into(), + )); + } + + let status = self.state.read().status; + + // Idempotent: already paused + if status.is_paused() { + return Ok(()); + } + + // Only Running boxes can be paused + if !status.can_pause() { + return Err(BoxliteError::InvalidState(format!( + "Cannot pause box in {} state", + status + ))); + } + + let pid = { + let state = self.state.read(); + state + .pid + .map(|p| p as i32) + .ok_or_else(|| BoxliteError::Internal("Box is running but has no PID".into()))? + }; + + // Phase 1: Freeze guest I/O (best-effort, 5s timeout) + let frozen = self.guest_quiesce().await; + + // Re-check shutdown token after async quiesce — stop() may have raced. + if self.shutdown_token.is_cancelled() { + if frozen { + self.guest_thaw().await; + } + return Err(BoxliteError::Stopped( + "Handle invalidated after stop(). Use runtime.get() to get a new handle.".into(), + )); + } + + // Phase 2: SIGSTOP — pause vCPUs + // SAFETY: sending SIGSTOP to a known valid PID that we own (shim process). + let ret = unsafe { libc::kill(pid, libc::SIGSTOP) }; + if ret != 0 { + let os_err = std::io::Error::last_os_error(); + if frozen { + self.guest_thaw().await; + } + if os_err.raw_os_error() == Some(libc::ESRCH) { + // Process died between status check and SIGSTOP (stop() raced). + let mut state = self.state.write(); + state.mark_stop(); + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Stopped state after ESRCH"); + } + return Err(BoxliteError::Stopped( + "Shim process died during pause".into(), + )); + } + tracing::error!(box_id = %self.config.id, pid, error = %os_err, "SIGSTOP failed with unexpected error"); + return Err(BoxliteError::Internal(format!( + "Failed to SIGSTOP shim process (pid={}): {}", + pid, os_err + ))); + } + + // Update state + let stop_raced = { + let mut state = self.state.write(); + if let Err(e) = state.transition_to(BoxStatus::Paused) { + // If stop() raced and state is already Stopping/Stopped, + // undo our SIGSTOP so stop() can proceed with shutdown. + if matches!(state.status, BoxStatus::Stopping | BoxStatus::Stopped) { + // SAFETY: undo SIGSTOP on the shim so stop() teardown can proceed. + unsafe { + libc::kill(pid, libc::SIGCONT); + } + true + } else { + tracing::warn!(box_id = %self.config.id, error = %e, "State transition to Paused failed (race?)"); + state.force_status(BoxStatus::Paused); + false + } + } else { + false + } + }; + // Handle stop() race outside the lock (guest_thaw is async). + if stop_raced { + if frozen { + self.guest_thaw().await; + } + return Err(BoxliteError::Stopped("Box is being stopped".into())); + } + { + let mut state = self.state.write(); + state.quiesced = frozen; + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Paused state"); + } + } + + for listener in &self.event_listeners { + listener.on_box_paused(&self.config.id); + } + + tracing::info!(box_id = %self.config.id, frozen, "Box paused"); + Ok(()) + } + + /// Resume the box from paused state (SIGCONT + thaw). + /// + /// Performs: + /// 1. SIGCONT shim process (resumes vCPUs) + /// 2. Guest filesystem thaw (FITHAW — best-effort) + /// + /// Idempotent: calling resume() on a Running box is a no-op. + /// The box must be Paused; other states return InvalidState. + pub(crate) async fn resume(&self) -> BoxliteResult<()> { + // Check if already shutdown + if self.shutdown_token.is_cancelled() { + return Err(BoxliteError::Stopped( + "Handle invalidated after stop(). Use runtime.get() to get a new handle.".into(), + )); + } + + let status = self.state.read().status; + + // Idempotent: already running + if status.is_running() { + return Ok(()); + } + + // Only Paused boxes can be resumed + if !status.can_resume() { + return Err(BoxliteError::InvalidState(format!( + "Cannot resume box in {} state", + status + ))); + } + + let pid = { + let state = self.state.read(); + state + .pid + .map(|p| p as i32) + .ok_or_else(|| BoxliteError::Internal("Box is paused but has no PID".into()))? + }; + + // Phase 1: SIGCONT — resume vCPUs + // SAFETY: sending SIGCONT to a known valid PID that we own (shim process). + let ret = unsafe { libc::kill(pid, libc::SIGCONT) }; + if ret != 0 { + let os_err = std::io::Error::last_os_error(); + // Process vanished while paused (ESRCH) — transition to Stopped + // so the box doesn't stay stuck in Paused forever. + if os_err.raw_os_error() == Some(libc::ESRCH) { + let mut state = self.state.write(); + state.mark_stop(); + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Stopped state after ESRCH"); + } + return Err(BoxliteError::Internal( + "Shim process died while paused".into(), + )); + } + tracing::error!( + box_id = %self.config.id, + pid, + error = %os_err, + "SIGCONT failed with unexpected error" + ); + return Err(BoxliteError::Internal(format!( + "Failed to SIGCONT shim process (pid={}): {}", + pid, os_err + ))); + } + + // Verify process is alive before transitioning state + if unsafe { libc::kill(pid, 0) } != 0 { + let mut state = self.state.write(); + state.mark_stop(); + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Stopped state"); + } + return Err(BoxliteError::Internal( + "Shim process died while paused".into(), + )); + } + + // Update state + { + let mut state = self.state.write(); + if let Err(e) = state.transition_to(BoxStatus::Running) { + // If stop() raced and state is already Stopping/Stopped, don't override. + if matches!(state.status, BoxStatus::Stopping | BoxStatus::Stopped) { + return Err(BoxliteError::Stopped("Box is being stopped".into())); + } + tracing::warn!(box_id = %self.config.id, error = %e, "State transition to Running failed (race?)"); + state.force_status(BoxStatus::Running); + } + state.quiesced = false; + if let Err(e) = self.runtime.box_manager.save_box(self.id(), &state) { + tracing::warn!(box_id = %self.config.id, error = %e, "Failed to persist Running state"); + } + } + + // Phase 2: Thaw guest I/O (best-effort) + self.guest_thaw().await; + + for listener in &self.event_listeners { + listener.on_box_resumed(&self.config.id); + } + + tracing::info!(box_id = %self.config.id, "Box resumed"); + Ok(()) + } + pub(crate) async fn stop(&self) -> BoxliteResult<()> { let t0 = Instant::now(); @@ -340,6 +579,33 @@ impl BoxImpl { // Cancel the token - signals all in-flight operations to abort self.shutdown_token.cancel(); + // If the box is paused (SIGSTOP'd), we must SIGCONT before attempting + // guest shutdown RPC — a stopped process can't handle gRPC requests. + // Without this, we'd hit the 10s timeout then SIGKILL. + { + let state = self.state.read(); + if state.status == BoxStatus::Paused + && let Some(pid) = state.pid + { + // SAFETY: sending SIGCONT to our own shim process PID. + let ret = unsafe { libc::kill(pid as i32, libc::SIGCONT) }; + if ret != 0 { + tracing::debug!( + box_id = %self.config.id, + pid, + error = %std::io::Error::last_os_error(), + "SIGCONT failed (process may have exited while paused)" + ); + } else { + tracing::debug!( + box_id = %self.config.id, + pid, + "Sent SIGCONT to paused shim before guest shutdown" + ); + } + } + } + // Only try to stop VM if LiveState exists if let Some(live) = self.live.get() { // Gracefully shut down guest (with timeout to avoid hanging on unresponsive guests) @@ -468,6 +734,15 @@ impl BoxImpl { )); } + // Reject when paused — guest can't handle gRPC file upload while SIGSTOP'd. + let status = self.state.read().status; + if status.is_paused() { + return Err(BoxliteError::InvalidState(format!( + "Cannot copy into box in {} state", + status + ))); + } + // Ensure box is running let live = self.live_state().await?; @@ -543,6 +818,15 @@ impl BoxImpl { )); } + // Reject when paused — guest can't handle gRPC file download while SIGSTOP'd. + let status = self.state.read().status; + if status.is_paused() { + return Err(BoxliteError::InvalidState(format!( + "Cannot copy from box in {} state", + status + ))); + } + // Ensure box is running let live = self.live_state().await?; @@ -754,6 +1038,40 @@ impl BoxImpl { } } + // Skip gRPC ping if box is paused — shim can't respond while SIGSTOP'd. + // But verify the process is still alive to detect death during pause. + let (is_paused, paused_pid) = { + let s = state.read(); + (s.status.is_paused(), s.pid) + }; + if is_paused { + if let Some(pid) = paused_pid + && !crate::util::is_process_alive(pid) + { + tracing::error!( + box_id = %box_id, + pid, + "Shim process died while paused, marking box as Stopped" + ); + let mut state_guard = state.write(); + state_guard.mark_stop(); + state_guard.health_status.state = crate::litebox::HealthState::Unhealthy; + if let Err(db_err) = runtime.box_manager.save_box(&box_id, &state_guard) { + tracing::error!( + box_id = %box_id, + error = %db_err, + "Failed to persist dead-while-paused state" + ); + } + break; + } + tracing::debug!( + box_id = %box_id, + "Box is paused, skipping gRPC health check" + ); + continue; + } + let elapsed = start_time.elapsed(); let result = if elapsed < start_period { tracing::debug!( @@ -860,7 +1178,7 @@ impl BoxImpl { let mut state_guard = state.write(); let became_unhealthy = state_guard.mark_health_check_failure(retries); - if let Err(db_err) = runtime.box_manager.save_box(&box_id, &state.read()) { + if let Err(db_err) = runtime.box_manager.save_box(&box_id, &state_guard) { tracing::error!( box_id = %box_id, error = %db_err, @@ -908,52 +1226,72 @@ impl BoxImpl { where Fut: std::future::Future>, { - let (pid, was_running) = { + let (pid, was_running, was_paused, was_quiesced) = { let state = self.state.read(); let running = state.status.is_running(); - let pid = if running { + let paused = state.status.is_paused(); + let quiesced = state.quiesced; + let pid = if running || paused { state.pid.map(|p| p as i32) } else { None }; - (pid, running) + (pid, running, paused, quiesced) }; let Some(pid) = pid else { - if was_running { + if was_running || was_paused { return Err(BoxliteError::Internal( - "Box is running but has no PID".to_string(), + "Box is active but has no PID".to_string(), )); } - // Not running — execute directly, no quiesce needed. + // Not active — execute directly, no quiesce needed. return fut.await; }; let t0 = Instant::now(); // Phase 1: Freeze guest I/O (best-effort, 5s timeout) + // If user paused and quiesce succeeded during pause(), skip (already frozen). + // If user paused but quiesce failed during pause(), log warning — we cannot + // retry because the process is SIGSTOP'd and cannot respond to gRPC. + // The operation degrades to crash-consistent (SIGSTOP-only). let t_quiesce = Instant::now(); - let frozen = self.guest_quiesce().await; + let frozen = if was_paused { + if !was_quiesced { + tracing::warn!( + box_id = %self.id(), + "Box was paused without successful guest quiesce; \ + snapshot/export will be crash-consistent only (SIGSTOP without FIFREEZE)" + ); + } + false + } else { + self.guest_quiesce().await + }; let quiesce_ms = t_quiesce.elapsed().as_millis() as u64; // Phase 2: SIGSTOP — pause vCPUs - // SAFETY: sending SIGSTOP to a known valid PID that we own (shim process). - let ret = unsafe { libc::kill(pid, libc::SIGSTOP) }; - if ret != 0 { - // If SIGSTOP fails, thaw before returning error - if frozen { - self.guest_thaw().await; + // Skip if already paused by user — process is already stopped. + if !was_paused { + // SAFETY: sending SIGSTOP to a known valid PID that we own (shim process). + let ret = unsafe { libc::kill(pid, libc::SIGSTOP) }; + if ret != 0 { + // If SIGSTOP fails, thaw before returning error + if frozen { + self.guest_thaw().await; + } + return Err(BoxliteError::Internal(format!( + "Failed to SIGSTOP shim process (pid={}): {}", + pid, + std::io::Error::last_os_error() + ))); + } + { + let mut state = self.state.write(); + state.force_status(BoxStatus::Paused); + let _ = self.runtime.box_manager.save_box(self.id(), &state); } - return Err(BoxliteError::Internal(format!( - "Failed to SIGSTOP shim process (pid={}): {}", - pid, - std::io::Error::last_os_error() - ))); - } - { - let mut state = self.state.write(); - state.force_status(BoxStatus::Paused); - let _ = self.runtime.box_manager.save_box(self.id(), &state); } // Phase 3: Caller's operation @@ -962,20 +1300,25 @@ impl BoxImpl { let operation_ms = t_op.elapsed().as_millis() as u64; // Phase 4: SIGCONT — resume vCPUs (always, even if f() failed) - // SAFETY: Always send SIGCONT — harmless ESRCH if process already dead. - unsafe { - libc::kill(pid, libc::SIGCONT); - } - // Only transition to Running if process is still alive after resume. - if unsafe { libc::kill(pid, 0) } == 0 { - let mut state = self.state.write(); - state.force_status(BoxStatus::Running); - let _ = self.runtime.box_manager.save_box(self.id(), &state); + // If user had paused the box, leave in Paused state — user must call resume(). + if !was_paused { + // Bracket-initiated pause: resume as before. + // SAFETY: Always send SIGCONT — harmless ESRCH if process already dead. + unsafe { + libc::kill(pid, libc::SIGCONT); + } + // Only transition to Running if process is still alive after resume. + if unsafe { libc::kill(pid, 0) } == 0 { + let mut state = self.state.write(); + state.force_status(BoxStatus::Running); + let _ = self.runtime.box_manager.save_box(self.id(), &state); + } } - // Phase 5: Thaw guest I/O (always, best-effort) + // Phase 5: Thaw guest I/O (best-effort) + // Skip if user had paused — thaw will happen when user calls resume(). let t_thaw = Instant::now(); - if frozen { + if frozen && !was_paused { self.guest_thaw().await; } let thaw_ms = t_thaw.elapsed().as_millis() as u64; @@ -987,6 +1330,7 @@ impl BoxImpl { operation_ms, thaw_ms, frozen, + was_paused, "Quiesce bracket completed" ); @@ -1088,6 +1432,14 @@ impl crate::runtime::backend::BoxBackend for BoxImpl { self.stop().await } + async fn pause(&self) -> BoxliteResult<()> { + self.pause().await + } + + async fn resume(&self) -> BoxliteResult<()> { + self.resume().await + } + async fn copy_into( &self, host_src: &std::path::Path, diff --git a/boxlite/src/litebox/mod.rs b/boxlite/src/litebox/mod.rs index 414590e5..fac0261e 100644 --- a/boxlite/src/litebox/mod.rs +++ b/boxlite/src/litebox/mod.rs @@ -106,6 +106,27 @@ impl LiteBox { self.box_backend.stop().await } + /// Pause the box (freeze VM via SIGSTOP). + /// + /// Quiesces guest filesystems, then sends SIGSTOP to freeze all vCPUs. + /// The box keeps its memory and state but consumes zero CPU. + /// + /// This is idempotent - calling pause() on a Paused box is a no-op. + /// Use resume() to continue execution. + pub async fn pause(&self) -> BoxliteResult<()> { + self.box_backend.pause().await + } + + /// Resume a paused box (SIGCONT + thaw). + /// + /// Sends SIGCONT to resume vCPUs and thaws guest filesystems. + /// The box continues from exactly where it was paused. + /// + /// This is idempotent - calling resume() on a Running box is a no-op. + pub async fn resume(&self) -> BoxliteResult<()> { + self.box_backend.resume().await + } + /// Copy files/directories from host into the container rootfs. pub async fn copy_into( &self, diff --git a/boxlite/src/litebox/state.rs b/boxlite/src/litebox/state.rs index 4e90c546..8087a2cd 100644 --- a/boxlite/src/litebox/state.rs +++ b/boxlite/src/litebox/state.rs @@ -17,10 +17,14 @@ use serde::{Deserialize, Serialize}; /// ```text /// create() → Configured (persisted to DB, no VM) /// start() → Running (VM initialized) -/// SIGSTOP → Paused (VM frozen, used during export/snapshot) -/// SIGCONT → Running (VM resumed) +/// pause() → Paused (VM frozen via SIGSTOP — zero CPU, memory preserved) +/// resume() → Running (VM resumed via SIGCONT) /// stop() → Stopped (VM terminated, can restart) /// ``` +/// +/// The Paused state is used both by the user-facing `pause()`/`resume()` API +/// and internally by the quiesce bracket (`with_quiesce_async`) during +/// export/snapshot/clone operations. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] #[serde(rename_all = "lowercase")] pub enum BoxStatus { @@ -42,7 +46,8 @@ pub enum BoxStatus { Stopped, /// Box VM is frozen via SIGSTOP (all vCPUs and virtio backends paused). - /// Used during export/snapshot for point-in-time consistency. + /// Used by user-facing `pause()`/`resume()` API and internally during + /// export/snapshot for point-in-time consistency. /// Equivalent to Docker's cgroup freezer pause. Paused, } @@ -95,6 +100,18 @@ impl BoxStatus { ) } + /// Check if pause() can be called from this state. + /// Only Running boxes can be paused. + pub fn can_pause(&self) -> bool { + matches!(self, BoxStatus::Running) + } + + /// Check if resume() can be called from this state. + /// Only Paused boxes can be resumed. + pub fn can_resume(&self) -> bool { + matches!(self, BoxStatus::Paused) + } + /// Check if exec() can be called from this state. /// Configured and Stopped will trigger implicit start(). pub fn can_exec(&self) -> bool { @@ -126,8 +143,9 @@ impl BoxStatus { // Stopped → Running (restart) (Stopped, Running) | (Stopped, Unknown) | - // Paused → Running (SIGCONT resume) or Stopped (killed while paused) + // Paused → Running (SIGCONT resume), Stopping (graceful stop), or Stopped (killed while paused) (Paused, Running) | + (Paused, Stopping) | (Paused, Stopped) | (Paused, Unknown) ) @@ -192,6 +210,11 @@ pub struct BoxState { /// Health status. #[serde(default)] pub health_status: HealthStatus, + /// Whether guest I/O was successfully quiesced (FIFREEZE) during pause(). + /// Runtime-only: not persisted to DB. Used by `with_quiesce_async` to decide + /// whether to skip its own quiesce when the box is already paused. + #[serde(skip)] + pub quiesced: bool, } /// Health status of a box. @@ -289,6 +312,7 @@ impl BoxState { last_updated: Utc::now(), lock_id: None, health_status: HealthStatus::new(), + quiesced: false, } } @@ -339,6 +363,7 @@ impl BoxState { pub fn mark_stop(&mut self) { self.status = BoxStatus::Stopped; self.pid = None; + self.quiesced = false; self.last_updated = Utc::now(); } @@ -351,6 +376,7 @@ impl BoxState { self.status = BoxStatus::Stopped; } self.pid = None; + self.quiesced = false; self.last_updated = Utc::now(); } @@ -863,4 +889,126 @@ mod tests { assert_eq!(state.health_status.failures, 0); assert!(state.health_status.last_check.is_none()); } + + // ======================================================================== + // Pause/Resume State Tests + // ======================================================================== + + #[test] + fn test_status_can_pause() { + assert!(!BoxStatus::Configured.can_pause()); + assert!(BoxStatus::Running.can_pause()); + assert!(!BoxStatus::Stopping.can_pause()); + assert!(!BoxStatus::Stopped.can_pause()); + assert!(!BoxStatus::Paused.can_pause()); + assert!(!BoxStatus::Unknown.can_pause()); + } + + #[test] + fn test_status_can_resume() { + assert!(!BoxStatus::Configured.can_resume()); + assert!(!BoxStatus::Running.can_resume()); + assert!(!BoxStatus::Stopping.can_resume()); + assert!(!BoxStatus::Stopped.can_resume()); + assert!(BoxStatus::Paused.can_resume()); + assert!(!BoxStatus::Unknown.can_resume()); + } + + #[test] + fn test_pause_resume_cycle() { + let mut state = BoxState::new(); + + // Configured → Running + assert!(state.transition_to(BoxStatus::Running).is_ok()); + assert_eq!(state.status, BoxStatus::Running); + + // Running → Paused + assert!(state.transition_to(BoxStatus::Paused).is_ok()); + assert_eq!(state.status, BoxStatus::Paused); + + // Paused → Running (resume) + assert!(state.transition_to(BoxStatus::Running).is_ok()); + assert_eq!(state.status, BoxStatus::Running); + + // Running → Paused → Running (second cycle) + assert!(state.transition_to(BoxStatus::Paused).is_ok()); + assert!(state.transition_to(BoxStatus::Running).is_ok()); + assert_eq!(state.status, BoxStatus::Running); + } + + #[test] + fn test_paused_to_stopped() { + let mut state = BoxState::new(); + state.force_status(BoxStatus::Paused); + + // Paused → Stopped (stop while paused) + assert!(state.transition_to(BoxStatus::Stopped).is_ok()); + assert_eq!(state.status, BoxStatus::Stopped); + } + + #[test] + fn test_stopped_cannot_pause() { + // Stopped boxes cannot be paused — must start first + assert!(!BoxStatus::Stopped.can_transition_to(BoxStatus::Paused)); + } + + #[test] + fn test_configured_cannot_pause() { + // Configured boxes cannot be paused — must start first + assert!(!BoxStatus::Configured.can_transition_to(BoxStatus::Paused)); + } + + #[test] + fn test_paused_cannot_exec() { + // Exec is blocked while paused + assert!(!BoxStatus::Paused.can_exec()); + } + + #[test] + fn test_paused_can_stop() { + // Stop is allowed from Paused state + assert!(BoxStatus::Paused.can_stop()); + } + + #[test] + fn test_paused_cannot_start() { + // Start is not allowed from Paused state (use resume instead) + assert!(!BoxStatus::Paused.can_start()); + } + + #[test] + fn test_paused_to_stopping() { + // Paused → Stopping is valid (graceful stop from paused state) + assert!(BoxStatus::Paused.can_transition_to(BoxStatus::Stopping)); + let mut state = BoxState::new(); + state.force_status(BoxStatus::Paused); + assert!(state.transition_to(BoxStatus::Stopping).is_ok()); + assert_eq!(state.status, BoxStatus::Stopping); + } + + #[test] + fn test_paused_cannot_remove() { + // Paused boxes cannot be removed (must stop first) + assert!(!BoxStatus::Paused.can_remove()); + } + + #[test] + fn test_new_state_quiesced_is_false() { + let state = BoxState::new(); + assert!(!state.quiesced); + } + + #[test] + fn test_mark_stop_clears_quiesced() { + let mut state = BoxState::new(); + state.status = BoxStatus::Paused; + state.pid = Some(123); + state.quiesced = true; + + state.mark_stop(); + + assert!(!state.quiesced); + assert_eq!(state.status, BoxStatus::Stopped); + assert_eq!(state.pid, None); + } } diff --git a/boxlite/src/rest/litebox.rs b/boxlite/src/rest/litebox.rs index 9d4681f1..58d3ee27 100644 --- a/boxlite/src/rest/litebox.rs +++ b/boxlite/src/rest/litebox.rs @@ -152,6 +152,24 @@ impl BoxBackend for RestBox { Ok(()) } + async fn pause(&self) -> BoxliteResult<()> { + let box_id = self.box_id_str(); + let path = format!("/boxes/{}/pause", box_id); + let resp: BoxResponse = self.client.post_empty(&path).await?; + let mut info = self.cached_info.write(); + *info = resp.to_box_info(); + Ok(()) + } + + async fn resume(&self) -> BoxliteResult<()> { + let box_id = self.box_id_str(); + let path = format!("/boxes/{}/resume", box_id); + let resp: BoxResponse = self.client.post_empty(&path).await?; + let mut info = self.cached_info.write(); + *info = resp.to_box_info(); + Ok(()) + } + async fn copy_into( &self, host_src: &Path, diff --git a/boxlite/src/runtime/backend.rs b/boxlite/src/runtime/backend.rs index 8c210f3f..9974eea5 100644 --- a/boxlite/src/runtime/backend.rs +++ b/boxlite/src/runtime/backend.rs @@ -82,6 +82,10 @@ pub(crate) trait BoxBackend: Send + Sync { async fn stop(&self) -> BoxliteResult<()>; + async fn pause(&self) -> BoxliteResult<()>; + + async fn resume(&self) -> BoxliteResult<()>; + async fn copy_into( &self, host_src: &Path, diff --git a/boxlite/tests/audit.rs b/boxlite/tests/audit.rs index 4ee95f4a..1bbe4eb6 100644 --- a/boxlite/tests/audit.rs +++ b/boxlite/tests/audit.rs @@ -49,3 +49,69 @@ fn multiple_listeners_all_receive_events() { assert_eq!(l1.events().len(), 1); assert_eq!(l2.events().len(), 1); } + +#[test] +fn audit_event_listener_records_pause_resume() { + let listener = AuditEventListener::new(); + let id = BoxIDMint::mint(); + + listener.on_box_created(&id); + listener.on_box_started(&id); + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + listener.on_box_stopped(&id, Some(0)); + + let events = listener.events(); + assert_eq!(events.len(), 5); + assert!(matches!(events[0].kind, AuditEventKind::BoxCreated)); + assert!(matches!(events[1].kind, AuditEventKind::BoxStarted)); + assert!(matches!(events[2].kind, AuditEventKind::BoxPaused)); + assert!(matches!(events[3].kind, AuditEventKind::BoxResumed)); + assert!(matches!( + events[4].kind, + AuditEventKind::BoxStopped { exit_code: Some(0) } + )); +} + +#[test] +fn pause_resume_via_trait_object() { + let listener: Arc = Arc::new(AuditEventListener::new()); + let id = BoxIDMint::mint(); + + // Verify pause/resume work through dyn trait object + listener.on_box_paused(&id); + listener.on_box_resumed(&id); +} + +#[test] +fn multiple_listeners_all_receive_pause_resume() { + let l1 = Arc::new(AuditEventListener::new()); + let l2 = Arc::new(AuditEventListener::new()); + let listeners: Vec> = vec![l1.clone(), l2.clone()]; + + let id = BoxIDMint::mint(); + for listener in &listeners { + listener.on_box_paused(&id); + listener.on_box_resumed(&id); + } + + assert_eq!(l1.events().len(), 2); + assert_eq!(l2.events().len(), 2); + assert!(matches!(l1.events()[0].kind, AuditEventKind::BoxPaused)); + assert!(matches!(l1.events()[1].kind, AuditEventKind::BoxResumed)); +} + +#[test] +fn pause_resume_events_have_correct_box_id() { + let listener = AuditEventListener::new(); + let id1 = BoxIDMint::mint(); + let id2 = BoxIDMint::mint(); + + listener.on_box_paused(&id1); + listener.on_box_resumed(&id2); + + let events = listener.events(); + assert_eq!(events.len(), 2); + assert_eq!(events[0].box_id, id1); + assert_eq!(events[1].box_id, id2); +} diff --git a/boxlite/tests/pause_resume.rs b/boxlite/tests/pause_resume.rs new file mode 100644 index 00000000..90fe8659 --- /dev/null +++ b/boxlite/tests/pause_resume.rs @@ -0,0 +1,350 @@ +//! Integration tests for the pause/resume API. +//! +//! Tests the high-level `LiteBox::pause()` and `LiteBox::resume()` methods +//! with a real VM (alpine:latest). Validates state transitions, idempotency, +//! exec rejection while paused, and stop-from-paused. +//! +//! Requires a real VM runtime. Run with: +//! +//! ```sh +//! cargo test -p boxlite --test pause_resume +//! ``` + +mod common; + +use boxlite::runtime::options::BoxliteOptions; +use boxlite::runtime::types::BoxStatus; +use boxlite::{BoxCommand, BoxliteRuntime}; + +/// Helper: create a runtime with a per-test home directory. +fn test_runtime() -> (boxlite_test_utils::home::PerTestBoxHome, BoxliteRuntime) { + let home = boxlite_test_utils::home::PerTestBoxHome::new(); + let runtime = BoxliteRuntime::new(BoxliteOptions { + home_dir: home.path.clone(), + image_registries: common::test_registries(), + }) + .expect("create runtime"); + (home, runtime) +} + +#[tokio::test] +async fn pause_freezes_vm_and_resume_restores_it() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-test".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Verify box is responsive + let cmd = BoxCommand::new("echo").args(["before-pause"]); + let mut exec = litebox.exec(cmd).await.expect("exec before pause"); + let result = exec.wait().await.expect("wait before pause"); + assert_eq!(result.exit_code, 0); + + // Pause + litebox.pause().await.expect("pause box"); + assert_eq!(litebox.info().status, BoxStatus::Paused); + + // Resume + litebox.resume().await.expect("resume box"); + assert_eq!(litebox.info().status, BoxStatus::Running); + + // Verify box is still responsive after resume + let cmd = BoxCommand::new("echo").args(["after-resume"]); + let mut exec = litebox.exec(cmd).await.expect("exec after resume"); + let result = exec.wait().await.expect("wait after resume"); + assert_eq!(result.exit_code, 0); + + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn exec_rejected_while_paused() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-exec-test".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + litebox.pause().await.expect("pause box"); + + // Exec should fail with InvalidState + let cmd = BoxCommand::new("echo").args(["should-fail"]); + let err = match litebox.exec(cmd).await { + Err(e) => e, + Ok(_) => panic!("exec should fail while paused"), + }; + let msg = err.to_string(); + assert!( + msg.contains("Paused") || msg.contains("paused") || msg.contains("InvalidState"), + "Expected InvalidState/Paused error, got: {msg}" + ); + + // Resume and verify exec works again + litebox.resume().await.expect("resume box"); + let cmd = BoxCommand::new("echo").args(["works-again"]); + let mut exec = litebox.exec(cmd).await.expect("exec after resume"); + let result = exec.wait().await.expect("wait after resume"); + assert_eq!(result.exit_code, 0); + + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn pause_is_idempotent() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-idempotent".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Pause twice — second call should be a no-op + litebox.pause().await.expect("first pause"); + assert_eq!(litebox.info().status, BoxStatus::Paused); + litebox.pause().await.expect("second pause (idempotent)"); + assert_eq!(litebox.info().status, BoxStatus::Paused); + + litebox.resume().await.expect("resume"); + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn resume_is_idempotent() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("resume-idempotent".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Resume on a Running box should be a no-op + litebox + .resume() + .await + .expect("resume on running (idempotent)"); + assert_eq!(litebox.info().status, BoxStatus::Running); + + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn stop_from_paused_state() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-stop".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + litebox.pause().await.expect("pause box"); + assert_eq!(litebox.info().status, BoxStatus::Paused); + + // Stop directly from Paused should work + litebox.stop().await.expect("stop from paused"); + + let info = runtime + .get_info(litebox.id().as_str()) + .await + .expect("get info") + .expect("box should exist"); + assert_eq!(info.status, BoxStatus::Stopped); + + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn multiple_pause_resume_cycles() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-cycles".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + for i in 0..3 { + litebox + .pause() + .await + .unwrap_or_else(|e| panic!("pause cycle {i}: {e}")); + assert_eq!(litebox.info().status, BoxStatus::Paused); + + litebox + .resume() + .await + .unwrap_or_else(|e| panic!("resume cycle {i}: {e}")); + assert_eq!(litebox.info().status, BoxStatus::Running); + + // Verify VM is responsive after each cycle + let cmd = BoxCommand::new("echo").args([format!("cycle-{i}")]); + let mut exec = litebox + .exec(cmd) + .await + .unwrap_or_else(|e| panic!("exec cycle {i}: {e}")); + let result = exec + .wait() + .await + .unwrap_or_else(|e| panic!("wait cycle {i}: {e}")); + assert_eq!(result.exit_code, 0, "command failed in cycle {i}"); + } + + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn resume_on_stopped_box_returns_error() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("resume-stopped".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + litebox.stop().await.expect("stop box"); + + // Resume on a Stopped box should fail + let err = match litebox.resume().await { + Err(e) => e, + Ok(()) => panic!("resume should fail on stopped box"), + }; + let msg = err.to_string(); + assert!( + msg.contains("stop") || msg.contains("Stop") || msg.contains("invalidated"), + "Expected stopped/invalidated error, got: {msg}" + ); + + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn copy_into_rejected_while_paused() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-copy-in".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Create a temp file to copy + let tmp = std::env::temp_dir().join("boxlite-test-copy-pause"); + std::fs::write(&tmp, b"test").expect("write temp file"); + + litebox.pause().await.expect("pause box"); + + // copy_into should fail while paused + let err = match litebox + .copy_into(&tmp, "/tmp/test", Default::default()) + .await + { + Err(e) => e, + Ok(()) => panic!("copy_into should fail while paused"), + }; + let msg = err.to_string(); + assert!( + msg.contains("paused") || msg.contains("Paused"), + "Expected paused error, got: {msg}" + ); + + // Resume and verify copy works + litebox.resume().await.expect("resume box"); + litebox + .copy_into(&tmp, "/tmp/test", Default::default()) + .await + .expect("copy_into after resume"); + + let _ = std::fs::remove_file(&tmp); + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn copy_out_rejected_while_paused() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-copy-out".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + + // Create a file inside the box to copy out + let cmd = BoxCommand::new("sh").args(["-c", "echo test > /tmp/testfile"]); + let mut exec = litebox.exec(cmd).await.expect("create file"); + exec.wait().await.expect("wait create file"); + + litebox.pause().await.expect("pause box"); + + let host_dst = std::env::temp_dir().join("boxlite-test-copy-out-pause"); + + // copy_out should fail while paused + let err = match litebox + .copy_out("/tmp/testfile", &host_dst, Default::default()) + .await + { + Err(e) => e, + Ok(()) => panic!("copy_out should fail while paused"), + }; + let msg = err.to_string(); + assert!( + msg.contains("paused") || msg.contains("Paused"), + "Expected paused error, got: {msg}" + ); + + // Resume and verify copy works + litebox.resume().await.expect("resume box"); + litebox + .copy_out("/tmp/testfile", &host_dst, Default::default()) + .await + .expect("copy_out after resume"); + + let _ = std::fs::remove_file(&host_dst); + litebox.stop().await.expect("stop box"); + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} + +#[tokio::test] +async fn pause_on_stopped_box_returns_error() { + let (_home, runtime) = test_runtime(); + + let litebox = runtime + .create(common::alpine_opts(), Some("pause-stopped".into())) + .await + .expect("create box"); + + litebox.start().await.expect("start box"); + litebox.stop().await.expect("stop box"); + + // Pause on a Stopped box should fail + let err = match litebox.pause().await { + Err(e) => e, + Ok(()) => panic!("pause should fail on stopped box"), + }; + let msg = err.to_string(); + assert!( + msg.contains("stop") || msg.contains("Stop") || msg.contains("invalidated"), + "Expected stopped/invalidated error, got: {msg}" + ); + + let _ = runtime.shutdown(Some(common::TEST_SHUTDOWN_TIMEOUT)).await; +} diff --git a/examples/python/03_lifecycle/pause_and_resume.py b/examples/python/03_lifecycle/pause_and_resume.py new file mode 100644 index 00000000..5b69f520 --- /dev/null +++ b/examples/python/03_lifecycle/pause_and_resume.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python3 +""" +Pause and Resume Example - Zero-CPU VM Freezing + +Demonstrates the pause/resume API: +- pause(): Freezes VM (SIGSTOP) — zero CPU, memory preserved +- resume(): Thaws VM (SIGCONT) — continues from exact point +- Idempotent: pause on paused = no-op, resume on running = no-op +- Exec rejected while paused (InvalidState) +- Stop works directly from paused state +""" + +import asyncio + +import boxlite + + +async def basic_pause_resume(): + """Pause a box, then resume and verify it still works.""" + print("\n=== Basic Pause/Resume ===") + + runtime = boxlite.Boxlite.default() + box = None + + try: + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + # Run a command to verify box is working + execution = await box.exec("echo", ["Box is running"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() + + info = box.info() + print(f"State: {info.state.status}") + + # Pause — VM frozen, zero CPU usage + print("\nPausing box...") + await box.pause() + info = box.info() + print(f"State after pause: {info.state.status}") + + # Resume — VM continues from exact point + print("\nResuming box...") + await box.resume() + info = box.info() + print(f"State after resume: {info.state.status}") + + # Verify box still works + execution = await box.exec("echo", ["Still alive after pause/resume!"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() + + await box.stop() + await runtime.remove(box_id, force=False) + print("\nBox stopped and removed") + + except Exception as e: + print(f"\nError: {e}") + if box is not None: + await box.stop() + await runtime.remove(box.id, force=True) + + +async def exec_blocked_while_paused(): + """Show that exec is rejected while the box is paused.""" + print("\n\n=== Exec Blocked While Paused ===") + + runtime = boxlite.Boxlite.default() + box = None + + try: + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + execution = await box.exec("echo", ["ready"]) + await execution.wait() + + await box.pause() + print("Box paused") + + # Attempt exec while paused + print("Attempting exec while paused...") + try: + await box.exec("echo", ["should fail"]) + print(" Unexpected: exec succeeded") + except Exception as e: + print(f" Expected error: {e}") + + # Resume and exec works again + await box.resume() + print("Box resumed") + + execution = await box.exec("echo", ["works again!"]) + stdout = execution.stdout() + async for line in stdout: + print(f" {line.strip()}") + await execution.wait() + + await box.stop() + await runtime.remove(box_id, force=False) + + except Exception as e: + print(f"\nError: {e}") + if box is not None: + await box.stop() + await runtime.remove(box.id, force=True) + + +async def pause_resume_cycles(): + """Multiple pause/resume cycles without corruption.""" + print("\n\n=== Multiple Pause/Resume Cycles ===") + + runtime = boxlite.Boxlite.default() + box = None + + try: + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + execution = await box.exec("echo", ["init"]) + await execution.wait() + + for i in range(3): + await box.pause() + info = box.info() + print(f" Cycle {i}: paused (status={info.state.status})") + + await box.resume() + execution = await box.exec("echo", [f"cycle-{i}"]) + stdout = execution.stdout() + async for line in stdout: + print(f" Cycle {i}: {line.strip()}") + await execution.wait() + + print("All cycles completed — VM integrity preserved") + + await box.stop() + await runtime.remove(box_id, force=False) + + except Exception as e: + print(f"\nError: {e}") + if box is not None: + await box.stop() + await runtime.remove(box.id, force=True) + + +async def stop_from_paused(): + """Stop a paused box directly (no need to resume first).""" + print("\n\n=== Stop From Paused State ===") + + runtime = boxlite.Boxlite.default() + box = None + + try: + box = await runtime.create(boxlite.BoxOptions( + image="alpine:latest", + auto_remove=False, + )) + box_id = box.id + print(f"Created box: {box_id}") + + execution = await box.exec("echo", ["running"]) + await execution.wait() + + await box.pause() + print(f"State: {box.info().state.status}") + + # Stop directly from Paused — no resume needed + print("Stopping directly from paused state...") + await box.stop() + + info = await runtime.get_info(box_id) + if info: + print(f"State after stop: {info.state.status}") + + await runtime.remove(box_id, force=False) + print("Box removed") + + except Exception as e: + print(f"\nError: {e}") + if box is not None: + await box.stop() + await runtime.remove(box.id, force=True) + + +async def main(): + """Run all pause/resume demonstrations.""" + print("Pause/Resume API Demo") + print("=" * 60) + print("\nKey concepts:") + print(" - pause() freezes VM: zero CPU, memory preserved") + print(" - resume() thaws VM: continues from exact point") + print(" - exec/copy rejected while paused (InvalidState)") + print(" - stop() works directly from paused state") + + await basic_pause_resume() + await exec_blocked_while_paused() + await pause_resume_cycles() + await stop_from_paused() + + print("\n" + "=" * 60) + print("All demos completed!") + print("\nUse cases:") + print(" - Suspend idle AI agent sandboxes (save CPU, keep state)") + print(" - Point-in-time snapshots (pause → snapshot → resume)") + print(" - Resource management (pause low-priority boxes)") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/sdks/node/lib/simplebox.ts b/sdks/node/lib/simplebox.ts index 40637dfd..4d4fe15a 100644 --- a/sdks/node/lib/simplebox.ts +++ b/sdks/node/lib/simplebox.ts @@ -623,6 +623,55 @@ export class SimpleBox { return box.metrics(); } + /** + * Pause the box (freeze VM, zero CPU, state preserved). + * + * Quiesces guest filesystems, then sends SIGSTOP to freeze all vCPUs. + * The box keeps its memory and state but consumes zero CPU. + * + * Idempotent: calling pause() on a Paused box is a no-op. + * Use resume() to continue execution. + * + * Does nothing if the box was never created. + * + * @example + * ```typescript + * await box.pause(); + * // Box is frozen — zero CPU, memory preserved + * await box.resume(); + * ``` + */ + async pause(): Promise { + if (!this._box) { + return; + } + await this._box.pause(); + } + + /** + * Resume the box from paused state. + * + * Sends SIGCONT to resume vCPUs and thaws guest filesystems. + * The box continues from exactly where it was paused. + * + * Idempotent: calling resume() on a Running box is a no-op. + * + * Does nothing if the box was never created. + * + * @example + * ```typescript + * await box.pause(); + * // ... do something while box is frozen ... + * await box.resume(); + * ``` + */ + async resume(): Promise { + if (!this._box) { + return; + } + await this._box.resume(); + } + /** * Stop the box. * diff --git a/sdks/node/src/box_handle.rs b/sdks/node/src/box_handle.rs index d7fc4788..ac335669 100644 --- a/sdks/node/src/box_handle.rs +++ b/sdks/node/src/box_handle.rs @@ -149,6 +149,22 @@ impl JsBox { self.handle.stop().await.map_err(map_err) } + /// Pause the box (freeze VM, zero CPU, state preserved). + /// + /// Idempotent: calling pause() on a Paused box is a no-op. + #[napi] + pub async fn pause(&self) -> Result<()> { + self.handle.pause().await.map_err(map_err) + } + + /// Resume the box from paused state. + /// + /// Idempotent: calling resume() on a Running box is a no-op. + #[napi] + pub async fn resume(&self) -> Result<()> { + self.handle.resume().await.map_err(map_err) + } + /// Get box metrics. #[napi] pub async fn metrics(&self) -> Result { diff --git a/sdks/python/src/box_handle.rs b/sdks/python/src/box_handle.rs index a2c2ad20..9a551269 100644 --- a/sdks/python/src/box_handle.rs +++ b/sdks/python/src/box_handle.rs @@ -106,6 +106,31 @@ impl PyBox { }) } + /// Pause the box (freeze VM, zero CPU, state preserved). + /// + /// Idempotent: calling pause() on a Paused box is a no-op. + fn pause<'a>(&self, py: Python<'a>) -> PyResult> { + let handle = Arc::clone(&self.handle); + + pyo3_async_runtimes::tokio::future_into_py(py, async move { + handle.pause().await.map_err(map_err)?; + Ok(()) + }) + } + + /// Resume the box from paused state. + /// + /// Sends SIGCONT to resume vCPUs and thaws guest filesystems. + /// Idempotent: calling resume() on a Running box is a no-op. + fn resume<'a>(&self, py: Python<'a>) -> PyResult> { + let handle = Arc::clone(&self.handle); + + pyo3_async_runtimes::tokio::future_into_py(py, async move { + handle.resume().await.map_err(map_err)?; + Ok(()) + }) + } + fn metrics<'a>(&self, py: Python<'a>) -> PyResult> { let handle = Arc::clone(&self.handle);