From a8c71d3bd75f115f5404388212355e06ee26e511 Mon Sep 17 00:00:00 2001 From: Joel Teply Date: Sat, 25 Apr 2026 12:34:38 -0500 Subject: [PATCH] test(sentinel/checkpoint): isolate via CONTINUUM_CHECKPOINT_DIR env override MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Empirical hit on PR #950 prepush: cargo test --lib panicked in 3 checkpoint tests with "Failed to write checkpoint tmp file: Permission denied (os error 13)". Root cause: ~/.continuum/sentinel/checkpoints/ was owned by root:root with mode 755 on this dev machine — left over from a prior docker container that mounted $HOME and chmod'd the dir under root. Tests wrote to that production path, no isolation. Two-part fix in one commit, both in checkpoint.rs: 1. checkpoints_dir() now honors CONTINUUM_CHECKPOINT_DIR env override. Production code still falls through to ~/.continuum/sentinel/ checkpoints when the var is unset (no behavior change in prod). 2. Tests share a process-global tempfile::TempDir via OnceLock, exposed through ensure_test_checkpoint_dir() called at the top of each test. Cargo runs tests in parallel within one process and set_var is process-global — a per-test TempDir would race. Shared dir is fine because tests use UUID-derived handles; collisions structurally don't happen. Validated locally: cargo test --lib --features cuda,load-dynamic-ort → 1829 passed; 0 failed; 28 ignored (was: 1826 passed; 3 failed in checkpoint tests on this machine.) Why this matters beyond the local fix: the checkpoint module's hard- coded ~/.continuum dependency is a latent foot-gun for ANY contributor whose dev box has had a root-mounted docker run over $HOME — a common state given how often we run docker compose with bind-mounts. Env override + tempdir isolation makes the test suite portable. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/modules/sentinel/checkpoint.rs | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/src/workers/continuum-core/src/modules/sentinel/checkpoint.rs b/src/workers/continuum-core/src/modules/sentinel/checkpoint.rs index b0594c5ef..2a8d1a919 100644 --- a/src/workers/continuum-core/src/modules/sentinel/checkpoint.rs +++ b/src/workers/continuum-core/src/modules/sentinel/checkpoint.rs @@ -7,8 +7,17 @@ use std::path::PathBuf; use super::types::{PipelineCheckpoint, PipelineStatus}; -/// Base directory for checkpoint storage +/// Base directory for checkpoint storage. +/// +/// Default: `~/.continuum/sentinel/checkpoints`. +/// Overridable via `CONTINUUM_CHECKPOINT_DIR` env var — used by tests to +/// isolate checkpoint state from the user's real `~/.continuum` (and to +/// survive the case where root-owned directories from a previous docker +/// container run leave the default path unwritable for the dev user). fn checkpoints_dir() -> PathBuf { + if let Ok(override_dir) = std::env::var("CONTINUUM_CHECKPOINT_DIR") { + return PathBuf::from(override_dir); + } let home = dirs::home_dir().expect("Failed to resolve home directory"); home.join(".continuum").join("sentinel").join("checkpoints") } @@ -137,6 +146,35 @@ mod tests { use super::*; use crate::modules::sentinel::types::*; use std::collections::HashMap; + use std::sync::OnceLock; + use tempfile::TempDir; + + /// Process-global tempdir for checkpoint tests, lazily initialized on + /// first access. All tests in this module share it — they use unique + /// UUID-derived handles, so file collisions don't happen. This isolates + /// the test runs from the user's real `~/.continuum/sentinel/checkpoints` + /// (which may be root-owned-and-unwritable on a dev box that previously + /// ran a docker container that mounted $HOME and chmod'd the dir under + /// root). + /// + /// Stored in a static so the TempDir is dropped (and cleaned up) only + /// when the test process exits — a per-test TempDir would race with + /// cargo's default parallel test execution since `set_var` is process- + /// global. + fn ensure_test_checkpoint_dir() { + static TMPDIR: OnceLock = OnceLock::new(); + let dir = TMPDIR.get_or_init(|| { + tempfile::tempdir().expect("Failed to create test checkpoint tempdir") + }); + // SAFETY: set_var is unsafe in newer Rust (process-global, racy with + // other threads reading env). Tests in this module only ever write + // the SAME path, so concurrent setters write the same value — race- + // free in practice. + std::env::set_var( + "CONTINUUM_CHECKPOINT_DIR", + dir.path(), + ); + } fn make_test_checkpoint(handle: &str) -> PipelineCheckpoint { PipelineCheckpoint { @@ -189,6 +227,7 @@ mod tests { #[test] fn test_save_load_checkpoint() { + ensure_test_checkpoint_dir(); let handle = format!( "test-ckpt-{}", uuid::Uuid::new_v4().to_string()[..8].to_string() @@ -208,6 +247,7 @@ mod tests { #[test] fn test_list_checkpoints() { + ensure_test_checkpoint_dir(); let handle = format!( "test-list-{}", uuid::Uuid::new_v4().to_string()[..8].to_string() @@ -223,6 +263,7 @@ mod tests { #[test] fn test_recover_interrupted() { + ensure_test_checkpoint_dir(); let handle = format!( "test-recover-{}", uuid::Uuid::new_v4().to_string()[..8].to_string()