From 34cd8c2a92785ee05a885bb1fc56c8181aff00e6 Mon Sep 17 00:00:00 2001 From: iberi22 <10615454+iberi22@users.noreply.github.com> Date: Thu, 16 Apr 2026 23:38:17 +0000 Subject: [PATCH] chore: final stabilization pass for VFS and Swarm --- ARCHITECTURE.md | 12 +++- gestalt_core/src/application/indexer.rs | 62 +++++++++++++++----- gestalt_core/src/ports/outbound/vfs.rs | 78 +++++++++++++++++++++++++ gestalt_swarm/README.md | 32 ++++++++++ 4 files changed, 168 insertions(+), 16 deletions(-) create mode 100644 gestalt_swarm/README.md diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index 59ceecf..5454936 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -169,8 +169,16 @@ Crates like `gestalt_core` use `thiserror` to define exhaustive error enums: ## Security Considerations -### 1. VFS Isolation -Agents do not write directly to the host filesystem. Every agent operates within an `OverlayFs`. Changes are staged in memory and only flushed to disk after validation or human approval. Path traversal is prevented by strictly validating paths against the workspace root. +### 1. VFS Isolation & Overlay Architecture +The Virtual File System (VFS) provides a critical isolation layer between AI agents and the host operating system. It follows a layered approach: + +- **RealFileSystem**: The base layer that interacts directly with the physical disk using `tokio::fs`. It is typically used for read-only access by agents. +- **OverlayFs**: A specialized implementation that maintains an in-memory "scratchpad" of changes. When an agent writes a file, it is stored in the overlay, not on disk. +- **Merge Logic**: The `OverlayFs::list` and `read` methods perform a union of the disk state and the in-memory state. If a file exists in both, the overlay version takes precedence (Copy-on-Write semantics). + +This architecture ensures that agents can explore and modify the codebase without causing immediate, irreversible changes. 
All staged modifications can be inspected via `pending_changes()` and must be explicitly `flush()`-ed to the physical disk. + +Path traversal is prevented by strictly validating all incoming paths against the configured workspace root. ### 2. API Authentication The `gestalt_timeline` server implements a fail-closed `auth_middleware`. Requests must provide a valid `GESTALT_API_TOKEN` via Bearer header or query parameter. diff --git a/gestalt_core/src/application/indexer.rs b/gestalt_core/src/application/indexer.rs index cedd955..1ab4bad 100644 --- a/gestalt_core/src/application/indexer.rs +++ b/gestalt_core/src/application/indexer.rs @@ -1,8 +1,8 @@ -use anyhow::Result; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use std::path::{Path, PathBuf}; use tempfile::tempdir; +use thiserror::Error; use tracing::{info, warn}; use walkdir::WalkDir; @@ -31,6 +31,23 @@ pub struct DocumentRecord { pub chunks: Vec, } +#[derive(Debug, Error)] +pub enum IndexerError { + #[error("IO error: {0}")] + Io(#[from] std::io::Error), + + #[error("Git error: {0}")] + Git(#[from] git2::Error), + + #[error("Strip prefix error: {0}")] + StripPrefix(#[from] std::path::StripPrefixError), + + #[error("Other error: {0}")] + Other(String), +} + +pub type Result = std::result::Result; + pub struct Indexer { allowlist: Vec, max_file_size: u64, @@ -79,7 +96,9 @@ impl Indexer { url: &str, ) -> Result<(RepositoryMetadata, PathBuf, Option)> { if Path::new(url).exists() { - let path = PathBuf::from(url).canonicalize()?; + let path = PathBuf::from(url) + .canonicalize() + .map_err(IndexerError::Io)?; let name = path .file_name() .unwrap_or_default() @@ -160,8 +179,12 @@ impl Indexer { /// Process a file into chunks and metadata. 
pub fn process_file(&self, root: &Path, file_path: &Path) -> Result { - let relative_path = file_path.strip_prefix(root)?.to_string_lossy().to_string(); - let content = std::fs::read_to_string(file_path)?; + let relative_path = file_path + .strip_prefix(root) + .map_err(IndexerError::StripPrefix)? + .to_string_lossy() + .to_string(); + let content = std::fs::read_to_string(file_path).map_err(IndexerError::Io)?; let mut hasher = Sha256::new(); hasher.update(content.as_bytes()); @@ -229,8 +252,13 @@ impl Indexer { #[async_trait::async_trait] pub trait VectorAdapter: Send + Sync { - async fn index_document(&self, repo_id: &str, doc: DocumentRecord) -> Result<()>; - async fn search(&self, repo_id: &str, query: &str, limit: usize) -> Result>; + async fn index_document(&self, repo_id: &str, doc: DocumentRecord) -> anyhow::Result<()>; + async fn search( + &self, + repo_id: &str, + query: &str, + limit: usize, + ) -> anyhow::Result>; } #[derive(Debug, Clone, Serialize, Deserialize)] @@ -252,7 +280,7 @@ impl SurrealAdapter { #[async_trait::async_trait] impl VectorAdapter for SurrealAdapter { - async fn index_document(&self, repo_id: &str, doc: DocumentRecord) -> Result<()> { + async fn index_document(&self, repo_id: &str, doc: DocumentRecord) -> anyhow::Result<()> { let created_at = chrono::Utc::now().to_rfc3339(); // 1. 
Create document record @@ -293,7 +321,12 @@ impl VectorAdapter for SurrealAdapter { Ok(()) } - async fn search(&self, repo_id: &str, query: &str, limit: usize) -> Result> { + async fn search( + &self, + repo_id: &str, + query: &str, + limit: usize, + ) -> anyhow::Result> { // Simple keyword-based search in SurrealDB as a fallback for pure vector search let sql = "SELECT path, content FROM chunks WHERE doc_id CONTAINS $repo_id AND content CONTAINS $query LIMIT $limit"; let results: Vec = self @@ -337,21 +370,22 @@ mod tests { } #[test] - fn test_scan() { - let dir = tempdir().unwrap(); + fn test_scan() -> anyhow::Result<()> { + let dir = tempdir().expect("failed to create temp dir"); let file_path = dir.path().join("test.rs"); - let mut file = File::create(&file_path).unwrap(); - writeln!(file, "fn main() {{}}").unwrap(); + let mut file = File::create(&file_path).expect("failed to create test file"); + writeln!(file, "fn main() {{}}").expect("failed to write to test file"); let hidden_dir = dir.path().join(".git"); - std::fs::create_dir(&hidden_dir).unwrap(); + std::fs::create_dir(&hidden_dir).expect("failed to create hidden dir"); let hidden_file = hidden_dir.join("config"); - File::create(&hidden_file).unwrap(); + File::create(&hidden_file).expect("failed to create hidden file"); let indexer = Indexer::default(); let files = indexer.scan(dir.path()); assert_eq!(files.len(), 1); assert!(files[0].ends_with("test.rs")); + Ok(()) } } diff --git a/gestalt_core/src/ports/outbound/vfs.rs b/gestalt_core/src/ports/outbound/vfs.rs index 7e4d4f2..49e35bb 100644 --- a/gestalt_core/src/ports/outbound/vfs.rs +++ b/gestalt_core/src/ports/outbound/vfs.rs @@ -57,9 +57,49 @@ pub struct FileWatchEvent { #[async_trait] pub trait VirtualFileSystem: Send + Sync { + /// Reads the content of a file at the given path. 
+ /// + /// # Examples + /// + /// ``` + /// # use std::path::Path; + /// # use gestalt_core::ports::outbound::vfs::{VirtualFileSystem, OverlayFs}; + /// # tokio_test::block_on(async { + /// let vfs = OverlayFs::new(); + /// let data = vfs.read(Path::new("hello.txt")).await; + /// # }); + /// ``` async fn read(&self, path: &Path) -> Result>; + + /// Writes data to a file at the given path, associated with an owner. + /// + /// # Examples + /// + /// ``` + /// # use std::path::Path; + /// # use gestalt_core::ports::outbound::vfs::{VirtualFileSystem, OverlayFs}; + /// # tokio_test::block_on(async { + /// let vfs = OverlayFs::new(); + /// vfs.write(Path::new("hello.txt"), b"world".to_vec(), "agent-1").await.unwrap(); + /// # }); + /// ``` async fn write(&self, path: &Path, data: Vec, owner: &str) -> Result<()>; + + /// Lists entries in the directory at the given path. + /// + /// # Examples + /// + /// ``` + /// # use std::path::Path; + /// # use gestalt_core::ports::outbound::vfs::{VirtualFileSystem, OverlayFs}; + /// # tokio_test::block_on(async { + /// let vfs = OverlayFs::new(); + /// let entries = vfs.list(Path::new(".")).await.unwrap(); + /// # }); + /// ``` async fn list(&self, path: &Path) -> Result>; + + /// Checks if a file or directory exists at the given path. 
async fn exists(&self, path: &Path) -> Result; // Extended/Compatibility methods @@ -711,4 +751,42 @@ mod tests { Ok(()) } + + #[tokio::test] + async fn list_merges_overlay_and_disk_entries() -> Result<()> { + let tmp = tempdir()?; + let dir = tmp.path().to_path_buf(); + let disk_file = dir.join("disk.txt"); + let overlay_file = dir.join("overlay.txt"); + + tokio::fs::write(&disk_file, "disk").await?; + + let vfs = OverlayFs::new(); + vfs.write_string(&overlay_file, "overlay".to_string(), "agent-a") + .await?; + + let entries = vfs.list(&dir).await?; + assert!(entries.contains(&disk_file)); + assert!(entries.contains(&overlay_file)); + assert_eq!(entries.len(), 2); + + Ok(()) + } + + #[tokio::test] + async fn list_handles_non_existent_disk_dir_with_overlay_entries() -> Result<()> { + let tmp = tempdir()?; + let dir = tmp.path().join("ghost_dir"); + + let vfs = OverlayFs::new(); + let overlay_file = dir.join("new.txt"); + vfs.write_string(&overlay_file, "new".to_string(), "agent-a") + .await?; + + let entries = vfs.list(&dir).await?; + assert!(entries.contains(&overlay_file)); + assert_eq!(entries.len(), 1); + + Ok(()) + } } diff --git a/gestalt_swarm/README.md b/gestalt_swarm/README.md new file mode 100644 index 0000000..48444a6 --- /dev/null +++ b/gestalt_swarm/README.md @@ -0,0 +1,32 @@ +# Gestalt Swarm + +Gestalt Swarm is a high-throughput parallel execution bridge for AI agent tasks. It is designed to run many short-lived tasks in parallel, making it ideal for large-scale codebase analysis or refactoring. + +## CLI Usage + +The `swarm` command provides several subcommands for managing and running parallel tasks. + +### 1. Check Status +Verify that the swarm is active and ready to accept tasks. + +```bash +cargo run -p gestalt_swarm -- status +``` + +### 2. Run a Task +Submit a goal to the swarm for parallel execution. + +```bash +cargo run -p gestalt_swarm -- run --goal "Refactor all unwrap() calls in gestalt_core" +``` + +### 3. 
Verbose Output
Use the `--verbose` or `-v` flag to enable debug logging.

```bash
cargo run -p gestalt_swarm -- -v status
```

## Architecture

Swarm utilizes a lead agent to decompose complex goals into smaller, independent tasks, which are then dispatched to a pool of worker agents. It leverages the Virtual File System (VFS) to ensure that parallel modifications do not conflict and can be merged safely.