From fee4e26eaf2491fe8a4f729b2e1073633fc4d2aa Mon Sep 17 00:00:00 2001 From: Vincent Prouillet Date: Thu, 29 Jan 2026 10:32:26 +0100 Subject: [PATCH 1/3] Migrate in lib --- Cargo.lock | 26 ++++ Cargo.toml | 1 + dvs/Cargo.toml | 1 + dvs/src/config.rs | 4 +- dvs/src/file.rs | 7 +- dvs/src/hashes.rs | 28 ++-- dvs/src/lib.rs | 1 + dvs/src/migrate.rs | 367 +++++++++++++++++++++++++++++++++++++++++++++ dvs/src/paths.rs | 2 +- 9 files changed, 422 insertions(+), 15 deletions(-) create mode 100644 dvs/src/migrate.rs diff --git a/Cargo.lock b/Cargo.lock index c4c0614..5706cf8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -218,6 +218,7 @@ dependencies = [ "nix", "serde", "serde_json", + "serde_yaml", "tempfile", "toml", "uuid", @@ -557,6 +558,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" +[[package]] +name = "ryu" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a50f4cf475b65d88e057964e0e9bb1f0aa9bbb2036dc65c64596b42932536984" + [[package]] name = "same-file" version = "1.0.6" @@ -618,6 +625,19 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "shlex" version = "1.3.0" @@ -699,6 +719,12 @@ version = "1.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "utf8parse" version = "0.2.2" diff --git 
a/Cargo.toml b/Cargo.toml index de01ac7..44c6cfc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,3 +34,4 @@ jiff = "0.2.18" anyhow = "1.0.100" whoami = "2" uuid = "1.20.0" +serde_yaml = "0.9" diff --git a/dvs/Cargo.toml b/dvs/Cargo.toml index 411782a..eff1a50 100644 --- a/dvs/Cargo.toml +++ b/dvs/Cargo.toml @@ -21,6 +21,7 @@ md5.workspace = true jiff.workspace = true anyhow.workspace = true whoami.workspace = true +serde_yaml.workspace = true uuid = { version = "1.20.0", features = ["v4"] } [target.'cfg(unix)'.dependencies] diff --git a/dvs/src/config.rs b/dvs/src/config.rs index b9e2520..54d44c2 100644 --- a/dvs/src/config.rs +++ b/dvs/src/config.rs @@ -2,7 +2,7 @@ use std::path::Path; use crate::backends::Backend as BackendTrait; use crate::backends::local::LocalBackend; -use crate::paths::{CONFIG_FILE_NAME, DEFAULT_FOLDER_NAME, find_repo_root}; +use crate::paths::{CONFIG_FILE_NAME, DEFAULT_METADATA_FOLDER_NAME, find_repo_root}; use anyhow::{Context, Result}; use fs_err as fs; use serde::{Deserialize, Serialize}; @@ -70,7 +70,7 @@ impl Config { if let Some(name) = &self.metadata_folder_name { name.as_str() } else { - DEFAULT_FOLDER_NAME + DEFAULT_METADATA_FOLDER_NAME } } diff --git a/dvs/src/file.rs b/dvs/src/file.rs index 36ae395..e899fdb 100644 --- a/dvs/src/file.rs +++ b/dvs/src/file.rs @@ -1,3 +1,4 @@ +use std::io::BufReader; use std::path::{Path, PathBuf}; use crate::audit::{AuditEntry, AuditFile}; @@ -56,9 +57,9 @@ impl FileMetadata { bail!("Path {} is not a file", path.as_ref().display()); } - let content = fs::read(path.as_ref())?; - let size = content.len() as u64; - let hashes = Hashes::from(content); + let file = fs::File::open(path.as_ref())?; + let size = file.metadata()?.len(); + let hashes = Hashes::from_reader(BufReader::new(file))?; let created_by = whoami::username()?; let add_time = jiff::Zoned::now().to_string(); diff --git a/dvs/src/hashes.rs b/dvs/src/hashes.rs index ef91e37..b2bba3d 100644 --- a/dvs/src/hashes.rs +++ b/dvs/src/hashes.rs @@ 
-1,5 +1,6 @@ use serde::{Deserialize, Serialize}; use std::fmt::Display; +use std::io::BufRead; #[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Copy)] #[serde(rename_all = "lowercase")] @@ -17,19 +18,28 @@ pub struct Hashes { pub md5: String, } -impl From> for Hashes { - fn from(bytes: Vec) -> Self { - let blake3_hash = format!("{}", blake3::hash(&bytes)); - let md5_hash = format!("{:x}", md5::compute(&bytes)); +impl Hashes { + pub fn from_reader(mut reader: R) -> std::io::Result { + let mut blake3_hasher = blake3::Hasher::new(); + let mut md5_context = md5::Context::new(); - Self { - blake3: blake3_hash, - md5: md5_hash, + loop { + let buf = reader.fill_buf()?; + if buf.is_empty() { + break; + } + blake3_hasher.update(buf); + md5_context.consume(buf); + let len = buf.len(); + reader.consume(len); } + + Ok(Self { + blake3: blake3_hasher.finalize().to_string(), + md5: format!("{:x}", md5_context.finalize()), + }) } -} -impl Hashes { pub fn get_by_alg(&self, alg: HashAlg) -> &str { match alg { HashAlg::Blake3 => &self.blake3, diff --git a/dvs/src/lib.rs b/dvs/src/lib.rs index 96c8b2d..28581af 100644 --- a/dvs/src/lib.rs +++ b/dvs/src/lib.rs @@ -4,6 +4,7 @@ pub mod config; pub mod file; mod hashes; pub mod init; +pub mod migrate; pub mod paths; pub use backends::Backend; diff --git a/dvs/src/migrate.rs b/dvs/src/migrate.rs new file mode 100644 index 0000000..1db2795 --- /dev/null +++ b/dvs/src/migrate.rs @@ -0,0 +1,367 @@ +use std::io::BufReader; +use std::path::{Path, PathBuf}; + +use anyhow::{Result, bail}; +use fs_err as fs; +use serde::Deserialize; +use walkdir::WalkDir; + +use crate::FileMetadata; +use crate::config::Config; +use crate::hashes::Hashes; +use crate::paths::DEFAULT_METADATA_FOLDER_NAME; + +#[derive(Deserialize)] +struct V1Config { + storage_dir: PathBuf, + permissions: Option, + group: Option, +} + +impl V1Config { + pub fn migrate(self) -> Result { + let perms = self.permissions.map(|p| format!("{p}")); + 
Config::new_local(self.storage_dir, perms, self.group) + } +} + +#[derive(Deserialize)] +struct V1Metadata { + blake3_checksum: String, + size: u64, + add_time: String, + message: Option, + saved_by: String, +} + +impl V1Metadata { + pub fn migrate(self, md5: String) -> FileMetadata { + FileMetadata { + hashes: Hashes { + blake3: self.blake3_checksum, + md5, + }, + size: self.size, + created_by: self.saved_by, + add_time: self.add_time, + message: self.message, + } + } +} + +fn delete_files(files: &[PathBuf]) { + for path in files { + // we ignore errors, not much we can do about it + let _ = fs::remove_file(path); + } +} + +/// Migrate a DVS v1 repository to v2 format. +/// +/// This will: +/// 1. Convert `dvs.yaml` to `dvs.toml` +/// 2. Update all `.dvs` metadata files to the v2 format (adding MD5 hashes) and save them to strings +/// 3. Save all the new files +/// 3. Delete all the v1 files (config + .dvs metadata files) +/// +/// The migration is atomic: all new files are written before any old files are deleted. +/// If any write fails, all newly written files are cleaned up. +/// If a file in the storage doesn't match the hash from the metadata file, the process fill fail. +pub fn migrate(root: impl AsRef) -> Result { + let root = fs::canonicalize(root.as_ref())?; + let yaml_path = root.join("dvs.yaml"); + let toml_path = root.join("dvs.toml"); + + // 1. Validate: dvs.yaml must exist, dvs.toml must not + if !yaml_path.exists() { + bail!("No dvs.yaml found - not a DVS v1 repository"); + } + if toml_path.exists() { + bail!("dvs.toml already exists - repository already migrated?"); + } + + // 2. 
Parse config (don't write yet) + let yaml_content = fs::read_to_string(&yaml_path)?; + let old_config: V1Config = serde_yaml::from_str(&yaml_content)?; + let storage_dir = old_config.storage_dir.clone(); + let new_config = old_config.migrate()?; + + // Collect files to write and delete + let mut files_to_write: Vec<(PathBuf, String)> = vec![]; + let mut files_to_delete: Vec = vec![yaml_path]; + + // Add config to write list + let config_content = toml::to_string_pretty(&new_config)?; + files_to_write.push((toml_path, config_content)); + + // 3. Process all metadata files + for entry in WalkDir::new(&root) + .into_iter() + .filter_entry(|e| { + // Skip .git/.dvs directory + !e.file_name() + .to_str() + .is_some_and(|s| s == ".git" || s == ".dvs") + }) + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().is_some_and(|x| x == "dvs")) + { + let dvs_path = fs::canonicalize(entry.path())?; + + // Parse v1 metadata + let content = fs::read_to_string(&dvs_path)?; + let old_meta: V1Metadata = serde_json::from_str(&content)?; + + // Read file from storage and verify hash + let blake3 = &old_meta.blake3_checksum; + let storage_path = storage_dir.join(&blake3[..2]).join(&blake3[2..]); + let file = fs::File::open(&storage_path)?; + let computed = Hashes::from_reader(BufReader::new(file))?; + if computed.blake3 != *blake3 { + bail!( + "Hash mismatch for {}: stored file has {}, metadata claims {}", + dvs_path.display(), + computed.blake3, + blake3 + ); + } + let md5 = computed.md5; + + // Migrate metadata + let new_meta = old_meta.migrate(md5); + let relative_path = dvs_path.strip_prefix(&root)?; + let out = root.join(DEFAULT_METADATA_FOLDER_NAME).join(relative_path); + let json = serde_json::to_string(&new_meta)?; + + files_to_write.push((out, json)); + files_to_delete.push(dvs_path); + } + + let migrated_count = files_to_write.len() - 1; // -1 for config file + + // 4. 
Write all new files + let mut written: Vec = vec![]; + for (path, content) in &files_to_write { + if let Some(parent) = path.parent() { + if let Err(e) = fs::create_dir_all(parent) { + delete_files(&written); + return Err(e.into()); + } + } + if let Err(e) = fs::write(path, content) { + delete_files(&written); + return Err(e.into()); + } + written.push(path.clone()); + } + + // 5. Delete old files (only after all writes succeeded) + let mut delete_errors = vec![]; + for p in &files_to_delete { + if let Err(e) = fs::remove_file(p) { + delete_errors.push((p.display().to_string(), e)); + } + } + if !delete_errors.is_empty() { + let paths: Vec<_> = delete_errors.iter().map(|(p, _)| p.as_str()).collect(); + bail!( + "Migration completed but failed to delete {} old file(s): {}", + delete_errors.len(), + paths.join(", ") + ); + } + + Ok(migrated_count) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::testutil::create_temp_git_repo; + + fn create_v1_repo(root: &Path, storage_dir: &Path) { + // Create dvs.yaml + let yaml = format!("storage_dir: {}\n", storage_dir.display()); + fs::write(root.join("dvs.yaml"), yaml).unwrap(); + + // Create storage directory + fs::create_dir_all(storage_dir).unwrap(); + } + + fn create_v1_metadata( + dvs_dir: &Path, + relative_path: &str, + blake3: &str, + size: u64, + saved_by: &str, + ) { + let dvs_path = dvs_dir.join(format!("{}.dvs", relative_path)); + if let Some(parent) = dvs_path.parent() { + fs::create_dir_all(parent).unwrap(); + } + let meta = serde_json::json!({ + "blake3_checksum": blake3, + "size": size, + "add_time": "2024-01-01T00:00:00.000Z", + "message": null, + "saved_by": saved_by + }); + fs::write(dvs_path, serde_json::to_string(&meta).unwrap()).unwrap(); + } + + fn store_file_v1(storage_dir: &Path, blake3: &str, content: &[u8]) { + let path = storage_dir.join(&blake3[..2]).join(&blake3[2..]); + fs::create_dir_all(path.parent().unwrap()).unwrap(); + fs::write(path, content).unwrap(); + } + + #[test] + fn 
migrate_full_repo_success() { + let (_tmp, root) = create_temp_git_repo(); + let storage = root.join(".storage"); + create_v1_repo(&root, &storage); + + // Create 3 files with different content at various nesting levels + let content_root = b"root file content"; + let blake3_root = format!("{}", blake3::hash(content_root)); + store_file_v1(&storage, &blake3_root, content_root); + + let content_data = b"data file one"; + let blake3_data = format!("{}", blake3::hash(content_data)); + store_file_v1(&storage, &blake3_data, content_data); + + let content_nested = b"nested file two"; + let blake3_nested = format!("{}", blake3::hash(content_nested)); + store_file_v1(&storage, &blake3_nested, content_nested); + + // Create v1 metadata files at various locations + create_v1_metadata( + &root, + "root.txt", + &blake3_root, + content_root.len() as u64, + "user_root", + ); + create_v1_metadata( + &root, + "data/file1.txt", + &blake3_data, + content_data.len() as u64, + "user_data", + ); + create_v1_metadata( + &root, + "data/nested/file2.txt", + &blake3_nested, + content_nested.len() as u64, + "user_nested", + ); + + // Run migration + let result = migrate(&root).unwrap(); + + // 1. Returns Ok(3) - 3 metadata files migrated + assert_eq!(result, 3); + + // 2. dvs.yaml is deleted + assert!(!root.join("dvs.yaml").exists()); + + // 3. dvs.toml exists and is valid + assert!(root.join("dvs.toml").exists()); + let config = Config::find(&root).unwrap().unwrap(); + assert!(matches!(config.backend(), _)); + + // 4. All old .dvs files at original locations are deleted + assert!(!root.join("root.txt.dvs").exists()); + assert!(!root.join("data/file1.txt.dvs").exists()); + assert!(!root.join("data/nested/file2.txt.dvs").exists()); + + // 5. 
New .dvs folder contains all migrated metadata + let dvs_dir = root.join(".dvs"); + assert!(dvs_dir.join("root.txt.dvs").exists()); + assert!(dvs_dir.join("data/file1.txt.dvs").exists()); + assert!(dvs_dir.join("data/nested/file2.txt.dvs").exists()); + + // 6. Each migrated metadata has correct blake3 and md5 hashes + let meta_root: FileMetadata = + serde_json::from_str(&fs::read_to_string(dvs_dir.join("root.txt.dvs")).unwrap()) + .unwrap(); + assert_eq!(meta_root.hashes.blake3, blake3_root); + assert_eq!( + meta_root.hashes.md5, + format!("{:x}", md5::compute(content_root)) + ); + + let meta_data: FileMetadata = + serde_json::from_str(&fs::read_to_string(dvs_dir.join("data/file1.txt.dvs")).unwrap()) + .unwrap(); + assert_eq!(meta_data.hashes.blake3, blake3_data); + assert_eq!( + meta_data.hashes.md5, + format!("{:x}", md5::compute(content_data)) + ); + + let meta_nested: FileMetadata = serde_json::from_str( + &fs::read_to_string(dvs_dir.join("data/nested/file2.txt.dvs")).unwrap(), + ) + .unwrap(); + assert_eq!(meta_nested.hashes.blake3, blake3_nested); + assert_eq!( + meta_nested.hashes.md5, + format!("{:x}", md5::compute(content_nested)) + ); + } + + #[test] + fn migrate_fails_when_storage_file_missing() { + let (_tmp, root) = create_temp_git_repo(); + let storage = root.join(".storage"); + create_v1_repo(&root, &storage); + + // Create v1 metadata referencing a file that doesn't exist in storage + let fake_blake3 = "a".repeat(64); + create_v1_metadata(&root, "missing.txt", &fake_blake3, 100, "user"); + + // Migration should fail + let result = migrate(&root); + assert!(result.is_err()); + + // No partial migration: dvs.yaml still exists, dvs.toml not created + assert!(root.join("dvs.yaml").exists()); + assert!(!root.join("dvs.toml").exists()); + } + + #[test] + fn migrate_fails_when_storage_file_corrupted() { + let (_tmp, root) = create_temp_git_repo(); + let storage = root.join(".storage"); + create_v1_repo(&root, &storage); + + // Create a file and store it + 
let original_content = b"original content"; + let blake3_hash = format!("{}", blake3::hash(original_content)); + + // Store DIFFERENT content than what the hash claims + let corrupted_content = b"corrupted content"; + store_file_v1(&storage, &blake3_hash, corrupted_content); + + // Create v1 metadata with the original hash + create_v1_metadata( + &root, + "file.txt", + &blake3_hash, + original_content.len() as u64, + "user", + ); + + // Migration should fail with hash mismatch error + let result = migrate(&root); + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Hash mismatch")); + + // No partial migration: dvs.yaml still exists, dvs.toml not created + assert!(root.join("dvs.yaml").exists()); + assert!(!root.join("dvs.toml").exists()); + } +} diff --git a/dvs/src/paths.rs b/dvs/src/paths.rs index ed0e7d4..2033b41 100644 --- a/dvs/src/paths.rs +++ b/dvs/src/paths.rs @@ -7,7 +7,7 @@ use globset::Glob; use walkdir::WalkDir; pub const CONFIG_FILE_NAME: &str = "dvs.toml"; -pub const DEFAULT_FOLDER_NAME: &str = ".dvs"; +pub const DEFAULT_METADATA_FOLDER_NAME: &str = ".dvs"; /// We can pass either a glob or a list of paths to dvs to handle. 
/// This enum is here to auto-convert properly From 4d159036f40275db2cb71c0bd417d0ee4d67b58f Mon Sep 17 00:00:00 2001 From: Vincent Prouillet Date: Thu, 29 Jan 2026 12:55:09 +0100 Subject: [PATCH 2/3] Apply permissions/group --- dvs/src/config.rs | 2 +- dvs/src/migrate.rs | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/dvs/src/config.rs b/dvs/src/config.rs index 54d44c2..97304c6 100644 --- a/dvs/src/config.rs +++ b/dvs/src/config.rs @@ -19,7 +19,7 @@ pub struct Config { /// at the root of the repository /// If this option is set, dvs will use that folder name instead of `.dvs` metadata_folder_name: Option, - backend: Backend, + pub backend: Backend, } impl Config { diff --git a/dvs/src/migrate.rs b/dvs/src/migrate.rs index 1db2795..3fd28c6 100644 --- a/dvs/src/migrate.rs +++ b/dvs/src/migrate.rs @@ -7,7 +7,7 @@ use serde::Deserialize; use walkdir::WalkDir; use crate::FileMetadata; -use crate::config::Config; +use crate::config::{Backend, Config}; use crate::hashes::Hashes; use crate::paths::DEFAULT_METADATA_FOLDER_NAME; @@ -85,6 +85,7 @@ pub fn migrate(root: impl AsRef) -> Result { let old_config: V1Config = serde_yaml::from_str(&yaml_content)?; let storage_dir = old_config.storage_dir.clone(); let new_config = old_config.migrate()?; + let Backend::Local(ref backend) = new_config.backend; // Collect files to write and delete let mut files_to_write: Vec<(PathBuf, String)> = vec![]; @@ -139,19 +140,32 @@ pub fn migrate(root: impl AsRef) -> Result { let migrated_count = files_to_write.len() - 1; // -1 for config file - // 4. Write all new files + // 4. 
Write all new files and apply permissions let mut written: Vec = vec![]; + let mut dirs_created: std::collections::HashSet = std::collections::HashSet::new(); for (path, content) in &files_to_write { if let Some(parent) = path.parent() { if let Err(e) = fs::create_dir_all(parent) { delete_files(&written); return Err(e.into()); } + if dirs_created.insert(parent.to_path_buf()) { + if let Err(e) = backend.apply_perms(parent) { + delete_files(&written); + return Err(e); + } + } } if let Err(e) = fs::write(path, content) { delete_files(&written); return Err(e.into()); } + if path.extension().is_some_and(|ext| ext == "dvs") { + if let Err(e) = backend.apply_perms(path) { + delete_files(&written); + return Err(e); + } + } written.push(path.clone()); } From 1780971bc8af43f7016daadd7520ea0e37bb0783 Mon Sep 17 00:00:00 2001 From: Vincent Prouillet Date: Fri, 30 Jan 2026 14:55:04 +0100 Subject: [PATCH 3/3] Address comments --- Cargo.toml | 1 + dvs/src/migrate.rs | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 44c6cfc..81c61a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,4 +34,5 @@ jiff = "0.2.18" anyhow = "1.0.100" whoami = "2" uuid = "1.20.0" +# deprecated but ok for our usecase serde_yaml = "0.9" diff --git a/dvs/src/migrate.rs b/dvs/src/migrate.rs index 3fd28c6..ce0e06d 100644 --- a/dvs/src/migrate.rs +++ b/dvs/src/migrate.rs @@ -62,11 +62,11 @@ fn delete_files(files: &[PathBuf]) { /// 1. Convert `dvs.yaml` to `dvs.toml` /// 2. Update all `.dvs` metadata files to the v2 format (adding MD5 hashes) and save them to strings /// 3. Save all the new files -/// 3. Delete all the v1 files (config + .dvs metadata files) +/// 4. Delete all the v1 files (config + .dvs metadata files) /// /// The migration is atomic: all new files are written before any old files are deleted. /// If any write fails, all newly written files are cleaned up. 
-/// If a file in the storage doesn't match the hash from the metadata file, the process fill fail. +/// If a file in storage doesn't match the hash from the metadata file, the process will fail. pub fn migrate(root: impl AsRef) -> Result { let root = fs::canonicalize(root.as_ref())?; let yaml_path = root.join("dvs.yaml");