diff --git a/Cargo.lock b/Cargo.lock index c26ead4..f9d92dc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -213,10 +213,12 @@ dependencies = [ "anyhow", "blake3", "fs-err", + "globset", "jiff", "log", "md5", "nix", + "regex", "serde", "serde_json", "tempfile", diff --git a/Cargo.toml b/Cargo.toml index 34f4d73..b13ecf8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,3 +34,5 @@ anyhow = "1.0.100" whoami = "2" uuid = "1.20.0" zstd = "0.13.3" +globset = "0.4" +regex = "1" diff --git a/dvs/Cargo.toml b/dvs/Cargo.toml index 49ef203..696ba32 100644 --- a/dvs/Cargo.toml +++ b/dvs/Cargo.toml @@ -21,6 +21,8 @@ jiff.workspace = true anyhow.workspace = true whoami.workspace = true zstd.workspace = true +globset.workspace = true +regex.workspace = true uuid = { version = "1.20.0", features = ["v4"] } [target.'cfg(unix)'.dependencies] diff --git a/dvs/src/config.rs b/dvs/src/config.rs index ea731a2..d2d6460 100644 --- a/dvs/src/config.rs +++ b/dvs/src/config.rs @@ -5,6 +5,7 @@ use std::path::Path; use crate::backends::Backend as BackendTrait; use crate::backends::local::LocalBackend; use crate::paths::{CONFIG_FILE_NAME, DEFAULT_FOLDER_NAME, find_repo_root}; +use crate::tracking_rules::TrackingRule; use anyhow::{Context, Result}; use fs_err as fs; use serde::{Deserialize, Serialize}; @@ -72,6 +73,8 @@ pub struct Config { /// If this option is set, dvs will use that folder name instead of `.dvs` metadata_folder_name: Option, backend: Backend, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + tracking_rules: Vec, } impl Config { @@ -85,6 +88,7 @@ impl Config { compression: Compression::Zstd, metadata_folder_name: None, backend: Backend::Local(backend), + tracking_rules: Vec::new(), }) } @@ -135,6 +139,10 @@ impl Config { self.compression = compression; } + pub fn tracking_rules(&self) -> &[TrackingRule] { + &self.tracking_rules + } + pub fn backend(&self) -> &dyn BackendTrait { match &self.backend { Backend::Local(b) => b, diff --git a/dvs/src/lib.rs b/dvs/src/lib.rs index 
c4d43b5..7034597 100644
--- a/dvs/src/lib.rs
+++ b/dvs/src/lib.rs
@@ -6,6 +6,7 @@ mod gitignore;
 mod hashes;
 pub mod init;
 pub mod paths;
+pub mod tracking_rules;
 
 pub use backends::Backend;
 pub use config::Compression;
@@ -13,6 +14,7 @@ pub use file::{AddResult, FileMetadata, FileStatus, GetResult, Outcome, Status};
 pub use file::{add_files, get_files, get_status};
 pub use hashes::{HashAlg, Hashes};
 pub use paths::{DvsPaths, find_repo_root};
+pub use tracking_rules::TrackingRule;
 
 #[cfg(test)]
 pub mod testutil {
diff --git a/dvs/src/tracking_rules.rs b/dvs/src/tracking_rules.rs
new file mode 100644
index 0000000..f70c1f1
--- /dev/null
+++ b/dvs/src/tracking_rules.rs
@@ -0,0 +1,442 @@
+//! Rules that decide which files should be tracked.
+//!
+//! A [`TrackingRule`] is the serializable form stored in the config; it is
+//! validated and compiled into a [`CompiledRule`] before matching.
+
+use anyhow::{Result, bail};
+use globset::{Glob, GlobMatcher};
+use serde::{Deserialize, Serialize};
+use std::borrow::Cow;
+use std::path::Path;
+
+/// Characters that are not allowed inside an `extensions` entry.
+///
+/// Extensions are spliced into a `**/*.{a,b}` glob, so any of these would
+/// change the meaning of the generated pattern instead of matching literally.
+const EXTENSION_FORBIDDEN_CHARS: &[char] = &['*', '?', '[', ']', '{', '}', ',', '/', '\\'];
+
+/// A single tracking rule as written in the config.
+///
+/// At most one path matcher (`glob`, `extensions` or `regex`) may be set,
+/// optionally combined with `min_size`. Call [`TrackingRule::compile`] to
+/// validate the rule and obtain a matcher.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Default)]
+pub struct TrackingRule {
+    /// Optional display name; takes precedence over the auto-generated label.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub label: Option<String>,
+    /// Glob pattern matched against the relative path.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub glob: Option<String>,
+    /// File extensions to match, with or without a leading `.`.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub extensions: Option<Vec<String>>,
+    /// Regular expression matched against the relative path.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub regex: Option<String>,
+    /// Minimum file size such as `"5MB"`; see [`parse_size`] for the format.
+    #[serde(default, skip_serializing_if = "Option::is_none")]
+    pub min_size: Option<String>,
+}
+
+impl TrackingRule {
+    /// Returns the explicit label or auto-generates one from the rule's path matcher.
+    ///
+    /// When `min_size` is set, ` (>= <size>)` is appended in either case.
+    pub fn label(&self) -> String {
+        let base = if let Some(label) = &self.label {
+            label.clone()
+        } else if let Some(glob) = &self.glob {
+            format!("glob: {glob}")
+        } else if let Some(exts) = &self.extensions {
+            format!("extensions: {}", exts.join(", "))
+        } else if let Some(regex) = &self.regex {
+            format!("regex: {regex}")
+        } else if self.min_size.is_some() {
+            "min_size".to_string()
+        } else {
+            "empty rule".to_string()
+        };
+
+        if let Some(size) = &self.min_size {
+            format!("{base} (>= {size})")
+        } else {
+            base
+        }
+    }
+
+    /// Validates and compiles this rule into a `CompiledRule`.
+    ///
+    /// # Errors
+    ///
+    /// Fails when more than one path matcher is set, when neither a path
+    /// matcher nor `min_size` is set, or when the glob, extensions, regex or
+    /// `min_size` value is invalid.
+    pub fn compile(&self) -> Result<CompiledRule> {
+        let matchers = [
+            self.glob.is_some(),
+            self.extensions.is_some(),
+            self.regex.is_some(),
+        ];
+        let count = matchers.iter().filter(|&&b| b).count();
+        if count > 1 {
+            bail!(
+                "tracking rule must specify at most one of `glob`, `extensions`, or `regex`; multiple provided"
+            );
+        }
+        if count == 0 && self.min_size.is_none() {
+            bail!("tracking rule must specify at least a path matcher or `min_size`");
+        }
+
+        let path_matcher = if let Some(pattern) = &self.glob {
+            let matcher = Glob::new(pattern)
+                .map_err(|e| anyhow::anyhow!("invalid glob pattern '{pattern}': {e}"))?
+                .compile_matcher();
+            Some(PathMatcher::Glob(matcher))
+        } else if let Some(exts) = &self.extensions {
+            Some(PathMatcher::Glob(compile_extensions(exts)?))
+        } else if let Some(pattern) = &self.regex {
+            let re = regex::Regex::new(pattern)
+                .map_err(|e| anyhow::anyhow!("invalid regex '{pattern}': {e}"))?;
+            Some(PathMatcher::Regex(re))
+        } else {
+            None
+        };
+
+        let min_bytes = self.min_size.as_deref().map(parse_size).transpose()?;
+
+        Ok(CompiledRule {
+            label: self.label(),
+            path_matcher,
+            min_bytes,
+        })
+    }
+}
+
+/// Builds the matcher for an `extensions` rule.
+///
+/// A leading `.` on an entry is optional. The list must be non-empty and every
+/// entry must be a non-empty literal; otherwise the generated `**/*.{...}`
+/// glob would not mean what the rule says.
+fn compile_extensions(exts: &[String]) -> Result<GlobMatcher> {
+    if exts.is_empty() {
+        bail!("tracking rule `extensions` must list at least one extension");
+    }
+
+    let mut literals = Vec::with_capacity(exts.len());
+    for raw in exts {
+        let ext = raw.strip_prefix('.').unwrap_or(raw);
+        if ext.is_empty() {
+            bail!("tracking rule has an empty extension: '{raw}'");
+        }
+        if ext.contains(EXTENSION_FORBIDDEN_CHARS) {
+            bail!("invalid extension '{raw}': use `glob` for patterns");
+        }
+        literals.push(ext);
+    }
+
+    let pattern = format!("**/*.{{{}}}", literals.join(","));
+    let glob = Glob::new(&pattern)
+        .map_err(|e| anyhow::anyhow!("invalid extensions pattern '{pattern}': {e}"))?;
+    Ok(glob.compile_matcher())
+}
+
+/// Parses a human-readable size string into bytes.
+///
+/// Supports: `"5MB"`, `"500KB"`, `"1.5GB"`, `"100"` (raw bytes).
+/// Units are 1024-based and case-insensitive: B, KB, MB or GB.
+/// Fractional results are truncated towards zero.
+///
+/// # Errors
+///
+/// Fails on an empty string, a malformed or negative number, an unknown unit,
+/// or a size that does not fit in a `u64`.
+pub fn parse_size(s: &str) -> Result<u64> {
+    let s = s.trim();
+    if s.is_empty() {
+        bail!("empty size string");
+    }
+
+    // Split into numeric and unit parts
+    let unit_start = s.find(|c: char| c.is_ascii_alphabetic()).unwrap_or(s.len());
+    let (num_str, unit_str) = s.split_at(unit_start);
+    let num_str = num_str.trim();
+    let unit_str = unit_str.trim();
+
+    let value: f64 = num_str
+        .parse()
+        .map_err(|_| anyhow::anyhow!("invalid size number: '{num_str}'"))?;
+
+    if value < 0.0 {
+        bail!("size cannot be negative: '{s}'");
+    }
+
+    let multiplier: u64 = match unit_str.to_ascii_uppercase().as_str() {
+        "" | "B" => 1,
+        "KB" => 1024,
+        "MB" => 1024 * 1024,
+        "GB" => 1024 * 1024 * 1024,
+        _ => bail!("unknown size unit: '{unit_str}' (expected B, KB, MB, or GB)"),
+    };
+
+    // Whole numbers are multiplied as integers so that large byte counts are
+    // exact and overflow is reported instead of being silently clamped.
+    if let Ok(whole) = num_str.parse::<u64>() {
+        return whole
+            .checked_mul(multiplier)
+            .ok_or_else(|| anyhow::anyhow!("size too large: '{s}'"));
+    }
+
+    let bytes = value * multiplier as f64;
+    // `u64::MAX as f64` rounds up to 2^64, so `>=` rejects everything that a
+    // float-to-int cast would otherwise saturate.
+    if !bytes.is_finite() || bytes >= u64::MAX as f64 {
+        bail!("size too large: '{s}'");
+    }
+    Ok(bytes as u64)
+}
+
+/// Path-matching strategy of a compiled rule.
+#[derive(Debug)]
+enum PathMatcher {
+    Glob(GlobMatcher),
+    Regex(regex::Regex),
+}
+
+/// A compiled, ready-to-match tracking rule.
+#[derive(Debug)]
+pub struct CompiledRule {
+    label: String,
+    path_matcher: Option<PathMatcher>,
+    min_bytes: Option<u64>,
+}
+
+impl CompiledRule {
+    /// Returns the display label of the rule this was compiled from.
+    pub fn label(&self) -> &str {
+        &self.label
+    }
+
+    /// Returns true if the given path and file size match this rule.
+    ///
+    /// Both path and size (if configured) must match (AND logic).
+    /// Regex rules see the path with `/` separators on every platform;
+    /// globset normalises separators the same way for glob rules.
+    pub fn matches(&self, relative_path: &Path, file_size: u64) -> bool {
+        let path_matches = match &self.path_matcher {
+            Some(PathMatcher::Glob(m)) => m.is_match(relative_path),
+            Some(PathMatcher::Regex(re)) => re.is_match(&normalized_path_str(relative_path)),
+            None => true,
+        };
+
+        if !path_matches {
+            return false;
+        }
+
+        match self.min_bytes {
+            Some(min) => file_size >= min,
+            None => true,
+        }
+    }
+}
+
+/// Returns `path` as text with `/` as the only separator.
+///
+/// On Windows `\` separators are rewritten so that a single regex works on
+/// every platform; elsewhere `\` is an ordinary file-name character and the
+/// path is returned unchanged. Non-UTF-8 parts are replaced lossily.
+fn normalized_path_str(path: &Path) -> Cow<'_, str> {
+    let lossy = path.to_string_lossy();
+    if cfg!(windows) && lossy.contains('\\') {
+        Cow::Owned(lossy.replace('\\', "/"))
+    } else {
+        lossy
+    }
+}
+
+/// Compiles a slice of tracking rules into compiled rules.
+///
+/// # Errors
+///
+/// Returns the error of the first rule that fails to compile.
+pub fn compile_rules(rules: &[TrackingRule]) -> Result<Vec<CompiledRule>> {
+    rules.iter().map(TrackingRule::compile).collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::path::Path;
+
+    #[test]
+    fn parse_size_all_units() {
+        assert_eq!(parse_size("100").unwrap(), 100);
+        assert_eq!(parse_size("100B").unwrap(), 100);
+        assert_eq!(parse_size("500KB").unwrap(), 500 * 1024);
+        assert_eq!(parse_size("5MB").unwrap(), 5 * 1024 * 1024);
+        assert_eq!(parse_size("1GB").unwrap(), 1024 * 1024 * 1024);
+        assert_eq!(
+            parse_size("1.5GB").unwrap(),
+            (1.5 * 1024.0 * 1024.0 * 1024.0) as u64
+        );
+        assert_eq!(parse_size("5mb").unwrap(), 5 * 1024 * 1024);
+
+        assert!(parse_size("").is_err());
+        assert!(parse_size("MB").is_err());
+        assert!(parse_size("5XB").is_err());
+    }
+
+    #[test]
+    fn parse_size_rejects_overflow() {
+        assert_eq!(parse_size("18446744073709551615").unwrap(), u64::MAX);
+        assert!(parse_size("18446744073709551615KB").is_err());
+        assert!(parse_size("99999999999999999999GB").is_err());
+        assert!(parse_size("-1KB").is_err());
+    }
+
+    #[test]
+    fn label_generation() {
+        let r = TrackingRule {
+            glob: Some("**/*.csv".into()),
+            ..Default::default()
+        };
+        assert_eq!(r.label(), "glob: **/*.csv");
+
+        let r = TrackingRule {
+            extensions: Some(vec![".rds".into(), ".parquet".into()]),
+            ..Default::default()
+        };
+        assert_eq!(r.label(), "extensions: .rds, .parquet");
+
+        let r = TrackingRule {
+            regex: Some("^data/.*$".into()),
+            ..Default::default()
+        };
+        assert_eq!(r.label(), "regex: ^data/.*$");
+
+        let r = TrackingRule {
+            glob: Some("**/*.csv".into()),
+            min_size: Some("5MB".into()),
+            ..Default::default()
+        };
+        assert_eq!(r.label(), "glob: **/*.csv (>= 5MB)");
+
+        let r = TrackingRule {
+            label: Some("Large CSVs".into()),
+            glob: Some("**/*.csv".into()),
+            min_size: Some("1MB".into()),
+            ..Default::default()
+        };
+        assert_eq!(r.label(), "Large CSVs (>= 1MB)");
+    }
+
+    #[test]
+    fn compile_validates_matcher_count() {
+        assert!(TrackingRule::default().compile().is_err());
+
+        let r = TrackingRule {
+            glob: Some("**/*.csv".into()),
+            extensions: Some(vec![".csv".into()]),
+            ..Default::default()
+        };
+        assert!(r.compile().unwrap_err().to_string().contains("multiple"));
+    }
+
+    #[test]
+    fn extensions_must_be_non_empty_literals() {
+        let bad: [&[&str]; 5] = [&[], &[""], &["."], &["csv,txt"], &["c*"]];
+        for exts in bad {
+            let r = TrackingRule {
+                extensions: Some(exts.iter().map(|e| e.to_string()).collect()),
+                ..Default::default()
+            };
+            assert!(r.compile().is_err(), "expected {exts:?} to be rejected");
+        }
+    }
+
+    #[test]
+    fn size_only_rule() {
+        let c = TrackingRule {
+            min_size: Some("10MB".into()),
+            ..Default::default()
+        }
+        .compile()
+        .unwrap();
+        let ten_mb = 10 * 1024 * 1024;
+        assert!(c.matches(Path::new("anything.txt"), ten_mb));
+        assert!(c.matches(Path::new("data/deep/file.bin"), ten_mb + 1));
+        assert!(!c.matches(Path::new("anything.txt"), ten_mb - 1));
+    }
+
+    #[test]
+    fn each_matcher_type_works() {
+        let c = TrackingRule {
+            glob: Some("**/*.csv".into()),
+            ..Default::default()
+        }
+        .compile()
+        .unwrap();
+        assert!(c.matches(Path::new("data/file.csv"), 0));
+        assert!(!c.matches(Path::new("file.txt"), 0));
+
+        let c = TrackingRule {
+            extensions: Some(vec![".csv".into(), "rds".into()]),
+            ..Default::default()
+        }
+        .compile()
+        .unwrap();
+        assert!(c.matches(Path::new("deep/file.csv"), 0));
+        assert!(c.matches(Path::new("file.rds"), 0));
+        assert!(!c.matches(Path::new("file.txt"), 0));
+
+        let c = TrackingRule {
+            regex: Some(r"^results/.*\.(csv|tsv)$".into()),
+            ..Default::default()
+        }
+        .compile()
+        .unwrap();
+        assert!(c.matches(Path::new("results/output.csv"), 0));
+        assert!(!c.matches(Path::new("other/output.csv"), 0));
+    }
+
+    #[test]
+    fn regex_path_uses_forward_slashes() {
+        assert_eq!(normalized_path_str(Path::new("a/b.csv")), "a/b.csv");
+        if cfg!(windows) {
+            assert_eq!(normalized_path_str(Path::new(r"a\b.csv")), "a/b.csv");
+        }
+    }
+
+    #[test]
+    fn min_size_is_and_with_path() {
+        let c = TrackingRule {
+            glob: Some("**/*.csv".into()),
+            min_size: Some("1MB".into()),
+            ..Default::default()
+        }
+        .compile()
+        .unwrap();
+        let one_mb = 1024 * 1024;
+        assert!(!c.matches(Path::new("data.csv"), one_mb - 1));
+        assert!(c.matches(Path::new("data.csv"), one_mb));
+        assert!(!c.matches(Path::new("data.txt"), one_mb));
+    }
+
+    #[test]
+    fn or_across_rules() {
+        let compiled = compile_rules(&[
+            TrackingRule {
+                glob: Some("**/*.csv".into()),
+                ..Default::default()
+            },
+            TrackingRule {
+                extensions: Some(vec![".rds".into()]),
+                ..Default::default()
+            },
+        ])
+        .unwrap();
+        assert!(compiled.iter().any(|r| r.matches(Path::new("file.csv"), 0)));
+        assert!(compiled.iter().any(|r| r.matches(Path::new("file.rds"), 0)));
+        assert!(!compiled.iter().any(|r| r.matches(Path::new("file.txt"), 0)));
+    }
+}