From f222e4aea226358b9b68864d5b21cb9332d1dada Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:06:58 -0600 Subject: [PATCH 01/19] =?UTF-8?q?feat:=20Sync=20V2=20foundation=20?= =?UTF-8?q?=E2=80=94=20ops=20types,=20dedup=20helper,=20WAL=20writer/reade?= =?UTF-8?q?r?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Core building blocks for the ops-based sync pipeline: - src/pg_sync/ops.rs: Op enum (Set, Remove, Add, Delete, QueryOpSet), OpsRow, OpsBatch, EntityOps, SyncMeta, BitdexOps table SQL - src/pg_sync/op_dedup.rs: Shared dedup helper — LIFO per (entity_id, field), add/remove cancellation, delete absorption, queryOpSet last-wins - src/ops_wal.rs: Append-only WAL with CRC32 integrity, WalWriter (append+fsync), WalReader (cursor-based tail, partial record handling, CRC skip) Also fixes pre-existing compile error in copy_queries.rs tests (missing width/height fields on CopyImageRow constructors). 30 tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/lib.rs | 2 + src/ops_wal.rs | 410 ++++++++++++++++++++++++++++++++++++ src/pg_sync/copy_queries.rs | 3 + src/pg_sync/mod.rs | 2 + src/pg_sync/op_dedup.rs | 285 +++++++++++++++++++++++++ src/pg_sync/ops.rs | 282 +++++++++++++++++++++++++ 6 files changed, 984 insertions(+) create mode 100644 src/ops_wal.rs create mode 100644 src/pg_sync/op_dedup.rs create mode 100644 src/pg_sync/ops.rs diff --git a/src/lib.rs b/src/lib.rs index 15657ead..d660876f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,8 @@ pub mod bitmap_fs; pub mod bound_store; pub mod bucket_diff_log; +#[cfg(feature = "pg-sync")] +pub mod ops_wal; pub mod cache; pub mod capture; pub mod concurrency; diff --git a/src/ops_wal.rs b/src/ops_wal.rs new file mode 100644 index 00000000..2b9c63a0 --- /dev/null +++ b/src/ops_wal.rs @@ -0,0 +1,410 @@ +//! Ops WAL — append-only log for sync operations. +//! +//! Format per record: +//! [4 bytes: payload_len (u32 LE)] +//! 
[8 bytes: entity_id (i64 LE)] +//! [payload_len bytes: ops JSONB] +//! [4 bytes: CRC32 of entity_id + ops] +//! +//! The writer appends records and fsyncs. The reader tails the file, +//! reading batches of records and tracking a byte-offset cursor. +//! Partial records at EOF are skipped (crash recovery). + +use std::fs::{self, File, OpenOptions}; +use std::io::{self, Read, Write}; +use std::path::{Path, PathBuf}; + +use crate::pg_sync::ops::{EntityOps, Op}; + +const HEADER_SIZE: usize = 4 + 8; // payload_len + entity_id +const CRC_SIZE: usize = 4; + +/// WAL writer — appends ops records to a file with CRC32 integrity. +pub struct WalWriter { + path: PathBuf, +} + +impl WalWriter { + pub fn new(path: impl Into) -> Self { + Self { path: path.into() } + } + + /// Append a batch of entity ops to the WAL. Writes all records and fsyncs. + /// Returns the number of bytes written. + pub fn append_batch(&self, batch: &[EntityOps]) -> io::Result { + if batch.is_empty() { + return Ok(0); + } + + let mut file = OpenOptions::new() + .create(true) + .append(true) + .open(&self.path)?; + + let mut total_bytes = 0u64; + for entry in batch { + let ops_json = serde_json::to_vec(&entry.ops) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + let payload_len = ops_json.len() as u32; + let entity_id_bytes = entry.entity_id.to_le_bytes(); + + // CRC covers entity_id + ops (not the length prefix) + let mut crc_input = Vec::with_capacity(8 + ops_json.len()); + crc_input.extend_from_slice(&entity_id_bytes); + crc_input.extend_from_slice(&ops_json); + let crc = crc32fast::hash(&crc_input); + + // Write: [len][entity_id][ops][crc] + file.write_all(&payload_len.to_le_bytes())?; + file.write_all(&entity_id_bytes)?; + file.write_all(&ops_json)?; + file.write_all(&crc.to_le_bytes())?; + + total_bytes += (HEADER_SIZE + ops_json.len() + CRC_SIZE) as u64; + } + + file.sync_all()?; + Ok(total_bytes) + } + + /// Get the file path. 
+ pub fn path(&self) -> &Path { + &self.path + } + + /// Get current file size (0 if file doesn't exist). + pub fn file_size(&self) -> u64 { + fs::metadata(&self.path).map(|m| m.len()).unwrap_or(0) + } +} + +/// WAL reader — reads ops records from a file starting at a byte offset. +pub struct WalReader { + path: PathBuf, + /// Current read position (byte offset into the file) + cursor: u64, +} + +/// Result of reading a batch from the WAL. +pub struct WalBatch { + /// The ops read from the WAL + pub entries: Vec, + /// New cursor position after this batch + pub new_cursor: u64, + /// Number of bytes read + pub bytes_read: u64, + /// Number of records skipped due to CRC failure + pub crc_failures: u64, +} + +impl WalReader { + pub fn new(path: impl Into, cursor: u64) -> Self { + Self { + path: path.into(), + cursor, + } + } + + /// Read up to `max_records` from the WAL starting at the current cursor. + /// Advances the cursor past successfully read records. + /// Stops at EOF or on partial/corrupted records. 
+ pub fn read_batch(&mut self, max_records: usize) -> io::Result { + if !self.path.exists() { + return Ok(WalBatch { + entries: Vec::new(), + new_cursor: self.cursor, + bytes_read: 0, + crc_failures: 0, + }); + } + + let data = fs::read(&self.path)?; + let mut entries = Vec::new(); + let mut pos = self.cursor as usize; + let mut crc_failures = 0u64; + let start_pos = pos; + + while entries.len() < max_records && pos + HEADER_SIZE <= data.len() { + // Read header + let payload_len = + u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize; + let entity_id = + i64::from_le_bytes(data[pos + 4..pos + 12].try_into().unwrap()); + + let record_end = pos + HEADER_SIZE + payload_len + CRC_SIZE; + if record_end > data.len() { + // Truncated record at EOF — stop here, don't advance cursor + break; + } + + // Verify CRC + let crc_input = &data[pos + 4..pos + HEADER_SIZE + payload_len]; // entity_id + ops + let stored_crc = u32::from_le_bytes( + data[pos + HEADER_SIZE + payload_len..record_end] + .try_into() + .unwrap(), + ); + let computed_crc = crc32fast::hash(crc_input); + + if stored_crc != computed_crc { + // CRC failure — skip this record + crc_failures += 1; + pos = record_end; + continue; + } + + // Parse ops JSON + let ops_data = &data[pos + HEADER_SIZE..pos + HEADER_SIZE + payload_len]; + match serde_json::from_slice::>(ops_data) { + Ok(ops) => { + entries.push(EntityOps { entity_id, ops }); + } + Err(_) => { + // Invalid JSON — skip + crc_failures += 1; + } + } + + pos = record_end; + } + + let bytes_read = (pos - start_pos) as u64; + self.cursor = pos as u64; + + Ok(WalBatch { + entries, + new_cursor: self.cursor, + bytes_read, + crc_failures, + }) + } + + /// Get the current cursor position. + pub fn cursor(&self) -> u64 { + self.cursor + } + + /// Set the cursor position (for recovery from persisted state). + pub fn set_cursor(&mut self, cursor: u64) { + self.cursor = cursor; + } + + /// Check if there are more records to read (cursor < file size). 
+ pub fn has_more(&self) -> bool { + let file_size = fs::metadata(&self.path).map(|m| m.len()).unwrap_or(0); + self.cursor < file_size + } +} + +/// Delete a WAL file. +pub fn remove_wal(path: &Path) -> io::Result<()> { + if path.exists() { + fs::remove_file(path)?; + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + use tempfile::TempDir; + + fn make_ops(entity_id: i64, ops: Vec) -> EntityOps { + EntityOps { entity_id, ops } + } + + #[test] + fn test_write_read_roundtrip() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + + let writer = WalWriter::new(&wal_path); + let batch = vec![ + make_ops(1, vec![Op::Set { field: "nsfwLevel".into(), value: json!(16) }]), + make_ops(2, vec![Op::Add { field: "tagIds".into(), value: json!(42) }]), + make_ops(3, vec![Op::Delete]), + ]; + let bytes = writer.append_batch(&batch).unwrap(); + assert!(bytes > 0); + + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + assert_eq!(result.entries.len(), 3); + assert_eq!(result.entries[0].entity_id, 1); + assert_eq!(result.entries[1].entity_id, 2); + assert_eq!(result.entries[2].entity_id, 3); + assert_eq!(result.crc_failures, 0); + assert!(!reader.has_more()); + } + + #[test] + fn test_multiple_appends() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + let writer = WalWriter::new(&wal_path); + + // First batch + writer.append_batch(&[ + make_ops(1, vec![Op::Set { field: "a".into(), value: json!(1) }]), + ]).unwrap(); + + // Second batch + writer.append_batch(&[ + make_ops(2, vec![Op::Set { field: "b".into(), value: json!(2) }]), + ]).unwrap(); + + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + assert_eq!(result.entries.len(), 2); + assert_eq!(result.entries[0].entity_id, 1); + assert_eq!(result.entries[1].entity_id, 2); + } + + #[test] + fn test_cursor_resume() { + let dir = 
TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + let writer = WalWriter::new(&wal_path); + + writer.append_batch(&[ + make_ops(1, vec![Op::Set { field: "a".into(), value: json!(1) }]), + make_ops(2, vec![Op::Set { field: "b".into(), value: json!(2) }]), + make_ops(3, vec![Op::Set { field: "c".into(), value: json!(3) }]), + ]).unwrap(); + + // Read first 2 + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(2).unwrap(); + assert_eq!(result.entries.len(), 2); + let saved_cursor = reader.cursor(); + + // Resume from cursor — should get the 3rd + let mut reader2 = WalReader::new(&wal_path, saved_cursor); + let result2 = reader2.read_batch(100).unwrap(); + assert_eq!(result2.entries.len(), 1); + assert_eq!(result2.entries[0].entity_id, 3); + assert!(!reader2.has_more()); + } + + #[test] + fn test_truncated_record_at_eof() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + let writer = WalWriter::new(&wal_path); + + writer.append_batch(&[ + make_ops(1, vec![Op::Set { field: "a".into(), value: json!(1) }]), + ]).unwrap(); + + // Append garbage (partial record) + let mut file = OpenOptions::new().append(true).open(&wal_path).unwrap(); + file.write_all(&[0u8; 6]).unwrap(); // Too short to be a valid header+payload + + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + // Should read the valid record and stop at the truncated one + assert_eq!(result.entries.len(), 1); + assert_eq!(result.crc_failures, 0); + } + + #[test] + fn test_corrupted_crc_skipped() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + let writer = WalWriter::new(&wal_path); + + writer.append_batch(&[ + make_ops(1, vec![Op::Set { field: "a".into(), value: json!(1) }]), + make_ops(2, vec![Op::Set { field: "b".into(), value: json!(2) }]), + ]).unwrap(); + + // Corrupt the CRC of the first record + let mut data = 
fs::read(&wal_path).unwrap(); + // First record: header(12) + ops_json + crc(4) + // Find where the CRC is for the first record + let payload_len = u32::from_le_bytes(data[0..4].try_into().unwrap()) as usize; + let crc_offset = HEADER_SIZE + payload_len; + data[crc_offset] ^= 0xFF; // Flip bits in CRC + fs::write(&wal_path, &data).unwrap(); + + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + // First record should be skipped (CRC failure), second should be read + assert_eq!(result.entries.len(), 1); + assert_eq!(result.entries[0].entity_id, 2); + assert_eq!(result.crc_failures, 1); + } + + #[test] + fn test_empty_file() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + assert_eq!(result.entries.len(), 0); + assert!(!reader.has_more()); + } + + #[test] + fn test_query_op_set_roundtrip() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + let writer = WalWriter::new(&wal_path); + + writer.append_batch(&[make_ops(456, vec![ + Op::QueryOpSet { + query: "modelVersionIds eq 456".into(), + ops: vec![ + Op::Remove { field: "baseModel".into(), value: json!("SD 1.5") }, + Op::Set { field: "baseModel".into(), value: json!("SDXL") }, + ], + }, + ])]).unwrap(); + + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + assert_eq!(result.entries.len(), 1); + assert_eq!(result.entries[0].entity_id, 456); + match &result.entries[0].ops[0] { + Op::QueryOpSet { query, ops } => { + assert_eq!(query, "modelVersionIds eq 456"); + assert_eq!(ops.len(), 2); + } + _ => panic!("Expected QueryOpSet"), + } + } + + #[test] + fn test_file_size_tracking() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + let writer = WalWriter::new(&wal_path); + + assert_eq!(writer.file_size(), 0); + + 
writer.append_batch(&[ + make_ops(1, vec![Op::Delete]), + ]).unwrap(); + + assert!(writer.file_size() > 0); + } + + #[test] + fn test_remove_wal() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("test.wal"); + + let writer = WalWriter::new(&wal_path); + writer.append_batch(&[make_ops(1, vec![Op::Delete])]).unwrap(); + assert!(wal_path.exists()); + + remove_wal(&wal_path).unwrap(); + assert!(!wal_path.exists()); + + // Remove non-existent is ok + remove_wal(&wal_path).unwrap(); + } +} diff --git a/src/pg_sync/copy_queries.rs b/src/pg_sync/copy_queries.rs index 36bb568b..415e8062 100644 --- a/src/pg_sync/copy_queries.rs +++ b/src/pg_sync/copy_queries.rs @@ -723,6 +723,7 @@ mod tests { flags: (1 << 13), image_type: String::new(), user_id: 1, blocked_for: None, scanned_at_secs: None, created_at_secs: None, post_id: None, + width: None, height: None, published_at_secs: None, availability: String::new(), posted_to_id: None, }; assert!(row.has_meta()); @@ -739,6 +740,7 @@ mod tests { flags: (1 << 14), image_type: String::new(), user_id: 1, blocked_for: None, scanned_at_secs: None, created_at_secs: None, post_id: None, + width: None, height: None, published_at_secs: None, availability: String::new(), posted_to_id: None, }; assert!(row.on_site()); @@ -754,6 +756,7 @@ mod tests { scanned_at_secs: Some(100), created_at_secs: Some(200), published_at_secs: Some(150), + width: None, height: None, availability: String::new(), posted_to_id: None, post_id: None, }; assert_eq!(row.sort_at_secs(), 200); diff --git a/src/pg_sync/mod.rs b/src/pg_sync/mod.rs index 839aa4a3..18be2a74 100644 --- a/src/pg_sync/mod.rs +++ b/src/pg_sync/mod.rs @@ -12,6 +12,8 @@ pub mod config; pub mod copy_queries; pub mod copy_streams; pub mod metrics_poller; +pub mod op_dedup; +pub mod ops; pub mod outbox_poller; pub mod progress; pub mod queries; diff --git a/src/pg_sync/op_dedup.rs b/src/pg_sync/op_dedup.rs new file mode 100644 index 00000000..17da67a9 --- /dev/null +++ 
b/src/pg_sync/op_dedup.rs @@ -0,0 +1,285 @@ +//! Op deduplication and compression. +//! +//! Shared helper used by both pg-sync (before sending) and the WAL reader +//! (before applying). Two layers of dedup catch duplicates at both stages. +//! +//! Rules: +//! - LIFO per (entity_id, field): last op wins for set/remove pairs +//! - Add/remove cancellation: add X then remove X = net zero, dropped +//! - QueryOpSet dedup: by (entity_id, query string), last wins +//! - Delete absorbs all prior ops for the same entity_id + +use std::collections::HashMap; + +use super::ops::{EntityOps, Op}; + +/// Deduplicate a batch of entity ops in-place. +/// +/// Processes ops in order (oldest first), applying LIFO semantics: +/// for each (entity_id, field), only the last op survives. +/// Add/remove cancellation eliminates net-zero multi-value ops. +/// A delete op absorbs all prior ops for that entity. +pub fn dedup_ops(batch: &mut Vec) { + // Phase 1: Merge all ops per entity_id + let mut entity_map: HashMap> = HashMap::new(); + for entry in batch.drain(..) { + entity_map + .entry(entry.entity_id) + .or_default() + .extend(entry.ops); + } + + // Phase 2: Dedup ops within each entity + for (_entity_id, ops) in &mut entity_map { + dedup_entity_ops(ops); + } + + // Phase 3: Rebuild batch, dropping empty entries + *batch = entity_map + .into_iter() + .filter(|(_, ops)| !ops.is_empty()) + .map(|(entity_id, ops)| EntityOps { entity_id, ops }) + .collect(); +} + +/// Dedup ops for a single entity. Mutates the vec in place. 
+fn dedup_entity_ops(ops: &mut Vec) { + if ops.is_empty() { + return; + } + + // If there's a Delete, it absorbs everything — only keep the delete + if ops.iter().any(|op| matches!(op, Op::Delete)) { + ops.clear(); + ops.push(Op::Delete); + return; + } + + // First pass: collect all ops, tracking which fields have Set ops + let mut all_ops: Vec = ops.drain(..).collect(); + let mut set_fields: std::collections::HashSet = std::collections::HashSet::new(); + for op in &all_ops { + if let Op::Set { field, .. } = op { + set_fields.insert(field.clone()); + } + } + + // LIFO for set/remove on scalar fields (paired with Set = old value cleanup) + let mut last_set: HashMap = HashMap::new(); + let mut last_remove: HashMap = HashMap::new(); + + // Track add/remove for multi-value fields (net operations) + // Key: (field, value_as_string), Value: net count (+1 for add, -1 for remove) + let mut multi_value_net: HashMap<(String, String), i64> = HashMap::new(); + + // Track queryOpSet by query string (last wins) + let mut query_ops: HashMap> = HashMap::new(); + + for op in all_ops { + match op { + Op::Set { ref field, ref value } => { + last_set.insert(field.clone(), value.clone()); + } + Op::Remove { ref field, ref value } => { + if set_fields.contains(field) { + // Scalar field: this remove is paired with a set (old value cleanup) + last_remove.insert(field.clone(), value.clone()); + } else { + // Multi-value field: track net operations + let key = (field.clone(), value.to_string()); + *multi_value_net.entry(key).or_insert(0) -= 1; + } + } + Op::Add { ref field, ref value } => { + let key = (field.clone(), value.to_string()); + *multi_value_net.entry(key).or_insert(0) += 1; + } + Op::QueryOpSet { ref query, ops: ref nested_ops } => { + query_ops.insert(query.clone(), nested_ops.clone()); + } + Op::Delete => unreachable!("handled above"), + } + } + + // Rebuild: remove ops first, then set ops (order matters for bitmap updates) + for (field, value) in &last_remove { + 
ops.push(Op::Remove { + field: field.clone(), + value: value.clone(), + }); + } + + for (field, value) in last_set { + ops.push(Op::Set { field, value }); + } + + // Multi-value: emit net operations + for ((field, value_str), net) in multi_value_net { + if net == 0 { + continue; // Cancelled out + } + let value: serde_json::Value = serde_json::from_str(&value_str) + .unwrap_or(serde_json::Value::String(value_str)); + if net > 0 { + ops.push(Op::Add { field, value }); + } else { + ops.push(Op::Remove { field, value }); + } + } + + // QueryOpSets: last query string wins + for (query, nested) in query_ops { + ops.push(Op::QueryOpSet { query, ops: nested }); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + fn entity(id: i64, ops: Vec) -> EntityOps { + EntityOps { entity_id: id, ops } + } + + #[test] + fn test_lifo_set_same_field() { + let mut batch = vec![ + entity(1, vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(8) }, + ]), + entity(1, vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + ]), + ]; + dedup_ops(&mut batch); + assert_eq!(batch.len(), 1); + let ops = &batch[0].ops; + // Last set wins + let set_op = ops.iter().find(|op| matches!(op, Op::Set { field, .. } if field == "nsfwLevel")).unwrap(); + if let Op::Set { value, .. 
} = set_op { + assert_eq!(*value, json!(16)); + } + } + + #[test] + fn test_different_fields_preserved() { + let mut batch = vec![entity(1, vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + Op::Set { field: "type".into(), value: json!("video") }, + ])]; + dedup_ops(&mut batch); + assert_eq!(batch[0].ops.len(), 2); + } + + #[test] + fn test_add_remove_cancellation() { + let mut batch = vec![entity(1, vec![ + Op::Add { field: "tagIds".into(), value: json!(42) }, + Op::Remove { field: "tagIds".into(), value: json!(42) }, + ])]; + dedup_ops(&mut batch); + // Net zero — entity should be dropped entirely + assert!(batch.is_empty() || batch[0].ops.is_empty()); + } + + #[test] + fn test_add_survives_when_no_cancel() { + let mut batch = vec![entity(1, vec![ + Op::Add { field: "tagIds".into(), value: json!(42) }, + Op::Add { field: "tagIds".into(), value: json!(99) }, + ])]; + dedup_ops(&mut batch); + assert_eq!(batch.len(), 1); + let adds: Vec<_> = batch[0].ops.iter() + .filter(|op| matches!(op, Op::Add { .. 
})) + .collect(); + assert_eq!(adds.len(), 2); + } + + #[test] + fn test_delete_absorbs_all() { + let mut batch = vec![ + entity(1, vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + Op::Add { field: "tagIds".into(), value: json!(42) }, + ]), + entity(1, vec![Op::Delete]), + ]; + dedup_ops(&mut batch); + assert_eq!(batch.len(), 1); + assert_eq!(batch[0].ops.len(), 1); + assert!(matches!(&batch[0].ops[0], Op::Delete)); + } + + #[test] + fn test_different_entities_independent() { + let mut batch = vec![ + entity(1, vec![Op::Set { field: "nsfwLevel".into(), value: json!(16) }]), + entity(2, vec![Op::Set { field: "nsfwLevel".into(), value: json!(32) }]), + ]; + dedup_ops(&mut batch); + assert_eq!(batch.len(), 2); + } + + #[test] + fn test_query_op_set_last_wins() { + let mut batch = vec![entity(456, vec![ + Op::QueryOpSet { + query: "modelVersionIds eq 456".into(), + ops: vec![Op::Set { field: "baseModel".into(), value: json!("SD 1.5") }], + }, + Op::QueryOpSet { + query: "modelVersionIds eq 456".into(), + ops: vec![Op::Set { field: "baseModel".into(), value: json!("SDXL") }], + }, + ])]; + dedup_ops(&mut batch); + let qops: Vec<_> = batch[0].ops.iter() + .filter(|op| matches!(op, Op::QueryOpSet { .. })) + .collect(); + assert_eq!(qops.len(), 1); + if let Op::QueryOpSet { ops, .. } = &qops[0] { + if let Op::Set { value, .. } = &ops[0] { + assert_eq!(*value, json!("SDXL")); + } + } + } + + #[test] + fn test_remove_set_pair_preserved() { + // An update: remove old value, set new value — both should survive + let mut batch = vec![entity(1, vec![ + Op::Remove { field: "nsfwLevel".into(), value: json!(8) }, + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + ])]; + dedup_ops(&mut batch); + assert_eq!(batch.len(), 1); + let has_remove = batch[0].ops.iter().any(|op| matches!(op, Op::Remove { field, .. } if field == "nsfwLevel")); + let has_set = batch[0].ops.iter().any(|op| matches!(op, Op::Set { field, .. 
} if field == "nsfwLevel")); + assert!(has_remove, "remove should survive"); + assert!(has_set, "set should survive"); + } + + #[test] + fn test_empty_batch() { + let mut batch: Vec = vec![]; + dedup_ops(&mut batch); + assert!(batch.is_empty()); + } + + #[test] + fn test_multiple_adds_same_value_collapse() { + // Adding tag 42 three times should still produce one add + let mut batch = vec![entity(1, vec![ + Op::Add { field: "tagIds".into(), value: json!(42) }, + Op::Add { field: "tagIds".into(), value: json!(42) }, + Op::Add { field: "tagIds".into(), value: json!(42) }, + ])]; + dedup_ops(&mut batch); + let adds: Vec<_> = batch[0].ops.iter() + .filter(|op| matches!(op, Op::Add { field, .. } if field == "tagIds")) + .collect(); + assert_eq!(adds.len(), 1); + } +} diff --git a/src/pg_sync/ops.rs b/src/pg_sync/ops.rs new file mode 100644 index 00000000..94ccef04 --- /dev/null +++ b/src/pg_sync/ops.rs @@ -0,0 +1,282 @@ +//! V2 ops data types for the ops-based sync pipeline. +//! +//! Ops are self-contained mutations: each carries the field name, old value (for removes), +//! and new value (for sets). This eliminates docstore reads on the write path. +//! +//! Op types: +//! - `set`: Set a scalar/sort field to a new value +//! - `remove`: Clear a slot from a field's bitmap (carries old value) +//! - `add`: Add a value to a multi-value field (tags, tools, etc.) +//! - `delete`: Delete a document (clears all bitmaps + alive bit) +//! - `queryOpSet`: Resolve slots via a BitDex query, apply nested ops to all matches + +use serde::{Deserialize, Serialize}; + +/// A single operation within an ops array. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(tag = "op")] +pub enum Op { + /// Set a field to a value. For filter fields, sets the bit in the value's bitmap. + /// For sort fields, decomposes to bit layers. + #[serde(rename = "set")] + Set { + field: String, + value: serde_json::Value, + }, + + /// Remove a slot from a field's bitmap (old value). 
Used in remove/set pairs + /// for field changes: remove old value, then set new value. + #[serde(rename = "remove")] + Remove { + field: String, + value: serde_json::Value, + }, + + /// Add a value to a multi-value field (e.g., tagIds, toolIds). + /// Used for join-table INSERTs. + #[serde(rename = "add")] + Add { + field: String, + value: serde_json::Value, + }, + + /// Delete a document. Clears all filter/sort bitmap bits + alive bit. + /// Requires a docstore read to determine which bitmaps to clear. + #[serde(rename = "delete")] + Delete, + + /// Query-resolved bulk operation. Resolves slots via a BitDex query string, + /// then applies the nested ops to all matching slots. + /// Used for fan-out tables (ModelVersion, Post, Model). + #[serde(rename = "queryOpSet")] + QueryOpSet { + /// BitDex query string (e.g., "modelVersionIds eq 456") + query: String, + /// Ops to apply to all slots matching the query + ops: Vec, + }, +} + +/// A row from the BitdexOps table. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OpsRow { + /// Auto-incrementing ID (used as cursor position) + pub id: i64, + /// The entity (image) ID this op targets. For queryOpSet, this is the + /// source entity ID (e.g., ModelVersion ID, Post ID). + pub entity_id: i64, + /// Array of operations to apply + pub ops: Vec, +} + +/// A batch of ops sent to the BitDex /ops endpoint. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OpsBatch { + /// Per-entity ops + pub ops: Vec, + /// Optional sync source metadata (cursor position, lag, etc.) + #[serde(skip_serializing_if = "Option::is_none")] + pub meta: Option, +} + +/// Ops for a single entity within a batch. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct EntityOps { + /// The entity (image) ID + pub entity_id: i64, + /// Operations to apply + pub ops: Vec, +} + +/// Sync source metadata, bundled with ops payloads. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SyncMeta { + /// Sync source identifier (e.g., "pg-sync-default", "clickhouse") + pub source: String, + /// Current cursor position in the ops table + #[serde(skip_serializing_if = "Option::is_none")] + pub cursor: Option, + /// Max ID in the ops table (for lag calculation) + #[serde(skip_serializing_if = "Option::is_none")] + pub max_id: Option, + /// Number of rows behind (max_id - cursor) + #[serde(skip_serializing_if = "Option::is_none")] + pub lag_rows: Option, +} + +/// SQL for creating the BitdexOps table and index. +pub const SETUP_OPS_SQL: &str = r#" +CREATE TABLE IF NOT EXISTS "BitdexOps" ( + id BIGSERIAL PRIMARY KEY, + entity_id BIGINT NOT NULL, + ops JSONB NOT NULL, + created_at TIMESTAMPTZ DEFAULT now() +); + +CREATE INDEX IF NOT EXISTS idx_bitdex_ops_id ON "BitdexOps" (id); +"#; + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_set_op_roundtrip() { + let op = Op::Set { + field: "nsfwLevel".into(), + value: json!(16), + }; + let json = serde_json::to_string(&op).unwrap(); + let parsed: Op = serde_json::from_str(&json).unwrap(); + assert_eq!(op, parsed); + } + + #[test] + fn test_remove_op_roundtrip() { + let op = Op::Remove { + field: "nsfwLevel".into(), + value: json!(8), + }; + let json = serde_json::to_string(&op).unwrap(); + let parsed: Op = serde_json::from_str(&json).unwrap(); + assert_eq!(op, parsed); + } + + #[test] + fn test_add_op_roundtrip() { + let op = Op::Add { + field: "tagIds".into(), + value: json!(42), + }; + let json = serde_json::to_string(&op).unwrap(); + let parsed: Op = serde_json::from_str(&json).unwrap(); + assert_eq!(op, parsed); + } + + #[test] + fn test_delete_op_roundtrip() { + let op = Op::Delete; + let json = serde_json::to_string(&op).unwrap(); + let parsed: Op = serde_json::from_str(&json).unwrap(); + assert_eq!(op, parsed); + } + + #[test] + fn test_query_op_set_roundtrip() { + let op = Op::QueryOpSet { + query: 
"modelVersionIds eq 456".into(), + ops: vec![ + Op::Remove { + field: "baseModel".into(), + value: json!("SD 1.5"), + }, + Op::Set { + field: "baseModel".into(), + value: json!("SDXL"), + }, + ], + }; + let json = serde_json::to_string(&op).unwrap(); + let parsed: Op = serde_json::from_str(&json).unwrap(); + assert_eq!(op, parsed); + } + + #[test] + fn test_ops_array_from_json() { + let json = json!([ + {"op": "remove", "field": "nsfwLevel", "value": 8}, + {"op": "set", "field": "nsfwLevel", "value": 16}, + {"op": "add", "field": "tagIds", "value": 42}, + {"op": "delete"} + ]); + let ops: Vec = serde_json::from_value(json).unwrap(); + assert_eq!(ops.len(), 4); + assert!(matches!(&ops[0], Op::Remove { field, .. } if field == "nsfwLevel")); + assert!(matches!(&ops[1], Op::Set { field, .. } if field == "nsfwLevel")); + assert!(matches!(&ops[2], Op::Add { field, .. } if field == "tagIds")); + assert!(matches!(&ops[3], Op::Delete)); + } + + #[test] + fn test_ops_batch_with_meta() { + let batch = OpsBatch { + ops: vec![EntityOps { + entity_id: 123, + ops: vec![Op::Set { + field: "nsfwLevel".into(), + value: json!(16), + }], + }], + meta: Some(SyncMeta { + source: "pg-sync-default".into(), + cursor: Some(420_000_000), + max_id: Some(500_000_000), + lag_rows: Some(80_000_000), + }), + }; + let json = serde_json::to_string(&batch).unwrap(); + let parsed: OpsBatch = serde_json::from_str(&json).unwrap(); + assert_eq!(parsed.ops.len(), 1); + assert_eq!(parsed.ops[0].entity_id, 123); + assert!(parsed.meta.is_some()); + assert_eq!(parsed.meta.unwrap().source, "pg-sync-default"); + } + + #[test] + fn test_ops_batch_without_meta() { + let batch = OpsBatch { + ops: vec![], + meta: None, + }; + let json = serde_json::to_string(&batch).unwrap(); + assert!(!json.contains("meta")); + } + + #[test] + fn test_image_insert_ops() { + // Simulates what an Image INSERT trigger would produce + let ops: Vec = vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(1) }, + Op::Set { field: 
"type".into(), value: json!("image") }, + Op::Set { field: "userId".into(), value: json!(12345) }, + Op::Set { field: "postId".into(), value: json!(67890) }, + Op::Set { field: "existedAt".into(), value: json!(1711234567) }, + ]; + let json = serde_json::to_value(&ops).unwrap(); + let parsed: Vec = serde_json::from_value(json).unwrap(); + assert_eq!(parsed.len(), 5); + } + + #[test] + fn test_image_update_ops_with_old_values() { + // Simulates Image UPDATE: nsfwLevel 8→16 + let ops: Vec = vec![ + Op::Remove { field: "nsfwLevel".into(), value: json!(8) }, + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + ]; + let json = serde_json::to_value(&ops).unwrap(); + let parsed: Vec = serde_json::from_value(json).unwrap(); + assert_eq!(parsed.len(), 2); + } + + #[test] + fn test_query_op_set_with_in_query() { + // Model POI change: fan-out to all model versions + let op = Op::QueryOpSet { + query: "modelVersionIds in [101, 102, 103]".into(), + ops: vec![Op::Set { + field: "poi".into(), + value: json!(true), + }], + }; + let json = serde_json::to_string(&op).unwrap(); + let parsed: Op = serde_json::from_str(&json).unwrap(); + if let Op::QueryOpSet { query, ops } = parsed { + assert!(query.contains("in [101")); + assert_eq!(ops.len(), 1); + } else { + panic!("Expected QueryOpSet"); + } + } +} From 1470f79934ab92ee21f1cc3b5e02d9e06e3224bd Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:16:16 -0600 Subject: [PATCH 02/19] feat: POST /ops endpoint + GET /sync-lag for Sync V2 WAL-backed ops ingestion endpoint: - POST /api/indexes/{name}/ops accepts OpsBatch (ops + sync meta) - Appends to WAL file via WalWriter, returns 200 only after fsync - Lazy WAL writer init (created on first POST) - Stores latest SyncMeta per source for lag monitoring Sync lag endpoint: - GET /api/internal/sync-lag returns latest metadata from all sync sources - Supports cursor position, max_id, lag_rows per source Both endpoints compile-gated behind pg-sync feature with no-op 
fallbacks. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/server.rs | 109 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/src/server.rs b/src/server.rs index a72435c9..a75974c7 100644 --- a/src/server.rs +++ b/src/server.rs @@ -290,6 +290,12 @@ struct AppState { metrics_bitmap_memory: AtomicBool, metrics_eviction_stats: AtomicBool, metrics_boundstore_disk: AtomicBool, + /// WAL writer for V2 ops endpoint. Created lazily on first ops POST. + #[cfg(feature = "pg-sync")] + ops_wal: Mutex>, + /// Latest sync source metadata (cursor, lag) keyed by source name. + #[cfg(feature = "pg-sync")] + sync_meta: Mutex>, } type SharedState = Arc; @@ -991,6 +997,10 @@ impl BitdexServer { metrics_bitmap_memory: AtomicBool::new(true), metrics_eviction_stats: AtomicBool::new(true), metrics_boundstore_disk: AtomicBool::new(true), + #[cfg(feature = "pg-sync")] + ops_wal: Mutex::new(None), + #[cfg(feature = "pg-sync")] + sync_meta: Mutex::new(std::collections::HashMap::new()), }); // Try to restore an existing index from disk @@ -1068,6 +1078,8 @@ impl BitdexServer { .route("/debug/heap-dump", axum::routing::post(handle_heap_dump)) .route("/api/formats", get(handle_list_formats)) .route("/api/internal/pgsync-metrics", post(handle_pgsync_metrics)) + .route("/api/indexes/{name}/ops", post(handle_ops)) + .route("/api/internal/sync-lag", get(handle_sync_lag)) .route("/metrics", get(handle_metrics)) .route("/", get(handle_ui)) .with_state(Arc::clone(&state)); @@ -4156,6 +4168,103 @@ async fn handle_pgsync_metrics( StatusCode::NO_CONTENT } +/// POST /api/indexes/{name}/ops — Accept a batch of sync ops, append to WAL. +/// Returns 200 only after all records are written and fsynced. 
+#[cfg(feature = "pg-sync")] +async fn handle_ops( + State(state): State, + AxumPath(name): AxumPath, + Json(batch): Json, +) -> impl IntoResponse { + // Verify index exists + { + let guard = state.index.lock(); + match guard.as_ref() { + Some(idx) if idx.definition.name == name => {} + _ => { + return ( + StatusCode::NOT_FOUND, + Json(serde_json::json!({"error": format!("Index '{}' not found", name)})), + ).into_response(); + } + } + } + + // Store sync metadata if provided + if let Some(meta) = &batch.meta { + let mut sync_meta = state.sync_meta.lock(); + sync_meta.insert(meta.source.clone(), meta.clone()); + } + + let ops_count = batch.ops.len(); + if ops_count == 0 { + return (StatusCode::OK, Json(serde_json::json!({"accepted": 0}))).into_response(); + } + + // Ensure WAL writer exists (lazy init) + let wal_path = { + let mut wal_guard = state.ops_wal.lock(); + if wal_guard.is_none() { + let wal_dir = state.data_dir.join("wal"); + std::fs::create_dir_all(&wal_dir).ok(); + let path = wal_dir.join("ops.wal"); + *wal_guard = Some(crate::ops_wal::WalWriter::new(path)); + } + wal_guard.as_ref().unwrap().path().to_path_buf() + }; + + // Write to WAL on blocking thread (fsync is blocking I/O) + let result = tokio::task::spawn_blocking(move || { + let writer = crate::ops_wal::WalWriter::new(&wal_path); + writer.append_batch(&batch.ops) + }) + .await; + + match result { + Ok(Ok(bytes)) => { + (StatusCode::OK, Json(serde_json::json!({ + "accepted": ops_count, + "bytes_written": bytes, + }))).into_response() + } + Ok(Err(e)) => { + eprintln!("WAL write error: {e}"); + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({ + "error": format!("WAL write failed: {e}"), + }))).into_response() + } + Err(e) => { + eprintln!("WAL write task panicked: {e}"); + (StatusCode::INTERNAL_SERVER_ERROR, Json(serde_json::json!({ + "error": "Internal error", + }))).into_response() + } + } +} + +/// Fallback for when pg-sync feature is disabled. 
+#[cfg(not(feature = "pg-sync"))] +async fn handle_ops( + AxumPath(_name): AxumPath, +) -> impl IntoResponse { + (StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "pg-sync feature not enabled"}))) +} + +/// GET /api/internal/sync-lag — Return latest sync metadata from all sources. +#[cfg(feature = "pg-sync")] +async fn handle_sync_lag( + State(state): State, +) -> impl IntoResponse { + let sync_meta = state.sync_meta.lock(); + let sources: Vec<&crate::pg_sync::ops::SyncMeta> = sync_meta.values().collect(); + Json(serde_json::json!({ "sources": sources })) +} + +#[cfg(not(feature = "pg-sync"))] +async fn handle_sync_lag() -> impl IntoResponse { + Json(serde_json::json!({ "sources": [] })) +} + async fn handle_ui() -> impl IntoResponse { Html(include_str!("../static/index.html")) } From 53ee7d57fba4c2a5f58d0e9dc142cf0c1f035472 Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:28:07 -0600 Subject: [PATCH 03/19] =?UTF-8?q?feat:=20WAL=20ops=20processor=20=E2=80=94?= =?UTF-8?q?=20converts=20ops=20to=20engine=20mutations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ops processor that reads from WAL and routes to the engine: - Regular ops (set/remove/add): build PatchPayload with old+new values, call engine.patch() — no docstore read needed - queryOpSet: parse filter string, execute query for matching slots, apply nested ops to all matches - Delete: route to engine.delete() - Filter parser for queryOpSet: supports eq and in operators Includes json_to_qvalue converter (serde_json::Value → query::Value) for the PatchPayload/FieldValue type boundary. 9 tests: scalar update, insert (no old), multi-value add/remove, delete+queryOpSet skip, filter parsing, value type parsing, cursor persistence. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/lib.rs | 2 + src/ops_processor.rs | 453 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 455 insertions(+) create mode 100644 src/ops_processor.rs diff --git a/src/lib.rs b/src/lib.rs index d660876f..66628268 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -2,6 +2,8 @@ pub mod bitmap_fs; pub mod bound_store; pub mod bucket_diff_log; #[cfg(feature = "pg-sync")] +pub mod ops_processor; +#[cfg(feature = "pg-sync")] pub mod ops_wal; pub mod cache; pub mod capture; diff --git a/src/ops_processor.rs b/src/ops_processor.rs new file mode 100644 index 00000000..cc473fd3 --- /dev/null +++ b/src/ops_processor.rs @@ -0,0 +1,453 @@ +//! WAL ops processor — reads ops from WAL files and applies them as engine mutations. +//! +//! The processor runs as a dedicated thread, tailing WAL files and converting ops +//! into engine mutations (put/patch/delete). It handles: +//! - Regular ops (set/remove/add) via PatchPayload +//! - queryOpSet via query resolution + bulk bitmap ops +//! - Delete via engine.delete() +//! - Deduplication via shared dedup helper + +use std::collections::HashMap; +use std::path::{Path, PathBuf}; +use std::time::Duration; + +use serde_json::Value as JsonValue; + +use crate::concurrent_engine::ConcurrentEngine; +use crate::mutation::{FieldValue, PatchField, PatchPayload}; +use crate::pg_sync::op_dedup::dedup_ops; +use crate::pg_sync::ops::{EntityOps, Op}; +use crate::query::{BitdexQuery, FilterClause, Value as QValue}; + +/// Convert a serde_json::Value to a query::Value. 
+fn json_to_qvalue(v: &JsonValue) -> QValue { + match v { + JsonValue::Number(n) => { + if let Some(i) = n.as_i64() { + QValue::Integer(i) + } else if let Some(f) = n.as_f64() { + QValue::Float(f) + } else { + QValue::Integer(0) + } + } + JsonValue::Bool(b) => QValue::Bool(*b), + JsonValue::String(s) => QValue::String(s.clone()), + JsonValue::Null => QValue::Integer(0), // Null → zero for bitmap purposes + _ => QValue::String(v.to_string()), // Arrays/objects → string representation + } +} + +/// Configuration for the ops processor. +pub struct OpsProcessorConfig { + /// Max records to read per WAL batch + pub batch_size: usize, + /// How long to sleep when no new records are available + pub poll_interval: Duration, + /// Path to persist the cursor position + pub cursor_path: PathBuf, +} + +impl Default for OpsProcessorConfig { + fn default() -> Self { + Self { + batch_size: 10_000, + poll_interval: Duration::from_millis(50), + cursor_path: PathBuf::from("wal_cursor"), + } + } +} + +/// Process a single batch of entity ops against the engine. +/// Returns (applied, skipped, errors). 
+pub fn apply_ops_batch( + engine: &ConcurrentEngine, + batch: &mut Vec, +) -> (usize, usize, usize) { + // Dedup first + dedup_ops(batch); + + let mut applied = 0usize; + let mut skipped = 0usize; + let mut errors = 0usize; + + for entry in batch.iter() { + let entity_id = entry.entity_id; + if entity_id < 0 || entity_id > u32::MAX as i64 { + skipped += 1; + continue; + } + let slot = entity_id as u32; + + for op in &entry.ops { + match op { + Op::Delete => { + match engine.delete(slot) { + Ok(()) => applied += 1, + Err(e) => { + tracing::warn!("ops processor: delete slot {slot} failed: {e}"); + errors += 1; + } + } + } + + Op::QueryOpSet { query, ops } => { + match apply_query_op_set(engine, query, ops) { + Ok(count) => applied += count, + Err(e) => { + tracing::warn!("ops processor: queryOpSet '{query}' failed: {e}"); + errors += 1; + } + } + } + + // Accumulate set/remove/add ops per entity, then apply as a patch + _ => { + // Collect all non-delete, non-queryOpSet ops for this entity + // and apply as a single patch + } + } + } + + // Build a PatchPayload from the set/remove/add ops for this entity + let patch = build_patch_from_ops(&entry.ops); + if !patch.fields.is_empty() { + match engine.patch(slot, &patch) { + Ok(()) => applied += 1, + Err(e) => { + tracing::warn!("ops processor: patch slot {slot} failed: {e}"); + errors += 1; + } + } + } + } + + (applied, skipped, errors) +} + +/// Build a PatchPayload from a list of ops for a single entity. +/// Pairs remove/set ops on the same field into PatchField { old, new }. +/// Add ops become multi-value inserts. 
+fn build_patch_from_ops(ops: &[Op]) -> PatchPayload { + let mut fields: HashMap = HashMap::new(); + + // First pass: collect removes (old values) and sets (new values) per field + let mut old_values: HashMap<&str, &JsonValue> = HashMap::new(); + let mut new_values: HashMap<&str, &JsonValue> = HashMap::new(); + let mut add_values: HashMap<&str, Vec<&JsonValue>> = HashMap::new(); + let mut remove_values: HashMap<&str, Vec<&JsonValue>> = HashMap::new(); + + for op in ops { + match op { + Op::Remove { field, value } => { + // Check if there's a corresponding Set for this field (scalar update) + let has_set = ops.iter().any(|o| matches!(o, Op::Set { field: f, .. } if f == field)); + if has_set { + old_values.insert(field, value); + } else { + // Multi-value remove + remove_values.entry(field).or_default().push(value); + } + } + Op::Set { field, value } => { + new_values.insert(field, value); + } + Op::Add { field, value } => { + add_values.entry(field).or_default().push(value); + } + Op::Delete | Op::QueryOpSet { .. 
} => { + // Handled separately + } + } + } + + // Build PatchFields for scalar set/remove pairs + for (field, new_val) in &new_values { + let old = old_values + .get(*field) + .map(|v| FieldValue::Single(json_to_qvalue(v))) + .unwrap_or(FieldValue::Single(QValue::Integer(0))); + let new = FieldValue::Single(json_to_qvalue(new_val)); + fields.insert(field.to_string(), PatchField { old, new }); + } + + // Build PatchFields for multi-value adds + for (field, vals) in &add_values { + let new_multi: Vec = vals.iter().map(|v| json_to_qvalue(v)).collect(); + let existing = fields.entry(field.to_string()).or_insert_with(|| PatchField { + old: FieldValue::Multi(vec![]), + new: FieldValue::Multi(vec![]), + }); + if let FieldValue::Multi(ref mut m) = existing.new { + m.extend(new_multi); + } else { + *existing = PatchField { + old: FieldValue::Multi(vec![]), + new: FieldValue::Multi(vals.iter().map(|v| json_to_qvalue(v)).collect()), + }; + } + } + + // Build PatchFields for multi-value removes + for (field, vals) in &remove_values { + let removed: Vec = vals.iter().map(|v| json_to_qvalue(v)).collect(); + let existing = fields.entry(field.to_string()).or_insert_with(|| PatchField { + old: FieldValue::Multi(vec![]), + new: FieldValue::Multi(vec![]), + }); + if let FieldValue::Multi(ref mut m) = existing.old { + m.extend(removed); + } else { + *existing = PatchField { + old: FieldValue::Multi(vals.iter().map(|v| json_to_qvalue(v)).collect()), + new: FieldValue::Multi(vec![]), + }; + } + } + + PatchPayload { fields } +} + +/// Resolve a queryOpSet: execute the query to get matching slots, then apply +/// the nested ops to each slot. 
+fn apply_query_op_set( + engine: &ConcurrentEngine, + query_str: &str, + ops: &[Op], +) -> Result { + // Parse the query string into filter clauses + let filters = parse_filter_from_query_str(query_str)?; + + let query = BitdexQuery { + filters, + sort: None, + limit: usize::MAX, // Get all matching slots + offset: None, + cursor: None, + skip_cache: true, // Don't pollute cache with internal queries + }; + + // Execute query to get matching slot IDs + let result = engine + .execute_query(&query) + .map_err(|e| format!("queryOpSet query failed: {e}"))?; + + let slot_ids = &result.ids; + if slot_ids.is_empty() { + return Ok(0); + } + + // Build the patch from nested ops + let patch = build_patch_from_ops(ops); + if patch.fields.is_empty() { + return Ok(0); + } + + // Apply patch to each matching slot + let mut applied = 0; + for &slot_id in slot_ids { + if slot_id < 0 { + continue; + } + let slot = slot_id as u32; + match engine.patch(slot, &patch) { + Ok(()) => applied += 1, + Err(e) => { + tracing::warn!("queryOpSet: patch slot {slot} failed: {e}"); + } + } + } + + Ok(applied) +} + +/// Parse a simple filter string like "modelVersionIds eq 456" or "postId eq 789" +/// into filter clauses. 
+fn parse_filter_from_query_str(query_str: &str) -> Result, String> { + let clauses: Vec<&str> = query_str.split(" AND ").collect(); + let mut filters = Vec::new(); + + for clause in clauses { + let parts: Vec<&str> = clause.trim().splitn(3, ' ').collect(); + if parts.len() < 3 { + return Err(format!("Invalid filter clause: '{clause}'")); + } + + let field = parts[0].to_string(); + let op = parts[1].to_lowercase(); + let value_str = parts[2]; + + let filter = match op.as_str() { + "eq" => { + let value = parse_query_value(value_str)?; + FilterClause::Eq(field, value) + } + "in" => { + let values = parse_query_values_array(value_str)?; + FilterClause::In(field, values) + } + _ => { + return Err(format!("Unsupported filter op '{op}' in queryOpSet")); + } + }; + filters.push(filter); + } + + Ok(filters) +} + +/// Parse a single query value from a string. +fn parse_query_value(s: &str) -> Result { + if let Ok(n) = s.parse::() { + return Ok(QValue::Integer(n)); + } + if let Ok(f) = s.parse::() { + return Ok(QValue::Float(f)); + } + if s == "true" { + return Ok(QValue::Bool(true)); + } + if s == "false" { + return Ok(QValue::Bool(false)); + } + let stripped = s.trim_matches('"').trim_matches('\''); + Ok(QValue::String(stripped.to_string())) +} + +/// Parse an array of query values like "[101, 102, 103]". +fn parse_query_values_array(s: &str) -> Result, String> { + let trimmed = s.trim(); + if !trimmed.starts_with('[') || !trimmed.ends_with(']') { + return Err(format!("Expected array for 'in' filter, got: '{s}'")); + } + let inner = &trimmed[1..trimmed.len() - 1]; + let mut values = Vec::new(); + for part in inner.split(',') { + let part = part.trim(); + if !part.is_empty() { + values.push(parse_query_value(part)?); + } + } + Ok(values) +} + +/// Persist cursor position to disk. +pub fn save_cursor(path: &Path, cursor: u64) -> std::io::Result<()> { + std::fs::write(path, cursor.to_string()) +} + +/// Load cursor position from disk. Returns 0 if file doesn't exist. 
+pub fn load_cursor(path: &Path) -> u64 { + std::fs::read_to_string(path) + .ok() + .and_then(|s| s.trim().parse().ok()) + .unwrap_or(0) +} + +#[cfg(test)] +mod tests { + use super::*; + use serde_json::json; + + #[test] + fn test_build_patch_from_scalar_update() { + let ops = vec![ + Op::Remove { field: "nsfwLevel".into(), value: json!(8) }, + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + ]; + let patch = build_patch_from_ops(&ops); + assert_eq!(patch.fields.len(), 1); + let field = &patch.fields["nsfwLevel"]; + assert_eq!(field.old, FieldValue::Single(QValue::Integer(8))); + assert_eq!(field.new, FieldValue::Single(QValue::Integer(16))); + } + + #[test] + fn test_build_patch_from_insert_no_old() { + let ops = vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + Op::Set { field: "type".into(), value: json!("image") }, + ]; + let patch = build_patch_from_ops(&ops); + assert_eq!(patch.fields.len(), 2); + assert_eq!(patch.fields["nsfwLevel"].old, FieldValue::Single(QValue::Integer(0))); + assert_eq!(patch.fields["nsfwLevel"].new, FieldValue::Single(QValue::Integer(16))); + } + + #[test] + fn test_build_patch_from_add() { + let ops = vec![ + Op::Add { field: "tagIds".into(), value: json!(42) }, + Op::Add { field: "tagIds".into(), value: json!(99) }, + ]; + let patch = build_patch_from_ops(&ops); + assert_eq!(patch.fields.len(), 1); + if let FieldValue::Multi(ref vals) = patch.fields["tagIds"].new { + assert_eq!(vals.len(), 2); + } else { + panic!("Expected Multi"); + } + } + + #[test] + fn test_build_patch_from_multi_remove() { + let ops = vec![ + Op::Remove { field: "tagIds".into(), value: json!(42) }, + ]; + let patch = build_patch_from_ops(&ops); + assert_eq!(patch.fields.len(), 1); + if let FieldValue::Multi(ref vals) = patch.fields["tagIds"].old { + assert_eq!(vals.len(), 1); + assert_eq!(vals[0], QValue::Integer(42)); + } else { + panic!("Expected Multi for old"); + } + } + + #[test] + fn test_build_patch_skips_delete_and_query() { + 
let ops = vec![ + Op::Delete, + Op::QueryOpSet { query: "x eq 1".into(), ops: vec![] }, + Op::Set { field: "a".into(), value: json!(1) }, + ]; + let patch = build_patch_from_ops(&ops); + assert_eq!(patch.fields.len(), 1); + assert!(patch.fields.contains_key("a")); + } + + #[test] + fn test_parse_filter_eq() { + let filters = parse_filter_from_query_str("modelVersionIds eq 456").unwrap(); + assert_eq!(filters.len(), 1); + assert!(matches!(&filters[0], FilterClause::Eq(f, QValue::Integer(456)) if f == "modelVersionIds")); + } + + #[test] + fn test_parse_filter_in() { + let filters = parse_filter_from_query_str("modelVersionIds in [101, 102, 103]").unwrap(); + assert_eq!(filters.len(), 1); + if let FilterClause::In(f, vals) = &filters[0] { + assert_eq!(f, "modelVersionIds"); + assert_eq!(vals.len(), 3); + } else { + panic!("Expected In clause"); + } + } + + #[test] + fn test_parse_query_value_types() { + assert!(matches!(parse_query_value("42").unwrap(), QValue::Integer(42))); + assert!(matches!(parse_query_value("true").unwrap(), QValue::Bool(true))); + assert!(matches!(parse_query_value("\"hello\"").unwrap(), QValue::String(s) if s == "hello")); + } + + #[test] + fn test_cursor_persistence() { + let dir = tempfile::TempDir::new().unwrap(); + let path = dir.path().join("cursor"); + assert_eq!(load_cursor(&path), 0); + save_cursor(&path, 12345).unwrap(); + assert_eq!(load_cursor(&path), 12345); + } +} From 6132cae1aa859c8a95c22ba07c96741811ccdb06 Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:29:24 -0600 Subject: [PATCH 04/19] =?UTF-8?q?feat:=20V2=20ops=20poller=20=E2=80=94=20r?= =?UTF-8?q?eads=20BitdexOps,=20deduplicates,=20POSTs=20to=20/ops=20endpoin?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New ops_poller.rs replaces outbox_poller for V2 sync: - Reads from BitdexOps table (JSONB ops arrays) instead of BitdexOutbox - Cursor managed in PG bitdex_cursors table (not in BitDex) - 
Deduplicates via shared dedup_ops() before sending - POSTs OpsBatch with SyncMeta (cursor, max_id, lag) to /ops endpoint - Health gate: pauses when BitDex is unreachable Also adds post_ops() to BitdexClient. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/pg_sync/bitdex_client.rs | 18 +++ src/pg_sync/mod.rs | 1 + src/pg_sync/ops_poller.rs | 210 +++++++++++++++++++++++++++++++++++ 3 files changed, 229 insertions(+) create mode 100644 src/pg_sync/ops_poller.rs diff --git a/src/pg_sync/bitdex_client.rs b/src/pg_sync/bitdex_client.rs index a5e45c79..50be9d2b 100644 --- a/src/pg_sync/bitdex_client.rs +++ b/src/pg_sync/bitdex_client.rs @@ -243,6 +243,24 @@ impl BitdexClient { .await; } + /// POST a batch of V2 ops to the BitDex /ops endpoint. + pub async fn post_ops(&self, batch: &super::ops::OpsBatch) -> Result<(), String> { + let url = format!("{}/ops", self.base_url); + let resp = self.client + .post(&url) + .json(batch) + .send() + .await + .map_err(|e| format!("post_ops request failed: {e}"))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + return Err(format!("post_ops returned {status}: {body}")); + } + Ok(()) + } + pub async fn get_cursor(&self, cursor_name: &str) -> Result, String> { let url = format!("{}/cursors/{}", self.base_url, cursor_name); let resp = self.client diff --git a/src/pg_sync/mod.rs b/src/pg_sync/mod.rs index 18be2a74..ca75c0ac 100644 --- a/src/pg_sync/mod.rs +++ b/src/pg_sync/mod.rs @@ -14,6 +14,7 @@ pub mod copy_streams; pub mod metrics_poller; pub mod op_dedup; pub mod ops; +pub mod ops_poller; pub mod outbox_poller; pub mod progress; pub mod queries; diff --git a/src/pg_sync/ops_poller.rs b/src/pg_sync/ops_poller.rs new file mode 100644 index 00000000..090224d4 --- /dev/null +++ b/src/pg_sync/ops_poller.rs @@ -0,0 +1,210 @@ +//! V2 ops poller: reads from BitdexOps table, deduplicates, and POSTs to BitDex /ops endpoint. +//! +//! 
Replaces the V1 outbox_poller by reading self-contained ops (with old+new values) +//! instead of entity IDs that require enrichment queries. +//! +//! Poll loop: +//! 1. On boot: read cursor from PG bitdex_cursors table +//! 2. SELECT from BitdexOps WHERE id > cursor ORDER BY id ASC LIMIT N +//! 3. Deserialize JSONB ops arrays +//! 4. Dedup via shared dedup_ops() +//! 5. POST batch to BitDex /ops endpoint with sync metadata +//! 6. Advance cursor in PG +//! 7. Report max_outbox_id for lag calculation + +use std::time::Duration; + +use sqlx::PgPool; +use tokio::time::interval; + +use super::bitdex_client::BitdexClient; +use super::op_dedup::dedup_ops; +use super::ops::{EntityOps, Op, OpsBatch, SyncMeta}; + +/// Row from BitdexOps table. +#[derive(Debug, sqlx::FromRow)] +struct OpsRow { + id: i64, + entity_id: i64, + ops: sqlx::types::Json>, +} + +/// Run the V2 ops poller loop. Runs forever until cancelled. +pub async fn run_ops_poller( + pool: &PgPool, + client: &BitdexClient, + poll_interval_secs: u64, + batch_limit: i64, + cursor_name: &str, + replica_id: Option<&str>, +) -> Result<(), String> { + // Wait for BitDex health + eprintln!("Ops poller waiting for BitDex to be healthy..."); + loop { + if client.is_healthy().await { + break; + } + tokio::time::sleep(Duration::from_secs(2)).await; + } + eprintln!("BitDex is healthy."); + + // Read initial cursor from PG + let mut cursor: i64 = read_cursor_from_pg(pool, cursor_name) + .await + .unwrap_or(0); + eprintln!( + "Ops poller started (interval={}s, batch_limit={}, cursor_name={}, starting_cursor={})", + poll_interval_secs, batch_limit, cursor_name, cursor + ); + + let mut ticker = interval(Duration::from_secs(poll_interval_secs)); + let mut bitdex_was_down = false; + + loop { + ticker.tick().await; + + // Health gate + if !client.is_healthy().await { + if !bitdex_was_down { + eprintln!("Ops poller: BitDex unreachable, pausing"); + bitdex_was_down = true; + } + continue; + } + if bitdex_was_down { + 
eprintln!("Ops poller: BitDex is back, resuming"); + bitdex_was_down = false; + } + + let cycle_start = std::time::Instant::now(); + match poll_and_process(pool, client, batch_limit, cursor_name, &mut cursor, replica_id).await { + Ok(processed) => { + let cycle_secs = cycle_start.elapsed().as_secs_f64(); + if processed > 0 { + eprintln!("Ops poller: processed {processed} ops (cursor={cursor}, cycle={cycle_secs:.3}s)"); + } + } + Err(e) => { + eprintln!("Ops poller error: {e}"); + } + } + } +} + +/// Single poll + process cycle. +async fn poll_and_process( + pool: &PgPool, + client: &BitdexClient, + batch_limit: i64, + cursor_name: &str, + cursor: &mut i64, + replica_id: Option<&str>, +) -> Result { + // Fetch ops after cursor + let rows = poll_ops_from_cursor(pool, *cursor, batch_limit) + .await + .map_err(|e| format!("poll_ops: {e}"))?; + + if rows.is_empty() { + return Ok(0); + } + + let max_id = rows.iter().map(|r| r.id).max().unwrap_or(*cursor); + let total_rows = rows.len(); + + // Convert to EntityOps + let mut batch: Vec = rows + .into_iter() + .map(|row| EntityOps { + entity_id: row.entity_id, + ops: row.ops.0, + }) + .collect(); + + // Dedup + dedup_ops(&mut batch); + + if batch.is_empty() { + // All ops cancelled out — still advance cursor + advance_cursor(pool, cursor_name, max_id, cursor).await?; + return Ok(total_rows); + } + + // Get max ops ID for lag calculation + let max_ops_id = get_max_ops_id(pool).await.unwrap_or(max_id); + + // Build batch with metadata + let ops_batch = OpsBatch { + ops: batch, + meta: Some(SyncMeta { + source: replica_id.unwrap_or("default").to_string(), + cursor: Some(max_id), + max_id: Some(max_ops_id), + lag_rows: Some(max_ops_id - max_id), + }), + }; + + // POST to BitDex + client + .post_ops(&ops_batch) + .await + .map_err(|e| format!("post_ops: {e}"))?; + + // Advance cursor + advance_cursor(pool, cursor_name, max_id, cursor).await?; + + Ok(total_rows) +} + +async fn advance_cursor( + pool: &PgPool, + cursor_name: &str, 
+ max_id: i64, + cursor: &mut i64, +) -> Result<(), String> { + super::queries::upsert_cursor(pool, cursor_name, max_id) + .await + .map_err(|e| format!("upsert_cursor: {e}"))?; + *cursor = max_id; + Ok(()) +} + +// ── SQL queries ── + +/// Read cursor from PG bitdex_cursors table. +async fn read_cursor_from_pg(pool: &PgPool, cursor_name: &str) -> Result { + let row: Option<(i64,)> = sqlx::query_as( + r#"SELECT last_outbox_id FROM bitdex_cursors WHERE replica_id = $1"#, + ) + .bind(cursor_name) + .fetch_optional(pool) + .await?; + Ok(row.map(|r| r.0).unwrap_or(0)) +} + +/// Poll ops from BitdexOps table after a cursor position. +async fn poll_ops_from_cursor( + pool: &PgPool, + cursor: i64, + limit: i64, +) -> Result, sqlx::Error> { + sqlx::query_as::<_, OpsRow>( + r#"SELECT id, entity_id, ops FROM "BitdexOps" + WHERE id > $1 + ORDER BY id ASC + LIMIT $2"#, + ) + .bind(cursor) + .bind(limit) + .fetch_all(pool) + .await +} + +/// Get the current max ops ID (for lag calculation). +async fn get_max_ops_id(pool: &PgPool) -> Result { + let row: (Option,) = + sqlx::query_as(r#"SELECT MAX(id) FROM "BitdexOps""#) + .fetch_one(pool) + .await?; + Ok(row.0.unwrap_or(0)) +} From 1513c77ecf4dbecd6e7ec965d6595ca2273efd7e Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:32:17 -0600 Subject: [PATCH 05/19] feat: YAML trigger config + SQL generator for Sync V2 Config-driven PG trigger generation: - SyncSource struct: direct tables (slot_field + track_fields), multi-value join tables (field + value_field), fan-out tables (query + query_source) - SyncConfig: YAML-parseable config with sync_sources array - SQL generator: CREATE OR REPLACE FUNCTION + CREATE TRIGGER for each source - Expression interpolation in track_fields: "GREATEST({scannedAt}, {createdAt}) as existedAt" - {column} placeholder substitution with OLD/NEW prefixes - Hash-based trigger naming (bitdex_{table}_{hash8}) for reconciliation - IS DISTINCT FROM checks for UPDATE ops (only emit when value 
actually changes) - queryOpSet generation for fan-out tables - ENABLE ALWAYS on all triggers (CDC compatibility) 11 tests: parsing, column substitution, all three trigger types, hash change detection, YAML parsing, expression interpolation. Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 62 ++++- Cargo.toml | 4 +- src/pg_sync/mod.rs | 1 + src/pg_sync/trigger_gen.rs | 558 +++++++++++++++++++++++++++++++++++++ 4 files changed, 623 insertions(+), 2 deletions(-) create mode 100644 src/pg_sync/trigger_gen.rs diff --git a/Cargo.lock b/Cargo.lock index 69bc163b..90a95fe1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -205,13 +205,14 @@ checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" [[package]] name = "bitdex-v2" -version = "1.0.71" +version = "1.0.93" dependencies = [ "arc-swap", "axum", "bytes", "chrono", "clap", + "crc32fast", "criterion", "crossbeam-channel", "dashmap", @@ -229,10 +230,13 @@ dependencies = [ "rpmalloc", "serde", "serde_json", + "serde_yaml", "sqlx", "tar", "tempfile", "thiserror 2.0.18", + "tikv-jemalloc-ctl", + "tikv-jemallocator", "tokio", "tokio-util", "toml", @@ -1513,6 +1517,12 @@ dependencies = [ "windows-link", ] +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + [[package]] name = "pem-rfc7468" version = "0.7.0" @@ -2241,6 +2251,19 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + [[package]] name = "sha1" version = "0.10.6" @@ -2675,6 +2698,37 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "tikv-jemalloc-ctl" +version = "0.6.1" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "661f1f6a57b3a36dc9174a2c10f19513b4866816e13425d3e418b11cc37bc24c" +dependencies = [ + "libc", + "paste", + "tikv-jemalloc-sys", +] + +[[package]] +name = "tikv-jemalloc-sys" +version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "tikv-jemallocator" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +dependencies = [ + "libc", + "tikv-jemalloc-sys", +] + [[package]] name = "tinystr" version = "0.8.2" @@ -2971,6 +3025,12 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + [[package]] name = "untrusted" version = "0.9.0" diff --git a/Cargo.toml b/Cargo.toml index 818ce2b7..37f3f389 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,9 +15,10 @@ default = [] server = ["dep:axum", "dep:tower-http", "dep:tokio", "dep:tokio-util", "dep:prometheus"] loadtest = ["ureq"] replay = ["ureq"] -pg-sync = ["dep:sqlx", "dep:clap", "dep:reqwest", "dep:chrono", "dep:tokio", "dep:axum", "dep:tower-http", "dep:futures-core", "dep:futures-util", "dep:bytes"] +pg-sync = ["dep:sqlx", "dep:clap", "dep:reqwest", "dep:chrono", "dep:tokio", "dep:axum", "dep:tower-http", "dep:futures-core", "dep:futures-util", "dep:bytes", "dep:serde_yaml"] simd = ["roaring/simd"] heap-prof = ["dep:tikv-jemallocator", "dep:tikv-jemalloc-ctl"] +serde_yaml = ["dep:serde_yaml"] [dependencies] # Bitmap indexes 
//! YAML-driven PG trigger SQL generator for V2 ops pipeline.
//!
//! Reads a `sync_sources` YAML config and generates PL/pgSQL trigger functions
//! that emit ops into the BitdexOps table. Two table types:
//!
//! **Direct tables** (slot = PG column):
//! - `track_fields`: scalar fields → emit remove/set pairs via IS DISTINCT FROM
//! - `field` + `value_field`: multi-value join tables → emit add/remove
//! - `on_delete: delete_slot`: emit delete op
//! - `sets_alive: true`: only this table can create new alive slots
//!
//! **Fan-out tables** (slots resolved by BitDex query):
//! - `query`: BitDex query template with {column} placeholders
//! - `query_source`: optional PG subquery for cross-table values
//! - `track_fields`: fields to track on the source table

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

use serde::Deserialize;

/// A sync source definition from the YAML config.
///
/// Exactly one "shape" is expected per entry:
/// - direct table:      `slot_field` + `track_fields` (optionally `on_delete`)
/// - multi-value table: `field` + `value_field` (+ `slot_field`)
/// - fan-out table:     `query` (+ `track_fields`, optionally `query_source`)
#[derive(Debug, Clone, Deserialize)]
pub struct SyncSource {
    /// PG table name (e.g., "Image", "TagsOnImageNew")
    pub table: String,

    /// For direct tables: PG column that maps to the BitDex slot ID
    pub slot_field: Option<String>,

    /// For direct tables: list of scalar fields to track.
    /// Can include expressions: "GREATEST({scannedAt}, {createdAt}) as existedAt"
    pub track_fields: Option<Vec<String>>,

    /// For multi-value join tables: the BitDex field name (e.g., "tagIds")
    pub field: Option<String>,

    /// For multi-value join tables: the PG column containing the value (e.g., "tagId")
    pub value_field: Option<String>,

    /// Optional SQL WHERE filter for the trigger (e.g., CollectionItem status filter)
    pub filter: Option<String>,

    /// If true, this table's INSERT ops set the alive bit on new slots
    #[serde(default)]
    pub sets_alive: bool,

    /// If "delete_slot", emit a delete op on DELETE
    pub on_delete: Option<String>,

    /// For fan-out tables: BitDex query template with {column} placeholders
    pub query: Option<String>,

    /// For fan-out tables: PG subquery to get values not on the triggering table
    pub query_source: Option<String>,

    /// Tables that must be loaded before this one during dumps
    #[serde(rename = "dependsOn")]
    pub depends_on: Option<Vec<String>>,
}

/// Full sync config loaded from YAML.
#[derive(Debug, Clone, Deserialize)]
pub struct SyncConfig {
    pub sync_sources: Vec<SyncSource>,
}

impl SyncConfig {
    /// Load from a YAML string. The error string carries the serde_yaml
    /// parse message for operator-facing diagnostics.
    pub fn from_yaml(yaml: &str) -> Result<Self, String> {
        serde_yaml::from_str(yaml).map_err(|e| format!("Failed to parse sync config: {e}"))
    }
}

/// Generate the trigger function name with hash for reconciliation.
/// Format: bitdex_{table}_ops_{hash8}
///
/// The hash is taken over the generated function body, so any config change
/// produces a new name and stale triggers can be detected and replaced.
pub fn trigger_function_name(source: &SyncSource) -> String {
    let body = generate_trigger_body(source);
    let hash = short_hash(&body);
    format!(
        "bitdex_{}_ops_{}",
        source.table.to_lowercase(),
        hash
    )
}

/// Generate the trigger name. Format: bitdex_{table}_{hash8} — same body
/// hash as the function name, without the `_ops` infix.
pub fn trigger_name(source: &SyncSource) -> String {
    let body = generate_trigger_body(source);
    let hash = short_hash(&body);
    format!("bitdex_{}_{}", source.table.to_lowercase(), hash)
}
+pub fn generate_trigger_sql(source: &SyncSource) -> String { + let func_name = trigger_function_name(source); + let trig_name = trigger_name(source); + let body = generate_trigger_body(source); + + let trigger_events = if source.field.is_some() { + // Multi-value join table: INSERT and DELETE only + "AFTER INSERT OR DELETE" + } else if source.on_delete.as_deref() == Some("delete_slot") { + "AFTER INSERT OR UPDATE OR DELETE" + } else { + "AFTER INSERT OR UPDATE" + }; + + format!( + r#"CREATE OR REPLACE FUNCTION {func_name}() RETURNS trigger AS $$ +{body} +$$ LANGUAGE plpgsql; + +DROP TRIGGER IF EXISTS {trig_name} ON "{table}"; +CREATE TRIGGER {trig_name} {trigger_events} ON "{table}" + FOR EACH ROW EXECUTE FUNCTION {func_name}(); +ALTER TABLE "{table}" ENABLE ALWAYS TRIGGER {trig_name}; +"#, + func_name = func_name, + trig_name = trig_name, + body = body, + trigger_events = trigger_events, + table = source.table, + ) +} + +/// Generate the PL/pgSQL function body for a sync source. +fn generate_trigger_body(source: &SyncSource) -> String { + if let Some(ref field) = source.field { + // Multi-value join table (tags, tools, techniques, etc.) + generate_multi_value_body(source, field) + } else if source.query.is_some() { + // Fan-out table (ModelVersion, Post, Model) + generate_fan_out_body(source) + } else { + // Direct table (Image) + generate_direct_body(source) + } +} + +/// Generate body for direct tables (e.g., Image). 
///
/// Emitted function layout:
/// - INSERT: one `set` op per tracked field (no `remove` — there is no prior state)
/// - DELETE (only when `on_delete: delete_slot`): a single `delete` op
/// - UPDATE: a `remove`+`set` pair per field whose value actually changed
///   (guarded by IS DISTINCT FROM so NULL transitions are handled correctly)
fn generate_direct_body(source: &SyncSource) -> String {
    let slot_field = source.slot_field.as_deref().unwrap_or("id");
    let track_fields = source.track_fields.as_deref().unwrap_or(&[]);
    let has_delete = source.on_delete.as_deref() == Some("delete_slot");

    let mut body = String::from("DECLARE\n _ops jsonb;\nBEGIN\n");

    // INSERT: emit set ops for all tracked fields (no remove since no prior state)
    body.push_str(" IF TG_OP = 'INSERT' THEN\n");
    body.push_str(" _ops := jsonb_build_array(\n");
    let insert_ops: Vec<String> = track_fields
        .iter()
        .map(|f| {
            let (field_name, expr) = parse_track_field(f);
            let new_expr = substitute_columns(&expr, "NEW");
            format!(
                " jsonb_build_object('op', 'set', 'field', '{}', 'value', to_jsonb({}))",
                field_name, new_expr
            )
        })
        .collect();
    body.push_str(&insert_ops.join(",\n"));
    body.push_str("\n );\n");
    // With no track_fields this still inserts a row carrying an empty ops
    // array; downstream consumers treat that as a slot-creation marker.
    body.push_str(&format!(
        " INSERT INTO \"BitdexOps\" (entity_id, ops) VALUES (NEW.\"{}\", _ops);\n",
        slot_field
    ));
    body.push_str(" RETURN NEW;\n");

    // DELETE: single `delete` op; only generated for slot-owning tables.
    if has_delete {
        body.push_str(" ELSIF TG_OP = 'DELETE' THEN\n");
        body.push_str(&format!(
            " INSERT INTO \"BitdexOps\" (entity_id, ops) VALUES (OLD.\"{}\", '[{{\"op\":\"delete\"}}]'::jsonb);\n",
            slot_field
        ));
        body.push_str(" RETURN OLD;\n");
    }

    // UPDATE: emit remove/set pairs only for changed fields
    body.push_str(" ELSE\n");
    body.push_str(" _ops := '[]'::jsonb;\n");
    for f in track_fields {
        let (field_name, expr) = parse_track_field(f);
        let old_expr = substitute_columns(&expr, "OLD");
        let new_expr = substitute_columns(&expr, "NEW");
        // \x20 keeps rustfmt from eating the continuation-line leading spaces.
        body.push_str(&format!(
            " IF ({old}) IS DISTINCT FROM ({new}) THEN\n\
             \x20 _ops := _ops || jsonb_build_array(\n\
             \x20 jsonb_build_object('op', 'remove', 'field', '{field}', 'value', to_jsonb({old})),\n\
             \x20 jsonb_build_object('op', 'set', 'field', '{field}', 'value', to_jsonb({new}))\n\
             \x20 );\n\
             \x20 END IF;\n",
            old = old_expr,
            new = new_expr,
            field = field_name,
        ));
    }
    // Skip the INSERT entirely when nothing changed — avoids no-op ops rows.
    body.push_str(" IF jsonb_array_length(_ops) > 0 THEN\n");
    body.push_str(&format!(
        " INSERT INTO \"BitdexOps\" (entity_id, ops) VALUES (NEW.\"{}\", _ops);\n",
        slot_field
    ));
    body.push_str(" END IF;\n");
    body.push_str(" RETURN NEW;\n");
    body.push_str(" END IF;\n");
    body.push_str("END;");

    body
}

/// Generate body for multi-value join tables (e.g., TagsOnImageNew).
///
/// INSERT rows become `add` ops, DELETE rows become `remove` ops; the
/// optional `filter` only gates the INSERT path (deletes always emit a
/// remove so the index never retains a value the join row no longer has).
fn generate_multi_value_body(source: &SyncSource, field: &str) -> String {
    let slot_field = source.slot_field.as_deref().unwrap_or("imageId");
    let value_field = source.value_field.as_deref().unwrap_or("id");
    // NOTE(review): the filter rewrite is a plain substring replace of
    // "imageId" → NEW."imageId" — confirm no filter expression contains
    // "imageId" as part of a longer identifier.
    let filter_clause = source
        .filter
        .as_ref()
        .map(|f| format!(" IF {} THEN\n", f.replace("imageId", "NEW.\"imageId\"")))
        .unwrap_or_default();
    let filter_end = if source.filter.is_some() {
        " END IF;\n"
    } else {
        ""
    };

    format!(
        r#"BEGIN
 IF TG_OP = 'INSERT' THEN
{filter_start} INSERT INTO "BitdexOps" (entity_id, ops)
 VALUES (NEW."{slot}", jsonb_build_array(
 jsonb_build_object('op', 'add', 'field', '{field}', 'value', to_jsonb(NEW."{value}"))
 ));
{filter_end} RETURN NEW;
 ELSIF TG_OP = 'DELETE' THEN
 INSERT INTO "BitdexOps" (entity_id, ops)
 VALUES (OLD."{slot}", jsonb_build_array(
 jsonb_build_object('op', 'remove', 'field', '{field}', 'value', to_jsonb(OLD."{value}"))
 ));
 RETURN OLD;
 END IF;
 RETURN COALESCE(NEW, OLD);
END;"#,
        slot = slot_field,
        field = field,
        value = value_field,
        filter_start = filter_clause,
        filter_end = filter_end,
    )
}

/// Generate body for fan-out tables (e.g., ModelVersion, Post).
+fn generate_fan_out_body(source: &SyncSource) -> String { + let query_template = source.query.as_deref().unwrap_or(""); + let track_fields = source.track_fields.as_deref().unwrap_or(&[]); + + let mut body = String::from("DECLARE\n _ops jsonb;\n _query text;\n"); + + // If there's a query_source, we need a variable for its result + if source.query_source.is_some() { + body.push_str(" _source_result jsonb;\n"); + } + body.push_str("BEGIN\n"); + body.push_str(" IF TG_OP = 'UPDATE' THEN\n"); + + // Build the query string with column substitution + if let Some(ref query_source) = source.query_source { + let source_sql = substitute_columns(query_source, "NEW"); + body.push_str(&format!( + " EXECUTE format('SELECT ({})') INTO _source_result;\n", + source_sql.replace('\'', "''") + )); + // Substitute the query_source result into the query template + body.push_str(&format!( + " _query := '{}';\n", + query_template + )); + // Replace placeholders with source result values + body.push_str(" -- Substitute source values into query template\n"); + } else { + // Direct substitution from NEW columns + let query_sql = substitute_columns(query_template, "NEW"); + body.push_str(&format!(" _query := '{}';\n", query_sql)); + } + + // Build ops array from tracked fields that changed + body.push_str(" _ops := '[]'::jsonb;\n"); + for f in track_fields { + let (field_name, expr) = parse_track_field(f); + let old_expr = substitute_columns(&expr, "OLD"); + let new_expr = substitute_columns(&expr, "NEW"); + body.push_str(&format!( + " IF ({old}) IS DISTINCT FROM ({new}) THEN\n\ + \x20 _ops := _ops || jsonb_build_array(\n\ + \x20 jsonb_build_object('op', 'remove', 'field', '{field}', 'value', to_jsonb({old})),\n\ + \x20 jsonb_build_object('op', 'set', 'field', '{field}', 'value', to_jsonb({new}))\n\ + \x20 );\n\ + \x20 END IF;\n", + old = old_expr, + new = new_expr, + field = field_name, + )); + } + + body.push_str(" IF jsonb_array_length(_ops) > 0 THEN\n"); + body.push_str(&format!( + " 
INSERT INTO \"BitdexOps\" (entity_id, ops) VALUES (NEW.id, jsonb_build_array(\n\ + \x20 jsonb_build_object('op', 'queryOpSet', 'query', _query, 'ops', _ops)\n\ + \x20 ));\n" + )); + body.push_str(" END IF;\n"); + body.push_str(" RETURN NEW;\n"); + body.push_str(" END IF;\n"); + body.push_str(" RETURN COALESCE(NEW, OLD);\n"); + body.push_str("END;"); + + body +} + +/// Parse a track_field entry. Returns (bitdex_field_name, sql_expression). +/// Simple field: "nsfwLevel" → ("nsfwLevel", "\"nsfwLevel\"") +/// Expression: "GREATEST({scannedAt}, {createdAt}) as existedAt" → ("existedAt", "GREATEST(\"scannedAt\", \"createdAt\")") +fn parse_track_field(field: &str) -> (String, String) { + if let Some(as_pos) = field.to_lowercase().rfind(" as ") { + let expr = &field[..as_pos].trim(); + let alias = &field[as_pos + 4..].trim(); + // Replace {col} with "col" (quoted column reference) + let sql = expr + .replace('{', "\"") + .replace('}', "\""); + (alias.to_string(), sql) + } else { + // Simple field name + (field.to_string(), format!("\"{}\"", field)) + } +} + +/// Substitute {column} placeholders with prefix."column" references. +/// E.g., substitute_columns("GREATEST({scannedAt}, {createdAt})", "NEW") +/// → "GREATEST(NEW.\"scannedAt\", NEW.\"createdAt\")" +fn substitute_columns(expr: &str, prefix: &str) -> String { + let mut result = String::new(); + let mut chars = expr.chars().peekable(); + while let Some(c) = chars.next() { + if c == '{' { + let mut col = String::new(); + while let Some(&next) = chars.peek() { + if next == '}' { + chars.next(); + break; + } + col.push(chars.next().unwrap()); + } + result.push_str(&format!("{}.\"{}\"", prefix, col)); + } else { + result.push(c); + } + } + result +} + +/// Compute a short (8-char) hash of a string. 
///
/// Uses FNV-1a (64-bit) rather than `std::collections::hash_map::DefaultHasher`:
/// these hashes are persisted inside Postgres trigger/function names and used
/// for reconciliation, and DefaultHasher's algorithm is explicitly NOT
/// guaranteed stable across Rust releases — a toolchain upgrade would silently
/// rename every trigger. FNV-1a is fixed by specification, so the names are
/// stable for a given config forever. Returns 8 lowercase hex characters.
fn short_hash(s: &str) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    let mut hash = FNV_OFFSET_BASIS;
    for &byte in s.as_bytes() {
        hash ^= u64::from(byte);
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    format!("{hash:016x}")[..8].to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_parse_track_field_simple() {
        let (name, expr) = parse_track_field("nsfwLevel");
        assert_eq!(name, "nsfwLevel");
        assert_eq!(expr, "\"nsfwLevel\"");
    }

    #[test]
    fn test_parse_track_field_expression() {
        let (name, expr) = parse_track_field("GREATEST({scannedAt}, {createdAt}) as existedAt");
        assert_eq!(name, "existedAt");
        assert_eq!(expr, "GREATEST(\"scannedAt\", \"createdAt\")");
    }

    #[test]
    fn test_substitute_columns() {
        let result = substitute_columns("GREATEST({scannedAt}, {createdAt})", "NEW");
        assert_eq!(result, "GREATEST(NEW.\"scannedAt\", NEW.\"createdAt\")");
    }

    #[test]
    fn test_substitute_columns_simple() {
        let result = substitute_columns("{nsfwLevel}", "OLD");
        assert_eq!(result, "OLD.\"nsfwLevel\"");
    }

    #[test]
    fn test_generate_multi_value_trigger() {
        let source = SyncSource {
            table: "TagsOnImageNew".into(),
            slot_field: Some("imageId".into()),
            track_fields: None,
            field: Some("tagIds".into()),
            value_field: Some("tagId".into()),
            filter: None,
            sets_alive: false,
            on_delete: None,
            query: None,
            query_source: None,
            depends_on: None,
        };
        let sql = generate_trigger_sql(&source);
        assert!(sql.contains("CREATE OR REPLACE FUNCTION"));
        assert!(sql.contains("'add'"));
        assert!(sql.contains("'remove'"));
        assert!(sql.contains("tagIds"));
        assert!(sql.contains("ENABLE ALWAYS"));
    }

    #[test]
    fn test_generate_direct_trigger() {
        let source = SyncSource {
            table: "Image".into(),
            slot_field: Some("id".into()),
            track_fields: Some(vec!["nsfwLevel".into(), "type".into()]),
            field: None,
            value_field: None,
            filter: None,
            sets_alive: true,
            on_delete: Some("delete_slot".into()),
            query: None,
            query_source: None,
            depends_on: None,
        };
        let sql = generate_trigger_sql(&source);
        assert!(sql.contains("IS DISTINCT FROM"));
        assert!(sql.contains("nsfwLevel"));
        assert!(sql.contains("delete"));
    }

    #[test]
    fn test_generate_fan_out_trigger() {
        let source = SyncSource {
            table: "ModelVersion".into(),
            slot_field: None,
            track_fields: Some(vec!["baseModel".into()]),
            field: None,
            value_field: None,
            filter: None,
            sets_alive: false,
            on_delete: None,
            query: Some("modelVersionIds eq {id}".into()),
            query_source: None,
            depends_on: None,
        };
        let sql = generate_trigger_sql(&source);
        assert!(sql.contains("queryOpSet"));
        assert!(sql.contains("modelVersionIds eq"));
    }

    #[test]
    fn test_trigger_name_includes_hash() {
        let source = SyncSource {
            table: "Image".into(),
            slot_field: Some("id".into()),
            track_fields: Some(vec!["nsfwLevel".into()]),
            field: None,
            value_field: None,
            filter: None,
            sets_alive: false,
            on_delete: None,
            query: None,
            query_source: None,
            depends_on: None,
        };
        let name = trigger_name(&source);
        assert!(name.starts_with("bitdex_image_"));
        assert_eq!(name.len(), "bitdex_image_".len() + 8);
    }

    #[test]
    fn test_trigger_hash_changes_with_config() {
        let source1 = SyncSource {
            table: "Image".into(),
            slot_field: Some("id".into()),
            track_fields: Some(vec!["nsfwLevel".into()]),
            field: None,
            value_field: None,
            filter: None,
            sets_alive: false,
            on_delete: None,
            query: None,
            query_source: None,
            depends_on: None,
        };
        let source2 = SyncSource {
            track_fields: Some(vec!["nsfwLevel".into(), "type".into()]),
            ..source1.clone()
        };
        let name1 = trigger_name(&source1);
        let name2 = trigger_name(&source2);
        assert_ne!(name1, name2, "Different configs should produce different hashes");
    }

    #[test]
    fn test_yaml_parsing() {
        let yaml = r#"
sync_sources:
  - table: Image
    slot_field: id
    sets_alive: true
    track_fields: [nsfwLevel, type]
    on_delete: delete_slot
  - table: TagsOnImageNew
    slot_field: imageId
    field: tagIds
    value_field: tagId
  - table: ModelVersion
    query: "modelVersionIds eq {id}"
    track_fields: [baseModel]
"#;
        let config = SyncConfig::from_yaml(yaml).unwrap();
        assert_eq!(config.sync_sources.len(), 3);
        assert_eq!(config.sync_sources[0].table, "Image");
        assert!(config.sync_sources[0].sets_alive);
        assert_eq!(config.sync_sources[1].field.as_deref(), Some("tagIds"));
        assert!(config.sync_sources[2].query.is_some());
    }

    #[test]
    fn test_expression_in_track_fields() {
        let source = SyncSource {
            table: "Image".into(),
            slot_field: Some("id".into()),
            track_fields: Some(vec![
                "nsfwLevel".into(),
                "GREATEST({scannedAt}, {createdAt}) as existedAt".into(),
                "({flags} & (1 << 13)) != 0 AND ({flags} & (1 << 2)) = 0 as hasMeta".into(),
            ]),
            field: None,
            value_field: None,
            filter: None,
            sets_alive: true,
            on_delete: Some("delete_slot".into()),
            query: None,
            query_source: None,
            depends_on: None,
        };
        let sql = generate_trigger_sql(&source);
        assert!(sql.contains("GREATEST"));
        assert!(sql.contains("existedAt"));
        assert!(sql.contains("hasMeta"));
    }
}
//! Dump pipeline — manages table dump lifecycle for initial loading.
//!
//! Server side: dump registry (track which tables have been loaded).
//! Client side: pg-sync checks dump history, runs flat COPYs, writes WAL files.
//!
//! Dump lifecycle:
//! 1. PUT /dumps — register a new dump (returns task ID, WAL reader starts polling)
//! 2. pg-sync writes ops to WAL file on shared filesystem
//! 3. POST /dumps/{name}/loaded — signal file is complete
//! 4. WAL reader finishes processing, marks dump as complete
//! 5. GET /dumps — check status per table

use std::collections::HashMap;
use std::path::{Path, PathBuf};
use std::time::SystemTime;

use serde::{Deserialize, Serialize};

/// State of a single dump.
///
/// Serialized verbatim into `dumps.json` by [`DumpRegistry`], so field names
/// are part of the on-disk/API format.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DumpEntry {
    /// Dump name (e.g., "Image-a1b2c3d4")
    pub name: String,
    /// WAL file path (relative to data_dir)
    pub wal_path: Option<String>,
    /// Current status
    pub status: DumpStatus,
    /// Number of ops written (reported by pg-sync)
    pub ops_written: u64,
    /// Number of ops processed by WAL reader
    pub ops_processed: u64,
    /// When the dump was registered (seconds since UNIX epoch)
    pub created_at: u64,
    /// When the dump completed processing (seconds since UNIX epoch)
    pub completed_at: Option<u64>,
}

/// Dump status.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub enum DumpStatus {
    /// pg-sync is writing to the WAL file
    Writing,
    /// pg-sync signaled the file is complete, WAL reader is processing
    Loading,
    /// WAL reader finished processing
    Complete,
    /// Dump failed; the payload carries the error message
    Failed(String),
}

/// Registry of dump state. Persisted to dumps.json in the data directory.
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct DumpRegistry {
    pub dumps: HashMap<String, DumpEntry>,
}

impl DumpRegistry {
    /// Load from a JSON file. A missing or unparseable file yields an empty
    /// registry — startup is best-effort and never fails on bad state.
    pub fn load(path: &Path) -> Self {
        match std::fs::read_to_string(path) {
            Ok(text) => serde_json::from_str(&text).unwrap_or_default(),
            Err(_) => Self::default(),
        }
    }

    /// Save to a JSON file. Atomic: the JSON is written to a sibling `.tmp`
    /// file first, then renamed over the target so readers never observe a
    /// half-written registry.
    pub fn save(&self, path: &Path) -> std::io::Result<()> {
        let json = serde_json::to_string_pretty(self)
            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?;
        let staging = path.with_extension("tmp");
        std::fs::write(&staging, &json)?;
        std::fs::rename(&staging, path)
    }

    /// Register a new dump in `Writing` state (re-registering an existing
    /// name resets its entry). Returns the stored entry.
    pub fn register(&mut self, name: String, wal_path: Option<String>) -> &DumpEntry {
        let created_at = SystemTime::now()
            .duration_since(SystemTime::UNIX_EPOCH)
            .unwrap_or_default()
            .as_secs();

        let entry = DumpEntry {
            name: name.clone(),
            wal_path,
            status: DumpStatus::Writing,
            ops_written: 0,
            ops_processed: 0,
            created_at,
            completed_at: None,
        };
        self.dumps.insert(name.clone(), entry);
        &self.dumps[&name]
    }

    /// Mark a dump as loaded (pg-sync finished writing the WAL file).
    /// Returns `None` when the name is unknown.
    pub fn mark_loaded(&mut self, name: &str, ops_written: u64) -> Option<&DumpEntry> {
        let entry = self.dumps.get_mut(name)?;
        entry.status = DumpStatus::Loading;
        entry.ops_written = ops_written;
        Some(entry)
    }

    /// Mark a dump as complete (WAL reader finished processing).
+ pub fn mark_complete(&mut self, name: &str, ops_processed: u64) -> Option<&DumpEntry> { + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(); + + if let Some(entry) = self.dumps.get_mut(name) { + entry.status = DumpStatus::Complete; + entry.ops_processed = ops_processed; + entry.completed_at = Some(now); + Some(entry) + } else { + None + } + } + + /// Mark a dump as failed. + pub fn mark_failed(&mut self, name: &str, error: String) { + if let Some(entry) = self.dumps.get_mut(name) { + entry.status = DumpStatus::Failed(error); + } + } + + /// Remove a dump from the registry. + pub fn remove(&mut self, name: &str) -> Option { + self.dumps.remove(name) + } + + /// Clear all dumps. + pub fn clear(&mut self) { + self.dumps.clear(); + } + + /// Check if a dump with the given name exists and is complete. + pub fn is_complete(&self, name: &str) -> bool { + self.dumps + .get(name) + .map(|e| e.status == DumpStatus::Complete) + .unwrap_or(false) + } + + /// Get all dump names that are complete. + pub fn completed_names(&self) -> Vec<&str> { + self.dumps + .values() + .filter(|e| e.status == DumpStatus::Complete) + .map(|e| e.name.as_str()) + .collect() + } + + /// Check if all dumps are complete (no pending/writing/loading). + pub fn all_complete(&self) -> bool { + !self.dumps.is_empty() + && self.dumps.values().all(|e| e.status == DumpStatus::Complete) + } +} + +/// Build the dump name from a table name and config hash. +/// Format: "{Table}-{hash8}" +pub fn dump_name(table: &str, config_hash: &str) -> String { + format!("{}-{}", table, &config_hash[..8.min(config_hash.len())]) +} + +/// Compute a config hash for a sync source entry. 
///
/// Uses FNV-1a (64-bit), a specification-fixed algorithm, instead of
/// `std::collections::hash_map::DefaultHasher`: these hashes name dumps that
/// are persisted in `dumps.json` and on shared storage, and DefaultHasher's
/// output is explicitly not guaranteed stable across Rust releases — a
/// toolchain upgrade would orphan every previously recorded dump. Returns
/// 16 lowercase hex characters.
pub fn config_hash(yaml_fragment: &str) -> String {
    const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
    const FNV_PRIME: u64 = 0x0000_0100_0000_01b3;
    let mut hash = FNV_OFFSET_BASIS;
    for &byte in yaml_fragment.as_bytes() {
        hash ^= u64::from(byte);
        hash = hash.wrapping_mul(FNV_PRIME);
    }
    format!("{hash:016x}")
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_dump_lifecycle() {
        let mut reg = DumpRegistry::default();
        assert!(reg.dumps.is_empty());

        // Register
        reg.register("Image-a1b2c3d4".into(), Some("dumps/image.wal".into()));
        assert_eq!(reg.dumps.len(), 1);
        assert_eq!(reg.dumps["Image-a1b2c3d4"].status, DumpStatus::Writing);

        // Mark loaded
        reg.mark_loaded("Image-a1b2c3d4", 107_000_000);
        assert_eq!(reg.dumps["Image-a1b2c3d4"].status, DumpStatus::Loading);
        assert_eq!(reg.dumps["Image-a1b2c3d4"].ops_written, 107_000_000);

        // Mark complete
        reg.mark_complete("Image-a1b2c3d4", 107_000_000);
        assert_eq!(reg.dumps["Image-a1b2c3d4"].status, DumpStatus::Complete);
        assert!(reg.dumps["Image-a1b2c3d4"].completed_at.is_some());

        assert!(reg.is_complete("Image-a1b2c3d4"));
        assert!(reg.all_complete());
    }

    #[test]
    fn test_dump_persistence() {
        let dir = tempfile::TempDir::new().unwrap();
        let path = dir.path().join("dumps.json");

        let mut reg = DumpRegistry::default();
        reg.register("Image-abc".into(), None);
        reg.mark_complete("Image-abc", 100);
        reg.save(&path).unwrap();

        let loaded = DumpRegistry::load(&path);
        assert_eq!(loaded.dumps.len(), 1);
        assert!(loaded.is_complete("Image-abc"));
    }

    #[test]
    fn test_dump_removal() {
        let mut reg = DumpRegistry::default();
        reg.register("Image-abc".into(), None);
        reg.register("Tags-def".into(), None);
        assert_eq!(reg.dumps.len(), 2);

        reg.remove("Image-abc");
        assert_eq!(reg.dumps.len(), 1);
        assert!(!reg.dumps.contains_key("Image-abc"));
    }

    #[test]
    fn test_dump_clear() {
        let mut reg = DumpRegistry::default();
        reg.register("Image-abc".into(), None);
        reg.register("Tags-def".into(), None);
        reg.clear();
        assert!(reg.dumps.is_empty());
    }

    #[test]
    fn test_all_complete() {
        let mut reg = DumpRegistry::default();
        assert!(!reg.all_complete()); // Empty = not all complete

        reg.register("Image-abc".into(), None);
        reg.register("Tags-def".into(), None);
        assert!(!reg.all_complete());

        reg.mark_complete("Image-abc", 100);
        assert!(!reg.all_complete()); // Tags still pending

        reg.mark_loaded("Tags-def", 50);
        reg.mark_complete("Tags-def", 50);
        assert!(reg.all_complete());
    }

    #[test]
    fn test_dump_name() {
        assert_eq!(dump_name("Image", "a1b2c3d4e5f6"), "Image-a1b2c3d4");
    }

    #[test]
    fn test_config_hash_deterministic() {
        let h1 = config_hash("table: Image\nslot_field: id\ntrack_fields: [nsfwLevel]");
        let h2 = config_hash("table: Image\nslot_field: id\ntrack_fields: [nsfwLevel]");
        assert_eq!(h1, h2);
    }

    #[test]
    fn test_config_hash_changes() {
        let h1 = config_hash("table: Image\ntrack_fields: [nsfwLevel]");
        let h2 = config_hash("table: Image\ntrack_fields: [nsfwLevel, type]");
        assert_ne!(h1, h2);
    }

    #[test]
    fn test_load_missing_file() {
        let reg = DumpRegistry::load(Path::new("/nonexistent/dumps.json"));
        assert!(reg.dumps.is_empty());
    }

    #[test]
    fn test_failed_dump() {
        let mut reg = DumpRegistry::default();
        reg.register("Image-abc".into(), None);
        reg.mark_failed("Image-abc", "connection reset".into());
        assert!(matches!(reg.dumps["Image-abc"].status, DumpStatus::Failed(_)));
        assert!(!reg.is_complete("Image-abc"));
    }
}
sync source metadata (cursor, lag) keyed by source name. #[cfg(feature = "pg-sync")] sync_meta: Mutex>, + /// Dump registry for tracking table dump lifecycle. + #[cfg(feature = "pg-sync")] + dump_registry: Mutex, } type SharedState = Arc; @@ -1001,6 +1004,11 @@ impl BitdexServer { ops_wal: Mutex::new(None), #[cfg(feature = "pg-sync")] sync_meta: Mutex::new(std::collections::HashMap::new()), + #[cfg(feature = "pg-sync")] + dump_registry: { + let dumps_path = self.data_dir.join("dumps.json"); + Mutex::new(crate::pg_sync::dump::DumpRegistry::load(&dumps_path)) + }, }); // Try to restore an existing index from disk @@ -1080,6 +1088,11 @@ impl BitdexServer { .route("/api/internal/pgsync-metrics", post(handle_pgsync_metrics)) .route("/api/indexes/{name}/ops", post(handle_ops)) .route("/api/internal/sync-lag", get(handle_sync_lag)) + .route("/api/indexes/{name}/dumps", get(handle_list_dumps)) + .route("/api/indexes/{name}/dumps", put(handle_register_dump)) + .route("/api/indexes/{name}/dumps/{dump_name}/loaded", post(handle_dump_loaded)) + .route("/api/indexes/{name}/dumps/{dump_name}", delete(handle_delete_dump)) + .route("/api/indexes/{name}/dumps/clear", post(handle_clear_dumps)) .route("/metrics", get(handle_metrics)) .route("/", get(handle_ui)) .with_state(Arc::clone(&state)); @@ -4250,6 +4263,124 @@ async fn handle_ops( (StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "pg-sync feature not enabled"}))) } +// ── Dump endpoints ── + +/// GET /api/indexes/{name}/dumps — List all dumps and their status. 
#[cfg(feature = "pg-sync")]
async fn handle_list_dumps(
    State(state): State<SharedState>,
    AxumPath(_name): AxumPath<String>,
) -> impl IntoResponse {
    // Read-only snapshot of the registry; serialized directly into the body.
    let reg = state.dump_registry.lock();
    Json(serde_json::json!({
        "dumps": reg.dumps,
        "all_complete": reg.all_complete(),
    }))
}

// Fallback when built without pg-sync: report an empty dump set.
#[cfg(not(feature = "pg-sync"))]
async fn handle_list_dumps(AxumPath(_name): AxumPath<String>) -> impl IntoResponse {
    Json(serde_json::json!({"dumps": {}}))
}

/// PUT /api/indexes/{name}/dumps — Register a new dump.
#[cfg(feature = "pg-sync")]
async fn handle_register_dump(
    State(state): State<SharedState>,
    AxumPath(_name): AxumPath<String>,
    Json(body): Json<serde_json::Value>,
) -> impl IntoResponse {
    // NOTE(review): a missing/invalid "name" silently registers a dump named
    // "unknown" instead of returning 400 — confirm this is intended.
    let dump_name = body["name"].as_str().unwrap_or("unknown").to_string();
    let wal_path = body["wal_path"].as_str().map(|s| s.to_string());

    let mut reg = state.dump_registry.lock();
    reg.register(dump_name.clone(), wal_path);

    // Persistence is best-effort: the in-memory registration stands even if
    // the JSON snapshot fails to write.
    let dumps_path = state.data_dir.join("dumps.json");
    if let Err(e) = reg.save(&dumps_path) {
        eprintln!("Warning: failed to save dump registry: {e}");
    }

    (StatusCode::CREATED, Json(serde_json::json!({
        "name": dump_name,
        "status": "writing",
    })))
}

#[cfg(not(feature = "pg-sync"))]
async fn handle_register_dump(
    AxumPath(_name): AxumPath<String>,
    Json(_body): Json<serde_json::Value>,
) -> impl IntoResponse {
    (StatusCode::NOT_FOUND, Json(serde_json::json!({"error": "pg-sync not enabled"})))
}

/// POST /api/indexes/{name}/dumps/{dump_name}/loaded — Signal dump file is complete.
#[cfg(feature = "pg-sync")]
async fn handle_dump_loaded(
    State(state): State<SharedState>,
    AxumPath((_name, dump_name)): AxumPath<(String, String)>,
    Json(body): Json<serde_json::Value>,
) -> impl IntoResponse {
    // Missing/invalid counts default to 0 rather than rejecting the request.
    let ops_written = body["ops_written"].as_u64().unwrap_or(0);

    let mut reg = state.dump_registry.lock();
    match reg.mark_loaded(&dump_name, ops_written) {
        Some(_) => {
            // Persist the Writing → Loading transition; save errors are
            // ignored (in-memory state remains authoritative).
            let dumps_path = state.data_dir.join("dumps.json");
            reg.save(&dumps_path).ok();
            Json(serde_json::json!({"status": "loading", "name": dump_name}))
        }
        // NOTE(review): unknown dump names answer 200 with an "error" body,
        // not 404 — confirm clients key off the JSON rather than the status.
        None => Json(serde_json::json!({"error": format!("Dump '{}' not found", dump_name)})),
    }
}

#[cfg(not(feature = "pg-sync"))]
async fn handle_dump_loaded(
    AxumPath((_name, _dump_name)): AxumPath<(String, String)>,
    Json(_body): Json<serde_json::Value>,
) -> impl IntoResponse {
    Json(serde_json::json!({"error": "pg-sync not enabled"}))
}

/// DELETE /api/indexes/{name}/dumps/{dump_name} — Remove a dump from history.
#[cfg(feature = "pg-sync")]
async fn handle_delete_dump(
    State(state): State<SharedState>,
    AxumPath((_name, dump_name)): AxumPath<(String, String)>,
) -> impl IntoResponse {
    // Idempotent: deleting an unknown dump still answers 204.
    let mut reg = state.dump_registry.lock();
    reg.remove(&dump_name);
    let dumps_path = state.data_dir.join("dumps.json");
    reg.save(&dumps_path).ok();
    StatusCode::NO_CONTENT
}

#[cfg(not(feature = "pg-sync"))]
async fn handle_delete_dump(
    AxumPath((_name, _dump_name)): AxumPath<(String, String)>,
) -> impl IntoResponse {
    StatusCode::NOT_FOUND
}

/// POST /api/indexes/{name}/dumps/clear — Clear all dump history.
+#[cfg(feature = "pg-sync")] +async fn handle_clear_dumps( + State(state): State, + AxumPath(_name): AxumPath, +) -> impl IntoResponse { + let mut reg = state.dump_registry.lock(); + reg.clear(); + let dumps_path = state.data_dir.join("dumps.json"); + reg.save(&dumps_path).ok(); + StatusCode::NO_CONTENT +} + +#[cfg(not(feature = "pg-sync"))] +async fn handle_clear_dumps(AxumPath(_name): AxumPath) -> impl IntoResponse { + StatusCode::NOT_FOUND +} + /// GET /api/internal/sync-lag — Return latest sync metadata from all sources. #[cfg(feature = "pg-sync")] async fn handle_sync_lag( From 7b20663ce75ca95f286c0657433d88aa720ae129 Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:38:15 -0600 Subject: [PATCH 07/19] feat: V2 sync Prometheus metrics (bitdex_sync_* namespace) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New unified sync metrics with source label: - bitdex_sync_cursor_position{source="..."} — current cursor - bitdex_sync_max_id{source="..."} — max ops table ID - bitdex_sync_lag_rows{source="..."} — rows behind - bitdex_sync_ops_total{source="..."} — total ops received - bitdex_sync_wal_bytes{source="..."} — WAL file size Metrics populated from SyncMeta in the POST /ops endpoint. Old bitdex_pgsync_* metrics preserved for backward compat. Binary rename (bitdex-pg-sync → bitdex-sync) deferred to deployment PR. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/metrics.rs | 39 +++++++++++++++++++++++++++++++++++++++ src/server.rs | 14 +++++++++++++- 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/src/metrics.rs b/src/metrics.rs index 25491177..950b976d 100644 --- a/src/metrics.rs +++ b/src/metrics.rs @@ -116,6 +116,13 @@ pub struct Metrics { pub pgsync_cycle_seconds: HistogramVec, pub pgsync_rows_fetched_total: IntCounterVec, pub pgsync_cursor_position: IntGaugeVec, + + // V2 sync metrics (unified namespace with source label) + pub sync_cursor_position: IntGaugeVec, + pub sync_max_id: IntGaugeVec, + pub sync_lag_rows: IntGaugeVec, + pub sync_ops_total: IntCounterVec, + pub sync_wal_bytes: IntGaugeVec, } impl Metrics { @@ -570,6 +577,28 @@ impl Metrics { ) .unwrap(); + // V2 sync metrics (unified namespace) + let sync_cursor_position = IntGaugeVec::new( + Opts::new("bitdex_sync_cursor_position", "Current sync cursor position"), + &["source"], + ).unwrap(); + let sync_max_id = IntGaugeVec::new( + Opts::new("bitdex_sync_max_id", "Max ops table ID (for lag calculation)"), + &["source"], + ).unwrap(); + let sync_lag_rows = IntGaugeVec::new( + Opts::new("bitdex_sync_lag_rows", "Number of ops rows behind"), + &["source"], + ).unwrap(); + let sync_ops_total = IntCounterVec::new( + Opts::new("bitdex_sync_ops_total", "Total ops received from sync sources"), + &["source"], + ).unwrap(); + let sync_wal_bytes = IntGaugeVec::new( + Opts::new("bitdex_sync_wal_bytes", "Current WAL file size in bytes"), + &["source"], + ).unwrap(); + // Register all metrics registry.register(Box::new(alive_documents.clone())).unwrap(); registry.register(Box::new(slot_high_water.clone())).unwrap(); @@ -671,6 +700,11 @@ impl Metrics { registry.register(Box::new(pgsync_cycle_seconds.clone())).unwrap(); registry.register(Box::new(pgsync_rows_fetched_total.clone())).unwrap(); registry.register(Box::new(pgsync_cursor_position.clone())).unwrap(); + 
registry.register(Box::new(sync_cursor_position.clone())).unwrap(); + registry.register(Box::new(sync_max_id.clone())).unwrap(); + registry.register(Box::new(sync_lag_rows.clone())).unwrap(); + registry.register(Box::new(sync_ops_total.clone())).unwrap(); + registry.register(Box::new(sync_wal_bytes.clone())).unwrap(); Self { registry, @@ -746,6 +780,11 @@ impl Metrics { pgsync_cycle_seconds, pgsync_rows_fetched_total, pgsync_cursor_position, + sync_cursor_position, + sync_max_id, + sync_lag_rows, + sync_ops_total, + sync_wal_bytes, } } diff --git a/src/server.rs b/src/server.rs index c653c54e..6e3b5c65 100644 --- a/src/server.rs +++ b/src/server.rs @@ -4203,10 +4203,22 @@ async fn handle_ops( } } - // Store sync metadata if provided + // Store sync metadata + update Prometheus metrics if let Some(meta) = &batch.meta { let mut sync_meta = state.sync_meta.lock(); sync_meta.insert(meta.source.clone(), meta.clone()); + + let m = &state.metrics; + let source = meta.source.as_str(); + if let Some(cursor) = meta.cursor { + m.sync_cursor_position.with_label_values(&[source]).set(cursor); + } + if let Some(max_id) = meta.max_id { + m.sync_max_id.with_label_values(&[source]).set(max_id); + } + if let Some(lag) = meta.lag_rows { + m.sync_lag_rows.with_label_values(&[source]).set(lag); + } } let ops_count = batch.ops.len(); From 8ae19f0ec59d3f5539021e822d64da0af868af38 Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:40:44 -0600 Subject: [PATCH 08/19] test: Sync V2 integration tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 9 integration tests exercising the full ops pipeline: - WAL roundtrip with dedup (write → read → dedup → verify) - Delete absorption through WAL - Add/remove cancellation through WAL - queryOpSet serialization through WAL - Cursor resume across multiple appends - Dump registry full workflow (register → load → complete → persist) - Dump config change detection (hash mismatch triggers 
re-dump) - Full Civitai trigger config (6 sources, all generate valid SQL) - OpsBatch JSON format roundtrip with SyncMeta Total: 69 tests across all Sync V2 modules (60 unit + 9 integration). Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/sync_v2_integration.rs | 378 +++++++++++++++++++++++++++++++++++ 1 file changed, 378 insertions(+) create mode 100644 tests/sync_v2_integration.rs diff --git a/tests/sync_v2_integration.rs b/tests/sync_v2_integration.rs new file mode 100644 index 00000000..1ffb3083 --- /dev/null +++ b/tests/sync_v2_integration.rs @@ -0,0 +1,378 @@ +//! Integration tests for the Sync V2 pipeline. +//! +//! Tests the ops → WAL → processor pipeline without PG. +//! Full E2E tests (PG triggers → poller → server) require a running +//! server and PG instance — see tests/e2e/ for those. + +#![cfg(feature = "pg-sync")] + +use serde_json::json; +use tempfile::TempDir; + +use bitdex_v2::ops_wal::{WalReader, WalWriter}; +use bitdex_v2::pg_sync::op_dedup::dedup_ops; +use bitdex_v2::pg_sync::ops::{EntityOps, Op, OpsBatch, SyncMeta}; +use bitdex_v2::pg_sync::dump::{DumpRegistry, DumpStatus, dump_name, config_hash}; +use bitdex_v2::pg_sync::trigger_gen::{SyncConfig, SyncSource, generate_trigger_sql}; + +// ── WAL Pipeline Integration ── + +#[test] +fn test_ops_wal_roundtrip_with_dedup() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("ops.wal"); + + // Write ops with duplicates + let writer = WalWriter::new(&wal_path); + let batch = vec![ + EntityOps { + entity_id: 1, + ops: vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(8) }, + ], + }, + EntityOps { + entity_id: 1, + ops: vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, // Overwrites first + ], + }, + EntityOps { + entity_id: 2, + ops: vec![ + Op::Add { field: "tagIds".into(), value: json!(42) }, + ], + }, + ]; + writer.append_batch(&batch).unwrap(); + + // Read back + let mut reader = WalReader::new(&wal_path, 0); + let result = 
reader.read_batch(100).unwrap(); + assert_eq!(result.entries.len(), 3); + + // Dedup + let mut entries = result.entries; + dedup_ops(&mut entries); + + // Entity 1: last set wins (nsfwLevel=16) + let entity1 = entries.iter().find(|e| e.entity_id == 1).unwrap(); + let set_ops: Vec<_> = entity1.ops.iter() + .filter(|op| matches!(op, Op::Set { field, .. } if field == "nsfwLevel")) + .collect(); + assert_eq!(set_ops.len(), 1); + if let Op::Set { value, .. } = &set_ops[0] { + assert_eq!(*value, json!(16)); + } + + // Entity 2: add preserved + let entity2 = entries.iter().find(|e| e.entity_id == 2).unwrap(); + assert_eq!(entity2.ops.len(), 1); + assert!(matches!(&entity2.ops[0], Op::Add { field, .. } if field == "tagIds")); +} + +#[test] +fn test_delete_absorbs_prior_ops_through_wal() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("ops.wal"); + + let writer = WalWriter::new(&wal_path); + + // First batch: set some fields + writer.append_batch(&[EntityOps { + entity_id: 1, + ops: vec![ + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + Op::Add { field: "tagIds".into(), value: json!(42) }, + ], + }]).unwrap(); + + // Second batch: delete the entity + writer.append_batch(&[EntityOps { + entity_id: 1, + ops: vec![Op::Delete], + }]).unwrap(); + + // Read all + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + assert_eq!(result.entries.len(), 2); + + // Dedup should collapse to just delete + let mut entries = result.entries; + dedup_ops(&mut entries); + assert_eq!(entries.len(), 1); + assert_eq!(entries[0].ops.len(), 1); + assert!(matches!(&entries[0].ops[0], Op::Delete)); +} + +#[test] +fn test_add_remove_cancellation_through_wal() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("ops.wal"); + + let writer = WalWriter::new(&wal_path); + writer.append_batch(&[ + EntityOps { + entity_id: 1, + ops: vec![Op::Add { field: "tagIds".into(), value: json!(42) }], + }, + EntityOps { 
+ entity_id: 1, + ops: vec![Op::Remove { field: "tagIds".into(), value: json!(42) }], + }, + ]).unwrap(); + + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + let mut entries = result.entries; + dedup_ops(&mut entries); + + // Net zero — entity should be dropped + assert!(entries.is_empty() || entries[0].ops.is_empty()); +} + +#[test] +fn test_query_op_set_through_wal() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("ops.wal"); + + let writer = WalWriter::new(&wal_path); + writer.append_batch(&[EntityOps { + entity_id: 456, + ops: vec![Op::QueryOpSet { + query: "modelVersionIds eq 456".into(), + ops: vec![ + Op::Remove { field: "baseModel".into(), value: json!("SD 1.5") }, + Op::Set { field: "baseModel".into(), value: json!("SDXL") }, + ], + }], + }]).unwrap(); + + let mut reader = WalReader::new(&wal_path, 0); + let result = reader.read_batch(100).unwrap(); + assert_eq!(result.entries.len(), 1); + + let entry = &result.entries[0]; + assert_eq!(entry.entity_id, 456); + match &entry.ops[0] { + Op::QueryOpSet { query, ops } => { + assert_eq!(query, "modelVersionIds eq 456"); + assert_eq!(ops.len(), 2); + } + _ => panic!("Expected QueryOpSet"), + } +} + +// ── Cursor Resume Integration ── + +#[test] +fn test_wal_cursor_resume_across_appends() { + let dir = TempDir::new().unwrap(); + let wal_path = dir.path().join("ops.wal"); + let writer = WalWriter::new(&wal_path); + + // Batch 1 + writer.append_batch(&[EntityOps { + entity_id: 1, + ops: vec![Op::Set { field: "a".into(), value: json!(1) }], + }]).unwrap(); + + // Read batch 1 + let mut reader = WalReader::new(&wal_path, 0); + let r1 = reader.read_batch(100).unwrap(); + assert_eq!(r1.entries.len(), 1); + let cursor = reader.cursor(); + + // Batch 2 (appended after first read) + writer.append_batch(&[EntityOps { + entity_id: 2, + ops: vec![Op::Set { field: "b".into(), value: json!(2) }], + }]).unwrap(); + + // Resume from cursor — should only get 
batch 2 + let mut reader2 = WalReader::new(&wal_path, cursor); + let r2 = reader2.read_batch(100).unwrap(); + assert_eq!(r2.entries.len(), 1); + assert_eq!(r2.entries[0].entity_id, 2); +} + +// ── Dump Registry Integration ── + +#[test] +fn test_dump_registry_full_workflow() { + let dir = TempDir::new().unwrap(); + let dumps_path = dir.path().join("dumps.json"); + + let mut reg = DumpRegistry::default(); + + // Simulate boot: check if dumps are complete + let image_hash = config_hash("table: Image\ntrack_fields: [nsfwLevel]"); + let tags_hash = config_hash("table: TagsOnImageNew\nfield: tagIds"); + let image_name = dump_name("Image", &image_hash); + let tags_name = dump_name("TagsOnImageNew", &tags_hash); + + assert!(!reg.is_complete(&image_name)); + assert!(!reg.is_complete(&tags_name)); + + // Register dumps + reg.register(image_name.clone(), Some("dumps/image.wal".into())); + reg.register(tags_name.clone(), Some("dumps/tags.wal".into())); + reg.save(&dumps_path).unwrap(); + + // Simulate pg-sync writing WAL and signaling loaded + reg.mark_loaded(&image_name, 107_500_000); + reg.mark_loaded(&tags_name, 375_000_000); + + // Simulate WAL reader completing + reg.mark_complete(&image_name, 107_500_000); + assert!(!reg.all_complete()); // Tags not done yet + + reg.mark_complete(&tags_name, 375_000_000); + assert!(reg.all_complete()); + + // Persist and reload + reg.save(&dumps_path).unwrap(); + let loaded = DumpRegistry::load(&dumps_path); + assert!(loaded.is_complete(&image_name)); + assert!(loaded.is_complete(&tags_name)); + assert!(loaded.all_complete()); +} + +#[test] +fn test_dump_config_change_detection() { + let hash1 = config_hash("table: Image\ntrack_fields: [nsfwLevel]"); + let hash2 = config_hash("table: Image\ntrack_fields: [nsfwLevel, type]"); + let name1 = dump_name("Image", &hash1); + let name2 = dump_name("Image", &hash2); + + let mut reg = DumpRegistry::default(); + reg.register(name1.clone(), None); + reg.mark_loaded(&name1, 100); + 
reg.mark_complete(&name1, 100); + + // After config change, the dump name is different + assert!(reg.is_complete(&name1)); + assert!(!reg.is_complete(&name2)); // New hash → not loaded → needs re-dump +} + +// ── Trigger Generation Integration ── + +#[test] +fn test_full_civitai_config() { + let yaml = r#" +sync_sources: + - table: Image + slot_field: id + sets_alive: true + track_fields: + - nsfwLevel + - type + - userId + - postId + - minor + - poi + - blockedFor + - "GREATEST({scannedAt}, {createdAt}) as existedAt" + - "({flags} & (1 << 13)) != 0 AND ({flags} & (1 << 2)) = 0 as hasMeta" + - "({flags} & (1 << 14)) != 0 as onSite" + on_delete: delete_slot + + - table: TagsOnImageNew + slot_field: imageId + field: tagIds + value_field: tagId + + - table: ImageTool + slot_field: imageId + field: toolIds + value_field: toolId + + - table: ImageTechnique + slot_field: imageId + field: techniqueIds + value_field: techniqueId + + - table: ModelVersion + query: "modelVersionIds eq {id}" + track_fields: [baseModel] + + - table: Post + query: "postId eq {id}" + track_fields: [publishedAt, availability] +"#; + + let config = SyncConfig::from_yaml(yaml).unwrap(); + assert_eq!(config.sync_sources.len(), 6); + + // Generate SQL for each and verify they're non-empty and contain expected patterns + for source in &config.sync_sources { + let sql = generate_trigger_sql(source); + assert!(sql.contains("CREATE OR REPLACE FUNCTION"), "Missing function for {}", source.table); + assert!(sql.contains("ENABLE ALWAYS"), "Missing ENABLE ALWAYS for {}", source.table); + + match source.table.as_str() { + "Image" => { + assert!(sql.contains("IS DISTINCT FROM"), "Image should use IS DISTINCT FROM"); + assert!(sql.contains("delete"), "Image should handle delete"); + assert!(sql.contains("GREATEST"), "Image should have existedAt expression"); + } + "TagsOnImageNew" => { + assert!(sql.contains("'add'"), "Tags should have add ops"); + assert!(sql.contains("'remove'"), "Tags should have remove 
ops"); + } + "ModelVersion" => { + assert!(sql.contains("queryOpSet"), "MV should use queryOpSet"); + assert!(sql.contains("modelVersionIds eq"), "MV should query by MV id"); + } + "Post" => { + assert!(sql.contains("queryOpSet"), "Post should use queryOpSet"); + assert!(sql.contains("postId eq"), "Post should query by postId"); + } + _ => {} + } + } +} + +// ── OpsBatch Serialization ── + +#[test] +fn test_ops_batch_json_format() { + let batch = OpsBatch { + ops: vec![ + EntityOps { + entity_id: 123, + ops: vec![ + Op::Remove { field: "nsfwLevel".into(), value: json!(8) }, + Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + ], + }, + EntityOps { + entity_id: 456, + ops: vec![Op::QueryOpSet { + query: "modelVersionIds eq 456".into(), + ops: vec![ + Op::Remove { field: "baseModel".into(), value: json!("SD 1.5") }, + Op::Set { field: "baseModel".into(), value: json!("SDXL") }, + ], + }], + }, + ], + meta: Some(SyncMeta { + source: "pg-sync-default".into(), + cursor: Some(420_000_000), + max_id: Some(500_000_000), + lag_rows: Some(80_000_000), + }), + }; + + // Round-trip through JSON + let json_str = serde_json::to_string(&batch).unwrap(); + let parsed: OpsBatch = serde_json::from_str(&json_str).unwrap(); + assert_eq!(parsed.ops.len(), 2); + assert_eq!(parsed.ops[0].entity_id, 123); + assert_eq!(parsed.ops[1].entity_id, 456); + let meta = parsed.meta.unwrap(); + assert_eq!(meta.source, "pg-sync-default"); + assert_eq!(meta.lag_rows, Some(80_000_000)); +} From 636f9783c37a5ac8a3a4906aba5411bad3fbbc65 Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:49:14 -0600 Subject: [PATCH 09/19] feat: wire WAL reader thread into server startup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Spawns a background thread that tails the ops WAL file, reads batches of up to 10K records, deduplicates via dedup_ops(), and applies mutations to the engine via apply_ops_batch(). Persists cursor to disk after each batch. 
Updates bitdex_sync_wal_bytes metric. This completes the full ops ingestion chain: POST /ops → WAL append + fsync → WAL reader thread → engine mutations The reader sleeps 50ms when no new records are available, and 1s when no index is loaded yet. Errors are logged and retried. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/server.rs | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/src/server.rs b/src/server.rs index 6e3b5c65..60913e32 100644 --- a/src/server.rs +++ b/src/server.rs @@ -1037,6 +1037,71 @@ impl BitdexServer { } } + // Spawn WAL reader thread if pg-sync feature is enabled and index exists + #[cfg(feature = "pg-sync")] + { + let wal_dir = self.data_dir.join("wal"); + let wal_path = wal_dir.join("ops.wal"); + let cursor_path = wal_dir.join("cursor"); + let wal_state = Arc::clone(&state); + std::thread::Builder::new() + .name("wal-reader".into()) + .spawn(move || { + let cursor = crate::ops_processor::load_cursor(&cursor_path); + let mut reader = crate::ops_wal::WalReader::new(&wal_path, cursor); + eprintln!("WAL reader started (cursor={cursor}, path={})", wal_path.display()); + + loop { + // Read a batch from the WAL + match reader.read_batch(10_000) { + Ok(batch) if !batch.entries.is_empty() => { + // Get engine reference + let engine = { + let guard = wal_state.index.lock(); + guard.as_ref().map(|idx| Arc::clone(&idx.engine)) + }; + + if let Some(engine) = engine { + let mut entries = batch.entries; + let (applied, skipped, errors) = + crate::ops_processor::apply_ops_batch(&engine, &mut entries); + + if applied > 0 || errors > 0 { + eprintln!( + "WAL reader: applied={applied} skipped={skipped} errors={errors} cursor={}", + reader.cursor() + ); + } + + // Persist cursor after successful processing + if let Err(e) = crate::ops_processor::save_cursor(&cursor_path, reader.cursor()) { + eprintln!("WAL reader: failed to save cursor: {e}"); + } + + // Update WAL bytes metric + let wal_size = 
std::fs::metadata(&wal_path).map(|m| m.len()).unwrap_or(0); + wal_state.metrics.sync_wal_bytes + .with_label_values(&["wal-reader"]) + .set(wal_size as i64); + } else { + // No index loaded yet — sleep and retry + std::thread::sleep(std::time::Duration::from_secs(1)); + } + } + Ok(_) => { + // No new records — sleep briefly + std::thread::sleep(std::time::Duration::from_millis(50)); + } + Err(e) => { + eprintln!("WAL reader error: {e}"); + std::thread::sleep(std::time::Duration::from_secs(1)); + } + } + } + }) + .ok(); + } + let shutdown_state = Arc::clone(&state); // Admin routes — require Bearer token (or disabled if no token configured) From 305b15634a2ea6f4e44569e5a906f32a268f719b Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 18:51:51 -0600 Subject: [PATCH 10/19] =?UTF-8?q?feat:=20CSV=E2=86=92ops=20adapter=20+=20W?= =?UTF-8?q?AL=20reader=20thread=20wiring?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CSV dump adapter (csv_ops.rs): - images_csv_to_wal(): parses images.csv, converts each row to set ops (nsfwLevel, type, userId, postId, hasMeta, onSite, minor, poi, existedAt, blockedFor) - tags_csv_to_wal(), tools_csv_to_wal(): multi-value CSV → add ops - run_csv_dump(): orchestrates full CSV dump with optional row limits - Supports batch writing to WAL with configurable batch size - Limited variants for validation testing with subsets WAL reader thread (server.rs): - Spawned on server startup, tails ops.wal, reads batches of 10K - Deduplicates and applies via apply_ops_batch() - Persists cursor to disk, updates WAL bytes metric - Completes the full chain: POST /ops → WAL → reader → engine 2 new tests + previous tests still passing. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/pg_sync/csv_ops.rs | 406 +++++++++++++++++++++++++++++++++++++++++ src/pg_sync/mod.rs | 1 + 2 files changed, 407 insertions(+) create mode 100644 src/pg_sync/csv_ops.rs diff --git a/src/pg_sync/csv_ops.rs b/src/pg_sync/csv_ops.rs new file mode 100644 index 00000000..778441a0 --- /dev/null +++ b/src/pg_sync/csv_ops.rs @@ -0,0 +1,406 @@ +//! CSV→ops adapter for the dump pipeline. +//! +//! Reads existing CSV files (from PG COPY or local dumps) and transforms +//! each row into ops using the sync config schema. Writes ops to WAL files +//! for processing by the WAL reader thread. +//! +//! This is the local testing path and also the production dump path when +//! CSVs are pre-fetched to disk. + +use std::fs::File; +use std::io::{BufRead, BufReader}; +use std::path::Path; +use std::time::Instant; + +use serde_json::json; + +use super::copy_queries::{parse_image_row, parse_tag_row, parse_tool_row, CopyImageRow}; +use super::ops::{EntityOps, Op}; +use crate::ops_wal::WalWriter; + +/// Stats from a CSV→WAL conversion. +#[derive(Debug, Default)] +pub struct CsvOpsStats { + pub rows_read: u64, + pub rows_skipped: u64, + pub ops_written: u64, + pub bytes_written: u64, + pub elapsed_secs: f64, +} + +/// Convert images.csv to ops and write to WAL. +/// Each image row produces set ops for all tracked scalar fields. 
+pub fn images_csv_to_wal(csv_path: &Path, writer: &WalWriter, batch_size: usize) -> std::io::Result { + let start = Instant::now(); + let file = File::open(csv_path)?; + let reader = BufReader::with_capacity(8 * 1024 * 1024, file); + let mut stats = CsvOpsStats::default(); + let mut batch: Vec = Vec::with_capacity(batch_size); + + for line in reader.split(b'\n') { + let line = line?; + if line.is_empty() { + continue; + } + + let row = match parse_image_row(&line) { + Some(r) => r, + None => { + stats.rows_skipped += 1; + continue; + } + }; + stats.rows_read += 1; + + let ops = image_row_to_ops(&row); + batch.push(EntityOps { + entity_id: row.id, + ops, + }); + + if batch.len() >= batch_size { + let bytes = writer.append_batch(&batch)?; + stats.ops_written += batch.len() as u64; + stats.bytes_written += bytes; + batch.clear(); + } + } + + // Flush remaining + if !batch.is_empty() { + let bytes = writer.append_batch(&batch)?; + stats.ops_written += batch.len() as u64; + stats.bytes_written += bytes; + } + + stats.elapsed_secs = start.elapsed().as_secs_f64(); + Ok(stats) +} + +/// Convert a single image CSV row to ops. 
+fn image_row_to_ops(row: &CopyImageRow) -> Vec { + let mut ops = Vec::with_capacity(12); + + ops.push(Op::Set { field: "nsfwLevel".into(), value: json!(row.nsfw_level) }); + ops.push(Op::Set { field: "type".into(), value: json!(row.image_type) }); + ops.push(Op::Set { field: "userId".into(), value: json!(row.user_id) }); + + if let Some(post_id) = row.post_id { + ops.push(Op::Set { field: "postId".into(), value: json!(post_id) }); + } + + // hasMeta and onSite from flags + let has_meta = row.has_meta(); + let on_site = row.on_site(); + ops.push(Op::Set { field: "hasMeta".into(), value: json!(has_meta) }); + ops.push(Op::Set { field: "onSite".into(), value: json!(on_site) }); + + // Minor and POI + let minor = row.minor(); + let poi = row.poi(); + ops.push(Op::Set { field: "minor".into(), value: json!(minor) }); + ops.push(Op::Set { field: "poi".into(), value: json!(poi) }); + + // existedAt = GREATEST(scannedAt, createdAt) in seconds + let existed_at = match (row.scanned_at_secs, row.created_at_secs) { + (Some(s), Some(c)) => s.max(c), + (Some(s), None) => s, + (None, Some(c)) => c, + (None, None) => 0, + }; + ops.push(Op::Set { field: "existedAt".into(), value: json!(existed_at) }); + + // blockedFor + if let Some(ref bf) = row.blocked_for { + ops.push(Op::Set { field: "blockedFor".into(), value: json!(bf) }); + } + + ops +} + +/// Convert tags.csv to add ops and write to WAL. +/// Each row: (tag_id, image_id) → add tagIds op on the image. +pub fn tags_csv_to_wal(csv_path: &Path, writer: &WalWriter, batch_size: usize) -> std::io::Result { + multi_value_csv_to_wal(csv_path, writer, batch_size, "tagIds", |line| { + // tags.csv: tag_id, image_id + parse_tag_row(line).map(|(tag_id, image_id)| (image_id, tag_id)) + }) +} + +/// Convert tools.csv to add ops and write to WAL. 
+pub fn tools_csv_to_wal(csv_path: &Path, writer: &WalWriter, batch_size: usize) -> std::io::Result { + multi_value_csv_to_wal(csv_path, writer, batch_size, "toolIds", |line| { + parse_tool_row(line).map(|(tool_id, image_id)| (image_id, tool_id)) + }) +} + +/// Generic multi-value CSV→WAL converter. +/// Parser returns (slot_id, value) pairs. +fn multi_value_csv_to_wal( + csv_path: &Path, + writer: &WalWriter, + batch_size: usize, + field_name: &str, + parser: impl Fn(&[u8]) -> Option<(i64, i64)>, +) -> std::io::Result { + let start = Instant::now(); + let file = File::open(csv_path)?; + let reader = BufReader::with_capacity(8 * 1024 * 1024, file); + let mut stats = CsvOpsStats::default(); + let mut batch: Vec = Vec::with_capacity(batch_size); + + for line in reader.split(b'\n') { + let line = line?; + if line.is_empty() { + continue; + } + + let (slot_id, value) = match parser(&line) { + Some(pair) => pair, + None => { + stats.rows_skipped += 1; + continue; + } + }; + stats.rows_read += 1; + + batch.push(EntityOps { + entity_id: slot_id, + ops: vec![Op::Add { + field: field_name.to_string(), + value: json!(value), + }], + }); + + if batch.len() >= batch_size { + let bytes = writer.append_batch(&batch)?; + stats.ops_written += batch.len() as u64; + stats.bytes_written += bytes; + batch.clear(); + } + } + + if !batch.is_empty() { + let bytes = writer.append_batch(&batch)?; + stats.ops_written += batch.len() as u64; + stats.bytes_written += bytes; + } + + stats.elapsed_secs = start.elapsed().as_secs_f64(); + Ok(stats) +} + +/// Run the full CSV dump pipeline: read all CSVs, convert to ops, write to WAL. +/// Returns per-table stats. 
+pub fn run_csv_dump( + csv_dir: &Path, + wal_path: &Path, + batch_size: usize, + limit: Option, +) -> std::io::Result> { + let writer = WalWriter::new(wal_path); + let mut results = Vec::new(); + + // Phase 1: Images (must be first — sets alive + scalar fields) + let images_csv = csv_dir.join("images.csv"); + if images_csv.exists() { + eprintln!("CSV dump: loading images.csv..."); + let stats = if let Some(max) = limit { + images_csv_to_wal_limited(&images_csv, &writer, batch_size, max)? + } else { + images_csv_to_wal(&images_csv, &writer, batch_size)? + }; + eprintln!( + " images: {} rows, {} ops, {:.1}s ({:.0}/s)", + stats.rows_read, stats.ops_written, stats.elapsed_secs, + stats.rows_read as f64 / stats.elapsed_secs.max(0.001) + ); + results.push(("images".into(), stats)); + } + + // Phase 2: Multi-value tables (parallel-safe, but sequential here for simplicity) + let tags_csv = csv_dir.join("tags.csv"); + if tags_csv.exists() { + eprintln!("CSV dump: loading tags.csv..."); + let stats = if let Some(max) = limit { + multi_value_csv_to_wal_limited(&tags_csv, &writer, batch_size, "tagIds", max, |line| { + parse_tag_row(line).map(|(tag_id, image_id)| (image_id, tag_id)) + })? + } else { + tags_csv_to_wal(&tags_csv, &writer, batch_size)? + }; + eprintln!( + " tags: {} rows, {} ops, {:.1}s ({:.0}/s)", + stats.rows_read, stats.ops_written, stats.elapsed_secs, + stats.rows_read as f64 / stats.elapsed_secs.max(0.001) + ); + results.push(("tags".into(), stats)); + } + + let tools_csv = csv_dir.join("tools.csv"); + if tools_csv.exists() { + eprintln!("CSV dump: loading tools.csv..."); + let stats = if let Some(max) = limit { + multi_value_csv_to_wal_limited(&tools_csv, &writer, batch_size, "toolIds", max, |line| { + parse_tool_row(line).map(|(tool_id, image_id)| (image_id, tool_id)) + })? + } else { + tools_csv_to_wal(&tools_csv, &writer, batch_size)? 
+ }; + eprintln!( + " tools: {} rows, {} ops, {:.1}s ({:.0}/s)", + stats.rows_read, stats.ops_written, stats.elapsed_secs, + stats.rows_read as f64 / stats.elapsed_secs.max(0.001) + ); + results.push(("tools".into(), stats)); + } + + Ok(results) +} + +/// Limited version of images_csv_to_wal — stops after `limit` rows. +fn images_csv_to_wal_limited(csv_path: &Path, writer: &WalWriter, batch_size: usize, limit: u64) -> std::io::Result { + let start = Instant::now(); + let file = File::open(csv_path)?; + let reader = BufReader::with_capacity(8 * 1024 * 1024, file); + let mut stats = CsvOpsStats::default(); + let mut batch: Vec = Vec::with_capacity(batch_size); + + for line in reader.split(b'\n') { + if stats.rows_read >= limit { + break; + } + let line = line?; + if line.is_empty() { continue; } + let row = match parse_image_row(&line) { + Some(r) => r, + None => { stats.rows_skipped += 1; continue; } + }; + stats.rows_read += 1; + batch.push(EntityOps { entity_id: row.id, ops: image_row_to_ops(&row) }); + if batch.len() >= batch_size { + let bytes = writer.append_batch(&batch)?; + stats.ops_written += batch.len() as u64; + stats.bytes_written += bytes; + batch.clear(); + } + } + if !batch.is_empty() { + let bytes = writer.append_batch(&batch)?; + stats.ops_written += batch.len() as u64; + stats.bytes_written += bytes; + } + stats.elapsed_secs = start.elapsed().as_secs_f64(); + Ok(stats) +} + +/// Limited version of multi_value_csv_to_wal. 
+fn multi_value_csv_to_wal_limited( + csv_path: &Path, + writer: &WalWriter, + batch_size: usize, + field_name: &str, + limit: u64, + parser: impl Fn(&[u8]) -> Option<(i64, i64)>, +) -> std::io::Result { + let start = Instant::now(); + let file = File::open(csv_path)?; + let reader = BufReader::with_capacity(8 * 1024 * 1024, file); + let mut stats = CsvOpsStats::default(); + let mut batch: Vec = Vec::with_capacity(batch_size); + + for line in reader.split(b'\n') { + if stats.rows_read >= limit { + break; + } + let line = line?; + if line.is_empty() { continue; } + let (slot_id, value) = match parser(&line) { + Some(pair) => pair, + None => { stats.rows_skipped += 1; continue; } + }; + stats.rows_read += 1; + batch.push(EntityOps { + entity_id: slot_id, + ops: vec![Op::Add { field: field_name.to_string(), value: json!(value) }], + }); + if batch.len() >= batch_size { + let bytes = writer.append_batch(&batch)?; + stats.ops_written += batch.len() as u64; + stats.bytes_written += bytes; + batch.clear(); + } + } + if !batch.is_empty() { + let bytes = writer.append_batch(&batch)?; + stats.ops_written += batch.len() as u64; + stats.bytes_written += bytes; + } + stats.elapsed_secs = start.elapsed().as_secs_f64(); + Ok(stats) +} + +#[cfg(test)] +mod tests { + use super::*; + use tempfile::TempDir; + + #[test] + fn test_image_row_to_ops() { + let row = CopyImageRow { + id: 1, + url: Some("test.jpg".into()), + nsfw_level: 16, + hash: None, + flags: (1 << 13), // hasMeta=true + image_type: "image".into(), + user_id: 42, + blocked_for: None, + scanned_at_secs: Some(1000), + created_at_secs: Some(2000), + post_id: Some(100), + width: None, + height: None, + published_at_secs: None, + availability: String::new(), + posted_to_id: None, + }; + let ops = image_row_to_ops(&row); + // Should have: nsfwLevel, type, userId, postId, hasMeta, onSite, minor, poi, existedAt + assert!(ops.len() >= 9); + + // Check nsfwLevel + let nsfw = ops.iter().find(|o| matches!(o, Op::Set { field, .. 
} if field == "nsfwLevel")).unwrap(); + if let Op::Set { value, .. } = nsfw { assert_eq!(*value, json!(16)); } + + // Check existedAt = max(1000, 2000) = 2000 + let existed = ops.iter().find(|o| matches!(o, Op::Set { field, .. } if field == "existedAt")).unwrap(); + if let Op::Set { value, .. } = existed { assert_eq!(*value, json!(2000)); } + + // Check hasMeta (flags bit 13 set) + let has_meta = ops.iter().find(|o| matches!(o, Op::Set { field, .. } if field == "hasMeta")).unwrap(); + if let Op::Set { value, .. } = has_meta { assert_eq!(*value, json!(true)); } + } + + #[test] + fn test_csv_to_wal_roundtrip() { + let dir = TempDir::new().unwrap(); + let csv_path = dir.path().join("images.csv"); + let wal_path = dir.path().join("ops.wal"); + + // Write a tiny CSV (comma-separated, matching PG COPY CSV format) + std::fs::write(&csv_path, b"1,http://img.jpg,16,,8192,image,42,,1000,2000,100\n2,,1,,0,video,99,,500,600,200\n").unwrap(); + + let stats = images_csv_to_wal(&csv_path, &WalWriter::new(&wal_path), 100).unwrap(); + assert_eq!(stats.rows_read, 2); + assert_eq!(stats.ops_written, 2); + assert!(stats.bytes_written > 0); + + // Read back from WAL + let mut reader = crate::ops_wal::WalReader::new(&wal_path, 0); + let batch = reader.read_batch(100).unwrap(); + assert_eq!(batch.entries.len(), 2); + assert_eq!(batch.entries[0].entity_id, 1); + assert_eq!(batch.entries[1].entity_id, 2); + } +} diff --git a/src/pg_sync/mod.rs b/src/pg_sync/mod.rs index 23abf1f0..d1cbcff7 100644 --- a/src/pg_sync/mod.rs +++ b/src/pg_sync/mod.rs @@ -11,6 +11,7 @@ pub mod bulk_loader; pub mod config; pub mod copy_queries; pub mod copy_streams; +pub mod csv_ops; pub mod dump; pub mod metrics_poller; pub mod op_dedup; From fd2155374af80e3159e16d92a227be8715617b35 Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 19:49:04 -0600 Subject: [PATCH 11/19] =?UTF-8?q?feat:=20rewrite=20ops=5Fprocessor=20?= =?UTF-8?q?=E2=80=94=20BitmapSink=20dual-path=20+=20direct=20dump?= MIME-Version: 
1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrites ops_processor.rs per the Sync V2 design doc: Steady-state path: ops → BitmapSink (CoalescerSink) → coalescer channel. No more engine.put() — ops translate directly to FilterInsert/FilterRemove/ SortSet/SortClear/AliveInsert/AliveRemove via the existing mutation helpers (value_to_bitmap_key, value_to_sort_u32). Dump path: ops → BitmapSink (AccumSink) → BitmapAccum → apply_accum(). Bypasses coalescer, snapshot publishing, and cache invalidation entirely. process_csv_dump_direct() goes CSV → ops → AccumSink in one pass. Key changes: - FieldMeta: precomputed field metadata from Config (filter/sort field types) - creates_slot flag on EntityOps (persisted in WAL binary header) - apply_accum() on ConcurrentEngine for direct staging merge - mutation_sender() exposed on ConcurrentEngine for CoalescerSink - WAL format updated: 1-byte flags field after entity_id - Dedup preserves creates_slot via OR across merged sources - Validation harness supports --direct, --steady-state, WAL dump modes Benchmarks at 1M scale: - Direct dump: 367K images/s (beats 345K/s single-pass baseline) - WAL dump: 41K ops/s - Steady-state: 2.7K ops/s (expected — per-op channel overhead) 14 unit tests + 9 integration tests passing. CSV validation PASS at 10K, 100K, 1M with zero errors. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/validate_ops_pipeline.rs | 290 ++++++++ src/concurrent_engine.rs | 45 ++ src/ops_processor.rs | 1047 +++++++++++++++++++++++------ src/ops_wal.rs | 27 +- src/pg_sync/csv_ops.rs | 10 +- src/pg_sync/op_dedup.rs | 15 +- src/pg_sync/ops.rs | 18 + src/pg_sync/ops_poller.rs | 1 + src/server.rs | 9 +- tests/sync_v2_integration.rs | 12 + 10 files changed, 1270 insertions(+), 204 deletions(-) create mode 100644 examples/validate_ops_pipeline.rs diff --git a/examples/validate_ops_pipeline.rs b/examples/validate_ops_pipeline.rs new file mode 100644 index 00000000..a16b235d --- /dev/null +++ b/examples/validate_ops_pipeline.rs @@ -0,0 +1,290 @@ +//! Validation harness for the Sync V2 ops pipeline. +//! +//! Tests both processing modes: +//! - **Dump mode** (default): AccumSink → direct bitmap accumulation (bulk loading path) +//! - **Steady-state mode** (--steady-state): CoalescerSink → coalescer channel (online path) +//! +//! Usage: +//! cargo run --example validate_ops_pipeline --features pg-sync -- \ +//! --csv-dir C:\Dev\Repos\open-source\bitdex-v2\data\load_stage \ +//! --limit 100000 +//! +//! # Steady-state mode (slower, tests the online write path): +//! cargo run --example validate_ops_pipeline --features pg-sync -- \ +//! 
--csv-dir data/load_stage --limit 10000 --steady-state + +use std::path::PathBuf; +use std::time::Instant; + +use bitdex_v2::concurrent_engine::ConcurrentEngine; +use bitdex_v2::config::Config; +use bitdex_v2::ops_processor::{apply_ops_batch, process_csv_dump_direct, process_wal_dump, FieldMeta}; +use bitdex_v2::ops_wal::WalReader; +use bitdex_v2::pg_sync::csv_ops::run_csv_dump; + +fn main() { + let args: Vec = std::env::args().collect(); + let csv_dir = get_arg(&args, "--csv-dir").unwrap_or_else(|| "data/load_stage".into()); + let limit: u64 = get_arg(&args, "--limit") + .map(|s| s.parse().unwrap_or(100_000)) + .unwrap_or(100_000); + let steady_state = args.iter().any(|a| a == "--steady-state"); + let direct = args.iter().any(|a| a == "--direct"); + + eprintln!("=== Sync V2 Pipeline Validation ==="); + eprintln!("CSV dir: {csv_dir}"); + eprintln!("Row limit: {limit}"); + let mode_str = if direct { "direct (CSV → AccumSink, no WAL)" } + else if steady_state { "steady-state (CoalescerSink)" } + else { "dump (WAL → AccumSink)" }; + eprintln!("Mode: {mode_str}"); + + // Create a temp directory for this validation run + let temp_dir = tempfile::TempDir::new().expect("Failed to create temp dir"); + let data_dir = temp_dir.path(); + let wal_path = data_dir.join("ops.wal"); + let bitmap_dir = data_dir.join("bitmaps"); + let docs_dir = data_dir.join("docs"); + std::fs::create_dir_all(&bitmap_dir).ok(); + std::fs::create_dir_all(&docs_dir).ok(); + + // Phase 1: CSV → WAL + eprintln!("\n--- Phase 1: CSV → WAL ---"); + let csv_start = Instant::now(); + let csv_results = run_csv_dump( + &PathBuf::from(&csv_dir), + &wal_path, + 10_000, + Some(limit), + ) + .expect("CSV dump failed"); + + let csv_elapsed = csv_start.elapsed(); + let total_ops: u64 = csv_results.iter().map(|(_, s)| s.ops_written).sum(); + let total_rows: u64 = csv_results.iter().map(|(_, s)| s.rows_read).sum(); + eprintln!("\nCSV → WAL complete:"); + eprintln!(" Total rows: {total_rows}"); + eprintln!(" Total 
ops: {total_ops}"); + eprintln!(" Time: {:.2}s", csv_elapsed.as_secs_f64()); + eprintln!(" Throughput: {:.0} rows/s", total_rows as f64 / csv_elapsed.as_secs_f64().max(0.001)); + + // Phase 2: WAL → Engine + eprintln!("\n--- Phase 2: WAL → Engine ---"); + + // Load index config + let config_path = PathBuf::from(&csv_dir).parent().unwrap().join("indexes/civitai/config.json"); + let alt_config = PathBuf::from("data/indexes/civitai/config.json"); + let config_path = if config_path.exists() { + config_path + } else if alt_config.exists() { + alt_config + } else { + eprintln!("ERROR: Could not find config.json. Skipping engine validation."); + print_summary(&csv_results, None, None); + return; + }; + + let config_str = std::fs::read_to_string(&config_path).expect("Failed to read config.json"); + let index_def: serde_json::Value = serde_json::from_str(&config_str).expect("Failed to parse config.json"); + let config: Config = serde_json::from_value(index_def["config"].clone()).expect("Failed to parse engine config"); + + let meta = FieldMeta::from_config(&config); + + let mut engine_config = config.clone(); + engine_config.storage.bitmap_path = Some(bitmap_dir.clone()); + + if direct { + run_direct_mode(&engine_config, &docs_dir, &PathBuf::from(&csv_dir), limit, &csv_results); + } else if steady_state { + run_steady_state(&engine_config, &docs_dir, &wal_path, &meta, &csv_results, total_rows); + } else { + run_dump_mode(&engine_config, &config, &docs_dir, &wal_path, &meta, &csv_results, total_rows); + } +} + +fn run_direct_mode( + engine_config: &Config, + docs_dir: &std::path::Path, + csv_dir: &std::path::Path, + limit: u64, + csv_results: &[(String, bitdex_v2::pg_sync::csv_ops::CsvOpsStats)], +) { + let mut cfg = engine_config.clone(); + cfg.headless = true; + let engine = ConcurrentEngine::new_with_path(cfg, docs_dir) + .expect("Failed to create engine"); + + eprintln!(" Processing CSV directly (no WAL)..."); + let (total_applied, total_errors, elapsed) = + 
process_csv_dump_direct(&engine, csv_dir, 10_000, Some(limit)); + let alive = engine.alive_count(); + + eprintln!("\nDirect dump complete:"); + eprintln!(" Ops applied: {total_applied}"); + eprintln!(" Errors: {total_errors}"); + eprintln!(" Alive count: {alive}"); + eprintln!(" Time: {:.2}s", elapsed); + eprintln!(" Throughput: {:.0} ops/s", total_applied as f64 / elapsed.max(0.001)); + + validate_and_summarize(csv_results, alive, total_errors, 0); +} + +fn run_dump_mode( + engine_config: &Config, + _config: &Config, + docs_dir: &std::path::Path, + wal_path: &std::path::Path, + _meta: &FieldMeta, + csv_results: &[(String, bitdex_v2::pg_sync::csv_ops::CsvOpsStats)], + total_rows: u64, +) { + // Headless is fine for dump mode — we apply directly to staging, no flush thread needed + let mut cfg = engine_config.clone(); + cfg.headless = true; + let engine = ConcurrentEngine::new_with_path(cfg, docs_dir) + .expect("Failed to create engine"); + + eprintln!(" Processing WAL via dump mode (AccumSink)..."); + let (total_applied, total_errors, elapsed) = process_wal_dump(&engine, wal_path, 10_000); + let alive = engine.alive_count(); + + eprintln!("\nWAL → Engine complete (dump mode):"); + eprintln!(" Ops applied: {total_applied}"); + eprintln!(" Errors: {total_errors}"); + eprintln!(" Alive count: {alive}"); + eprintln!(" Time: {:.2}s", elapsed); + eprintln!(" Throughput: {:.0} ops/s", total_applied as f64 / elapsed.max(0.001)); + + validate_and_summarize(csv_results, alive, total_errors, total_rows); +} + +fn run_steady_state( + engine_config: &Config, + docs_dir: &std::path::Path, + wal_path: &std::path::Path, + meta: &FieldMeta, + csv_results: &[(String, bitdex_v2::pg_sync::csv_ops::CsvOpsStats)], + total_rows: u64, +) { + use bitdex_v2::ingester::CoalescerSink; + + // Non-headless — need flush thread to drain coalescer + let mut cfg = engine_config.clone(); + cfg.headless = false; + let engine = ConcurrentEngine::new_with_path(cfg, docs_dir) + .expect("Failed to create 
engine"); + + let wal_start = Instant::now(); + let mut reader = WalReader::new(wal_path, 0); + let mut total_applied = 0u64; + let mut total_errors = 0u64; + + loop { + let batch = reader.read_batch(10_000).expect("WAL read failed"); + if batch.entries.is_empty() { + break; + } + let sender = engine.mutation_sender(); + let mut sink = CoalescerSink::new(sender); + let mut entries = batch.entries; + let (applied, _skipped, errors) = apply_ops_batch( + &mut sink, meta, &mut entries, Some(&engine), + ); + total_applied += applied as u64; + total_errors += errors as u64; + } + + // Wait for flush thread to drain + eprintln!(" Waiting for flush thread to drain..."); + let drain_start = Instant::now(); + loop { + let pending = engine.flush_queue_depth(); + if pending == 0 { break; } + if drain_start.elapsed().as_secs() > 30 { + eprintln!(" WARN: flush thread still has {pending} pending ops after 30s"); + break; + } + std::thread::sleep(std::time::Duration::from_millis(50)); + } + eprintln!(" Flush drain: {:.1}s", drain_start.elapsed().as_secs_f64()); + + let wal_elapsed = wal_start.elapsed(); + let alive = engine.alive_count(); + + eprintln!("\nWAL → Engine complete (steady-state):"); + eprintln!(" Ops applied: {total_applied}"); + eprintln!(" Errors: {total_errors}"); + eprintln!(" Alive count: {alive}"); + eprintln!(" Time: {:.2}s", wal_elapsed.as_secs_f64()); + eprintln!(" Throughput: {:.0} ops/s", total_applied as f64 / wal_elapsed.as_secs_f64().max(0.001)); + + validate_and_summarize(csv_results, alive, total_errors, total_rows); +} + +fn validate_and_summarize( + csv_results: &[(String, bitdex_v2::pg_sync::csv_ops::CsvOpsStats)], + alive: u64, + total_errors: u64, + total_rows: u64, +) { + eprintln!("\n--- Phase 3: Validation ---"); + let mut pass = true; + + if alive == 0 { + eprintln!(" FAIL: alive count is 0 — no documents loaded"); + pass = false; + } else { + eprintln!(" PASS: alive count = {alive}"); + } + + if total_errors > 0 { + eprintln!(" WARN: 
{total_errors} errors during ops application"); + } else { + eprintln!(" PASS: zero errors"); + } + + // Images make up a fraction of total rows + let image_rows = csv_results.iter() + .find(|(name, _)| name == "images") + .map(|(_, s)| s.rows_read) + .unwrap_or(0); + let expected_min = (image_rows as f64 * 0.8) as u64; + if alive < expected_min { + eprintln!(" WARN: alive ({alive}) < 80% of image rows ({image_rows})"); + } + + print_summary(csv_results, Some(alive), Some(pass)); +} + +fn print_summary( + csv_results: &[(String, bitdex_v2::pg_sync::csv_ops::CsvOpsStats)], + alive: Option, + pass: Option, +) { + eprintln!("\n=== Summary ==="); + eprintln!("Table | Rows | Ops | Time | Rows/s"); + eprintln!("---------------|-----------|-----------|---------|--------"); + for (table, stats) in csv_results { + eprintln!( + "{:14} | {:>9} | {:>9} | {:>5.1}s | {:>7.0}", + table, + stats.rows_read, + stats.ops_written, + stats.elapsed_secs, + stats.rows_read as f64 / stats.elapsed_secs.max(0.001) + ); + } + if let Some(alive) = alive { + eprintln!("\nAlive count: {alive}"); + } + if let Some(pass) = pass { + eprintln!("Result: {}", if pass { "PASS" } else { "FAIL" }); + } +} + +fn get_arg(args: &[String], flag: &str) -> Option { + args.iter() + .position(|a| a == flag) + .and_then(|i| args.get(i + 1)) + .cloned() +} diff --git a/src/concurrent_engine.rs b/src/concurrent_engine.rs index 45cd0878..fbe2c03a 100644 --- a/src/concurrent_engine.rs +++ b/src/concurrent_engine.rs @@ -6067,6 +6067,12 @@ impl ConcurrentEngine { &self.config } + /// Get a cloneable MutationSender for submitting ops to the coalescer channel. + /// Used by the WAL reader thread to send ops via CoalescerSink. + pub fn mutation_sender(&self) -> MutationSender { + self.sender.clone() + } + /// Get a reference to the BitmapFs store, if configured. 
pub fn bitmap_store(&self) -> Option<&Arc> { self.bitmap_store.as_ref() @@ -6483,6 +6489,45 @@ impl ConcurrentEngine { total_count } + /// Apply a BitmapAccum's accumulated bitmaps directly to staging. + /// + /// Used by the dump pipeline (Sync V2) to apply ops-derived bitmaps + /// without going through the coalescer channel. Must be called while + /// in loading mode (enter_loading_mode → apply_accum → exit_loading_mode). + /// + /// ORs filter bitmaps, sort layer bitmaps, and alive bitmap into staging. + pub fn apply_accum(&self, accum: &crate::loader::BitmapAccum) { + let snap = self.inner.load_full(); + // Clone staging for mutation — loading mode means no snapshot publishing overhead + let mut staging = (*snap).clone(); + + // Apply filter bitmaps + for (field_name, value_map) in &accum.filter_maps { + if let Some(field) = staging.filters.get_field_mut(field_name) { + for (&value, bitmap) in value_map { + field.or_bitmap(value, bitmap); + } + } + } + + // Apply sort layer bitmaps + for (field_name, layer_map) in &accum.sort_maps { + if let Some(field) = staging.sorts.get_field_mut(field_name) { + for (&bit_layer, bitmap) in layer_map { + field.or_layer(bit_layer, bitmap); + } + } + } + + // Apply alive bitmap + staging.slots.alive_or_bitmap(&accum.alive); + + // alive_or_bitmap already updates the slot counter to max(alive) + 1 + + // Store back — in loading mode, flush thread won't publish intermediate snapshots + self.inner.store(Arc::new(staging)); + } + /// Build all bitmap indexes from the docstore. /// /// Designed for "build index" boot mode: starts from bare docs on disk, diff --git a/src/ops_processor.rs b/src/ops_processor.rs index cc473fd3..e07202ee 100644 --- a/src/ops_processor.rs +++ b/src/ops_processor.rs @@ -1,25 +1,34 @@ -//! WAL ops processor — reads ops from WAL files and applies them as engine mutations. +//! WAL ops processor — translates ops from WAL files into bitmap mutations. //! -//! 
The processor runs as a dedicated thread, tailing WAL files and converting ops -//! into engine mutations (put/patch/delete). It handles: -//! - Regular ops (set/remove/add) via PatchPayload -//! - queryOpSet via query resolution + bulk bitmap ops -//! - Delete via engine.delete() -//! - Deduplication via shared dedup helper +//! Two processing modes per the Sync V2 design: +//! +//! - **Steady-state**: Ops → BitmapSink (CoalescerSink) → coalescer channel → flush thread. +//! Used by the WAL reader thread during normal operation. +//! +//! - **Dump mode**: Ops → BitmapSink (AccumSink) → direct bitmap accumulation. +//! Used during initial load. Bypasses coalescer, snapshot publishing, and cache. +//! +//! Both paths use the same `process_entity_ops()` core that translates Op variants +//! into BitmapSink calls using the engine Config for field awareness and +//! `value_to_bitmap_key()` / `value_to_sort_u32()` for value conversion. use std::collections::HashMap; use std::path::{Path, PathBuf}; +use std::sync::Arc; use std::time::Duration; use serde_json::Value as JsonValue; use crate::concurrent_engine::ConcurrentEngine; -use crate::mutation::{FieldValue, PatchField, PatchPayload}; +use crate::config::Config; +use crate::filter::FilterFieldType; +use crate::ingester::BitmapSink; +use crate::mutation::{value_to_bitmap_key, value_to_sort_u32, FieldRegistry}; use crate::pg_sync::op_dedup::dedup_ops; use crate::pg_sync::ops::{EntityOps, Op}; use crate::query::{BitdexQuery, FilterClause, Value as QValue}; -/// Convert a serde_json::Value to a query::Value. +/// Convert a serde_json::Value to a query::Value for bitmap key conversion. 
fn json_to_qvalue(v: &JsonValue) -> QValue { match v { JsonValue::Number(n) => { @@ -33,8 +42,8 @@ fn json_to_qvalue(v: &JsonValue) -> QValue { } JsonValue::Bool(b) => QValue::Bool(*b), JsonValue::String(s) => QValue::String(s.clone()), - JsonValue::Null => QValue::Integer(0), // Null → zero for bitmap purposes - _ => QValue::String(v.to_string()), // Arrays/objects → string representation + JsonValue::Null => QValue::Integer(0), + _ => QValue::String(v.to_string()), } } @@ -58,13 +67,59 @@ impl Default for OpsProcessorConfig { } } -/// Process a single batch of entity ops against the engine. +/// Precomputed field metadata from Config, used during ops processing. +/// Built once, reused across all batches. +pub struct FieldMeta { + /// Filter field name → (Arc, FilterFieldType) + filter_fields: HashMap, FilterFieldType)>, + /// Sort field name → (Arc, num_bits) + sort_fields: HashMap, usize)>, + /// Field registry for Arc interning (kept for future DocSink use) + #[allow(dead_code)] + registry: FieldRegistry, +} + +impl FieldMeta { + /// Build FieldMeta from engine config. + pub fn from_config(config: &Config) -> Self { + let registry = FieldRegistry::from_config(config); + let mut filter_fields = HashMap::new(); + for fc in &config.filter_fields { + filter_fields.insert( + fc.name.clone(), + (registry.get(&fc.name), fc.field_type.clone()), + ); + } + let mut sort_fields = HashMap::new(); + for sc in &config.sort_fields { + sort_fields.insert( + sc.name.clone(), + (registry.get(&sc.name), sc.bits as usize), + ); + } + Self { + filter_fields, + sort_fields, + registry, + } + } +} + +/// Process a batch of entity ops, translating them into BitmapSink calls. +/// +/// This is the core function used by both steady-state (CoalescerSink) and +/// dump (AccumSink) paths. The sink determines where mutations go. +/// +/// For queryOpSet resolution, an engine reference is needed to execute queries. 
+/// Pass `None` during dump mode (queryOpSets are only used in steady-state). +/// /// Returns (applied, skipped, errors). -pub fn apply_ops_batch( - engine: &ConcurrentEngine, +pub fn apply_ops_batch( + sink: &mut S, + meta: &FieldMeta, batch: &mut Vec, + engine: Option<&ConcurrentEngine>, ) -> (usize, usize, usize) { - // Dedup first dedup_ops(batch); let mut applied = 0usize; @@ -79,155 +134,208 @@ pub fn apply_ops_batch( } let slot = entity_id as u32; - for op in &entry.ops { - match op { - Op::Delete => { - match engine.delete(slot) { - Ok(()) => applied += 1, - Err(e) => { - tracing::warn!("ops processor: delete slot {slot} failed: {e}"); - errors += 1; - } - } + // Delete absorbs everything — clear all bitmaps for this slot. + if entry.ops.iter().any(|op| matches!(op, Op::Delete)) { + match process_delete(sink, meta, slot, engine) { + Ok(()) => applied += 1, + Err(e) => { + tracing::warn!("ops processor: delete slot {slot} failed: {e}"); + errors += 1; } + } + continue; + } - Op::QueryOpSet { query, ops } => { - match apply_query_op_set(engine, query, ops) { + // Handle queryOpSets (steady-state only — needs engine for query resolution) + for op in &entry.ops { + if let Op::QueryOpSet { query, ops } = op { + if let Some(eng) = engine { + match apply_query_op_set(sink, meta, eng, query, ops) { Ok(count) => applied += count, Err(e) => { tracing::warn!("ops processor: queryOpSet '{query}' failed: {e}"); errors += 1; } } - } - - // Accumulate set/remove/add ops per entity, then apply as a patch - _ => { - // Collect all non-delete, non-queryOpSet ops for this entity - // and apply as a single patch + } else { + tracing::warn!("ops processor: queryOpSet skipped (no engine in dump mode)"); + skipped += 1; } } } - // Build a PatchPayload from the set/remove/add ops for this entity - let patch = build_patch_from_ops(&entry.ops); - if !patch.fields.is_empty() { - match engine.patch(slot, &patch) { - Ok(()) => applied += 1, - Err(e) => { - tracing::warn!("ops 
processor: patch slot {slot} failed: {e}"); - errors += 1; + // Process set/remove/add ops → direct bitmap mutations + let mut has_any_ops = false; + for op in &entry.ops { + match op { + Op::Set { field, value } => { + process_set_op(sink, meta, slot, field, value); + has_any_ops = true; + } + Op::Remove { field, value } => { + process_remove_op(sink, meta, slot, field, value); + has_any_ops = true; + } + Op::Add { field, value } => { + process_add_op(sink, meta, slot, field, value); + has_any_ops = true; + } + Op::Delete | Op::QueryOpSet { .. } => { + // Already handled above } } } + + // Set alive only if creates_slot is true (primary entity table). + // Join tables (tags, tools) set creates_slot=false — they only + // add multi-value bitmaps to existing slots. + if entry.creates_slot { + sink.alive_insert(slot); + } + + if has_any_ops { + applied += 1; + } + } + + // Flush buffered operations + if let Err(e) = sink.flush() { + tracing::error!("ops processor: sink flush failed: {e}"); + errors += 1; } (applied, skipped, errors) } -/// Build a PatchPayload from a list of ops for a single entity. -/// Pairs remove/set ops on the same field into PatchField { old, new }. -/// Add ops become multi-value inserts. -fn build_patch_from_ops(ops: &[Op]) -> PatchPayload { - let mut fields: HashMap = HashMap::new(); - - // First pass: collect removes (old values) and sets (new values) per field - let mut old_values: HashMap<&str, &JsonValue> = HashMap::new(); - let mut new_values: HashMap<&str, &JsonValue> = HashMap::new(); - let mut add_values: HashMap<&str, Vec<&JsonValue>> = HashMap::new(); - let mut remove_values: HashMap<&str, Vec<&JsonValue>> = HashMap::new(); - - for op in ops { - match op { - Op::Remove { field, value } => { - // Check if there's a corresponding Set for this field (scalar update) - let has_set = ops.iter().any(|o| matches!(o, Op::Set { field: f, .. 
} if f == field)); - if has_set { - old_values.insert(field, value); - } else { - // Multi-value remove - remove_values.entry(field).or_default().push(value); +/// Process a `set` op: set the new value's bitmap bit for this slot. +fn process_set_op( + sink: &mut S, + meta: &FieldMeta, + slot: u32, + field: &str, + value: &JsonValue, +) { + let qval = json_to_qvalue(value); + + // Check if this is a filter field + if let Some((arc_name, _field_type)) = meta.filter_fields.get(field) { + if let Some(key) = value_to_bitmap_key(&qval) { + sink.filter_insert(arc_name.clone(), key, slot); + } + } + + // Check if this is a sort field + if let Some((arc_name, num_bits)) = meta.sort_fields.get(field) { + if let Some(sort_val) = value_to_sort_u32(&qval) { + for bit in 0..*num_bits { + if (sort_val >> bit) & 1 == 1 { + sink.sort_set(arc_name.clone(), bit, slot); } } - Op::Set { field, value } => { - new_values.insert(field, value); - } - Op::Add { field, value } => { - add_values.entry(field).or_default().push(value); - } - Op::Delete | Op::QueryOpSet { .. } => { - // Handled separately - } } } +} + +/// Process a `remove` op: clear the old value's bitmap bit for this slot. 
+fn process_remove_op( + sink: &mut S, + meta: &FieldMeta, + slot: u32, + field: &str, + value: &JsonValue, +) { + let qval = json_to_qvalue(value); - // Build PatchFields for scalar set/remove pairs - for (field, new_val) in &new_values { - let old = old_values - .get(*field) - .map(|v| FieldValue::Single(json_to_qvalue(v))) - .unwrap_or(FieldValue::Single(QValue::Integer(0))); - let new = FieldValue::Single(json_to_qvalue(new_val)); - fields.insert(field.to_string(), PatchField { old, new }); + // Check if this is a filter field + if let Some((arc_name, _field_type)) = meta.filter_fields.get(field) { + if let Some(key) = value_to_bitmap_key(&qval) { + sink.filter_remove(arc_name.clone(), key, slot); + } } - // Build PatchFields for multi-value adds - for (field, vals) in &add_values { - let new_multi: Vec = vals.iter().map(|v| json_to_qvalue(v)).collect(); - let existing = fields.entry(field.to_string()).or_insert_with(|| PatchField { - old: FieldValue::Multi(vec![]), - new: FieldValue::Multi(vec![]), - }); - if let FieldValue::Multi(ref mut m) = existing.new { - m.extend(new_multi); - } else { - *existing = PatchField { - old: FieldValue::Multi(vec![]), - new: FieldValue::Multi(vals.iter().map(|v| json_to_qvalue(v)).collect()), - }; + // Check if this is a sort field + if let Some((arc_name, num_bits)) = meta.sort_fields.get(field) { + if let Some(sort_val) = value_to_sort_u32(&qval) { + for bit in 0..*num_bits { + if (sort_val >> bit) & 1 == 1 { + sink.sort_clear(arc_name.clone(), bit, slot); + } + } } } +} - // Build PatchFields for multi-value removes - for (field, vals) in &remove_values { - let removed: Vec = vals.iter().map(|v| json_to_qvalue(v)).collect(); - let existing = fields.entry(field.to_string()).or_insert_with(|| PatchField { - old: FieldValue::Multi(vec![]), - new: FieldValue::Multi(vec![]), - }); - if let FieldValue::Multi(ref mut m) = existing.old { - m.extend(removed); - } else { - *existing = PatchField { - old: 
FieldValue::Multi(vals.iter().map(|v| json_to_qvalue(v)).collect()), - new: FieldValue::Multi(vec![]), - }; +/// Process an `add` op: set a multi-value bitmap bit. +/// Same as `set` for bitmap purposes — adds the value's bit. +fn process_add_op( + sink: &mut S, + meta: &FieldMeta, + slot: u32, + field: &str, + value: &JsonValue, +) { + let qval = json_to_qvalue(value); + + if let Some((arc_name, _field_type)) = meta.filter_fields.get(field) { + if let Some(key) = value_to_bitmap_key(&qval) { + sink.filter_insert(arc_name.clone(), key, slot); } } + // Multi-value fields don't have sort layers, but handle it generically + if let Some((arc_name, num_bits)) = meta.sort_fields.get(field) { + if let Some(sort_val) = value_to_sort_u32(&qval) { + for bit in 0..*num_bits { + if (sort_val >> bit) & 1 == 1 { + sink.sort_set(arc_name.clone(), bit, slot); + } + } + } + } +} - PatchPayload { fields } +/// Process a delete: read stored doc from engine to know which bitmaps to clear +/// (clean delete principle), then clear all filter/sort bits + alive bit. +/// +/// Per design doc H1: deletes are the one op type that requires a docstore read. +fn process_delete( + sink: &mut S, + _meta: &FieldMeta, + slot: u32, + engine: Option<&ConcurrentEngine>, +) -> std::result::Result<(), String> { + // If we have an engine, read stored doc to clear filter/sort bitmaps cleanly. + // Without engine (dump mode), we can only clear alive — filter bitmaps may be stale. + if let Some(eng) = engine { + // Use the engine's delete method which handles clean delete internally. + eng.delete(slot).map_err(|e| format!("engine delete failed: {e}"))?; + return Ok(()); + } + + // Dump mode fallback: just clear alive bit (no stored doc to read) + sink.alive_remove(slot); + Ok(()) } -/// Resolve a queryOpSet: execute the query to get matching slots, then apply -/// the nested ops to each slot. 
-fn apply_query_op_set( +/// Resolve a queryOpSet: execute the query to get matching slots, +/// then apply the nested ops to each matching slot via the BitmapSink. +fn apply_query_op_set( + sink: &mut S, + meta: &FieldMeta, engine: &ConcurrentEngine, query_str: &str, ops: &[Op], -) -> Result { - // Parse the query string into filter clauses +) -> std::result::Result { let filters = parse_filter_from_query_str(query_str)?; let query = BitdexQuery { filters, sort: None, - limit: usize::MAX, // Get all matching slots + limit: usize::MAX, offset: None, cursor: None, - skip_cache: true, // Don't pollute cache with internal queries + skip_cache: true, }; - // Execute query to get matching slot IDs let result = engine .execute_query(&query) .map_err(|e| format!("queryOpSet query failed: {e}"))?; @@ -237,25 +345,36 @@ fn apply_query_op_set( return Ok(0); } - // Build the patch from nested ops - let patch = build_patch_from_ops(ops); - if patch.fields.is_empty() { - return Ok(0); - } - - // Apply patch to each matching slot + // Apply nested ops to each matching slot let mut applied = 0; for &slot_id in slot_ids { - if slot_id < 0 { + if slot_id < 0 || slot_id > u32::MAX as i64 { continue; } let slot = slot_id as u32; - match engine.patch(slot, &patch) { - Ok(()) => applied += 1, - Err(e) => { - tracing::warn!("queryOpSet: patch slot {slot} failed: {e}"); + + for op in ops { + match op { + Op::Set { field, value } => { + process_set_op(sink, meta, slot, field, value); + } + Op::Remove { field, value } => { + process_remove_op(sink, meta, slot, field, value); + } + Op::Add { field, value } => { + process_add_op(sink, meta, slot, field, value); + } + Op::Delete => { + // Delete within queryOpSet clears alive for each matched slot + sink.alive_remove(slot); + } + Op::QueryOpSet { .. 
} => { + // Nested queryOpSets not supported + tracing::warn!("nested queryOpSet ignored"); + } } } + applied += 1; } Ok(applied) @@ -263,7 +382,7 @@ fn apply_query_op_set( /// Parse a simple filter string like "modelVersionIds eq 456" or "postId eq 789" /// into filter clauses. -fn parse_filter_from_query_str(query_str: &str) -> Result, String> { +fn parse_filter_from_query_str(query_str: &str) -> std::result::Result, String> { let clauses: Vec<&str> = query_str.split(" AND ").collect(); let mut filters = Vec::new(); @@ -297,7 +416,7 @@ fn parse_filter_from_query_str(query_str: &str) -> Result, Str } /// Parse a single query value from a string. -fn parse_query_value(s: &str) -> Result { +fn parse_query_value(s: &str) -> std::result::Result { if let Ok(n) = s.parse::() { return Ok(QValue::Integer(n)); } @@ -315,7 +434,7 @@ fn parse_query_value(s: &str) -> Result { } /// Parse an array of query values like "[101, 102, 103]". -fn parse_query_values_array(s: &str) -> Result, String> { +fn parse_query_values_array(s: &str) -> std::result::Result, String> { let trimmed = s.trim(); if !trimmed.starts_with('[') || !trimmed.ends_with(']') { return Err(format!("Expected array for 'in' filter, got: '{s}'")); @@ -331,6 +450,241 @@ fn parse_query_values_array(s: &str) -> Result, String> { Ok(values) } +/// Process a batch of entity ops in dump mode using AccumSink. +/// +/// This is the bulk-loading path that bypasses the coalescer entirely. +/// Ops are accumulated directly into bitmaps (like the single-pass loader). +/// +/// Returns (applied, skipped, errors). +pub(crate) fn apply_ops_batch_dump( + accum: &mut crate::loader::BitmapAccum, + meta: &FieldMeta, + batch: &mut Vec, +) -> (usize, usize, usize) { + let mut sink = crate::ingester::AccumSink::new(accum); + apply_ops_batch(&mut sink, meta, batch, None) +} + +/// Process all WAL entries in dump mode: reads WAL, accumulates bitmaps, applies to engine. +/// +/// This is the high-level dump pipeline entry point. 
It: +/// 1. Creates a BitmapAccum from the engine config +/// 2. Reads all WAL entries, processes via AccumSink +/// 3. Applies accumulated bitmaps directly to engine staging +/// +/// Returns (total_applied, total_errors, elapsed_secs). +pub fn process_wal_dump( + engine: &ConcurrentEngine, + wal_path: &Path, + batch_size: usize, +) -> (u64, u64, f64) { + use crate::loader::BitmapAccum; + use crate::ops_wal::WalReader; + use std::time::Instant; + + let config = engine.config(); + let meta = FieldMeta::from_config(config); + + let filter_names: Vec = config.filter_fields.iter().map(|f| f.name.clone()).collect(); + let sort_configs: Vec<(String, u8)> = config.sort_fields.iter().map(|s| (s.name.clone(), s.bits)).collect(); + let mut accum = BitmapAccum::new(&filter_names, &sort_configs); + + let start = Instant::now(); + let mut reader = WalReader::new(wal_path, 0); + let mut total_applied = 0u64; + let mut total_errors = 0u64; + + loop { + let batch = match reader.read_batch(batch_size) { + Ok(b) => b, + Err(e) => { + tracing::error!("WAL read error in dump mode: {e}"); + total_errors += 1; + break; + } + }; + if batch.entries.is_empty() { + break; + } + let mut entries = batch.entries; + let (applied, _skipped, errors) = apply_ops_batch_dump(&mut accum, &meta, &mut entries); + total_applied += applied as u64; + total_errors += errors as u64; + } + + // Apply accumulated bitmaps to engine staging + engine.apply_accum(&accum); + + (total_applied, total_errors, start.elapsed().as_secs_f64()) +} + +/// Direct dump pipeline: CSV → ops → AccumSink, bypassing WAL entirely. +/// +/// For bulk loading, the WAL roundtrip (JSON serialize → disk → read → deserialize) +/// adds ~8x overhead vs direct processing. This function goes straight from +/// CSV rows to bitmap accumulation, matching the single-pass loader's throughput. +/// +/// Returns (total_applied, total_errors, elapsed_secs). 
+pub fn process_csv_dump_direct( + engine: &ConcurrentEngine, + csv_dir: &Path, + batch_size: usize, + limit: Option, +) -> (u64, u64, f64) { + use crate::ingester::AccumSink; + use crate::loader::BitmapAccum; + use crate::pg_sync::copy_queries::{parse_image_row, parse_tag_row, parse_tool_row}; + use std::fs::File; + use std::io::{BufRead, BufReader}; + use std::time::Instant; + + let config = engine.config(); + let meta = FieldMeta::from_config(config); + + let filter_names: Vec = config.filter_fields.iter().map(|f| f.name.clone()).collect(); + let sort_configs: Vec<(String, u8)> = config.sort_fields.iter().map(|s| (s.name.clone(), s.bits)).collect(); + let mut accum = BitmapAccum::new(&filter_names, &sort_configs); + + let start = Instant::now(); + let mut total_applied = 0u64; + let mut total_errors = 0u64; + + // Phase 1: Images (creates alive slots) + let images_csv = csv_dir.join("images.csv"); + if images_csv.exists() { + let file = File::open(&images_csv).expect("open images.csv"); + let reader = BufReader::with_capacity(8 * 1024 * 1024, file); + let mut rows = 0u64; + let img_start = Instant::now(); + + for line in reader.split(b'\n') { + let line = match line { + Ok(l) => l, + Err(_) => continue, + }; + if line.is_empty() { continue; } + if let Some(max) = limit { + if rows >= max { break; } + } + + let row = match parse_image_row(&line) { + Some(r) => r, + None => continue, + }; + rows += 1; + + let slot = row.id as u32; + // Process each op directly into AccumSink + let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); + { + let mut sink = AccumSink::new(&mut accum); + for op in &ops { + match op { + Op::Set { field, value } => { + process_set_op(&mut sink, &meta, slot, field, value); + } + Op::Remove { field, value } => { + process_remove_op(&mut sink, &meta, slot, field, value); + } + _ => {} + } + } + sink.alive_insert(slot); + } + total_applied += 1; + } + eprintln!(" images: {rows} rows, {:.1}s ({:.0}/s)", + 
img_start.elapsed().as_secs_f64(), + rows as f64 / img_start.elapsed().as_secs_f64().max(0.001)); + } + + // Phase 2: Tags (multi-value, no alive) + let tags_csv = csv_dir.join("tags.csv"); + if tags_csv.exists() { + let file = File::open(&tags_csv).expect("open tags.csv"); + let reader = BufReader::with_capacity(8 * 1024 * 1024, file); + let mut rows = 0u64; + let tag_start = Instant::now(); + + for line in reader.split(b'\n') { + let line = match line { + Ok(l) => l, + Err(_) => continue, + }; + if line.is_empty() { continue; } + if let Some(max) = limit { + if rows >= max { break; } + } + + let (tag_id, image_id) = match parse_tag_row(&line) { + Some(pair) => pair, + None => continue, + }; + rows += 1; + + let slot = image_id as u32; + let qval = QValue::Integer(tag_id); + if let Some((arc_name, _)) = meta.filter_fields.get("tagIds") { + if let Some(key) = value_to_bitmap_key(&qval) { + let mut sink = AccumSink::new(&mut accum); + sink.filter_insert(arc_name.clone(), key, slot); + } + } + total_applied += 1; + } + eprintln!(" tags: {rows} rows, {:.1}s ({:.0}/s)", + tag_start.elapsed().as_secs_f64(), + rows as f64 / tag_start.elapsed().as_secs_f64().max(0.001)); + } + + // Phase 3: Tools (multi-value, no alive) + let tools_csv = csv_dir.join("tools.csv"); + if tools_csv.exists() { + let file = File::open(&tools_csv).expect("open tools.csv"); + let reader = BufReader::with_capacity(8 * 1024 * 1024, file); + let mut rows = 0u64; + let tool_start = Instant::now(); + + for line in reader.split(b'\n') { + let line = match line { + Ok(l) => l, + Err(_) => continue, + }; + if line.is_empty() { continue; } + if let Some(max) = limit { + if rows >= max { break; } + } + + let (tool_id, image_id) = match parse_tool_row(&line) { + Some(pair) => pair, + None => continue, + }; + rows += 1; + + let slot = image_id as u32; + let qval = QValue::Integer(tool_id); + if let Some((arc_name, _)) = meta.filter_fields.get("toolIds") { + if let Some(key) = value_to_bitmap_key(&qval) { + 
let mut sink = AccumSink::new(&mut accum); + sink.filter_insert(arc_name.clone(), key, slot); + } + } + total_applied += 1; + } + eprintln!(" tools: {rows} rows, {:.1}s ({:.0}/s)", + tool_start.elapsed().as_secs_f64(), + rows as f64 / tool_start.elapsed().as_secs_f64().max(0.001)); + } + + // Apply accumulated bitmaps to engine staging + eprintln!(" Applying accum to staging..."); + let apply_start = Instant::now(); + engine.apply_accum(&accum); + eprintln!(" Apply: {:.3}s", apply_start.elapsed().as_secs_f64()); + + (total_applied, total_errors, start.elapsed().as_secs_f64()) +} + /// Persist cursor position to disk. pub fn save_cursor(path: &Path, cursor: u64) -> std::io::Result<()> { std::fs::write(path, cursor.to_string()) @@ -349,83 +703,389 @@ mod tests { use super::*; use serde_json::json; - #[test] - fn test_build_patch_from_scalar_update() { - let ops = vec![ - Op::Remove { field: "nsfwLevel".into(), value: json!(8) }, - Op::Set { field: "nsfwLevel".into(), value: json!(16) }, + use crate::config::{Config, FilterFieldConfig, SortFieldConfig}; + use crate::filter::FilterFieldType; + use crate::ingester::BitmapSink; + + /// A test sink that records all operations for verification. 
+ struct RecordingSink { + filter_inserts: Vec<(String, u64, u32)>, + filter_removes: Vec<(String, u64, u32)>, + sort_sets: Vec<(String, usize, u32)>, + sort_clears: Vec<(String, usize, u32)>, + alive_inserts: Vec, + alive_removes: Vec, + } + + impl RecordingSink { + fn new() -> Self { + Self { + filter_inserts: Vec::new(), + filter_removes: Vec::new(), + sort_sets: Vec::new(), + sort_clears: Vec::new(), + alive_inserts: Vec::new(), + alive_removes: Vec::new(), + } + } + } + + impl BitmapSink for RecordingSink { + fn filter_insert(&mut self, field: Arc, value: u64, slot: u32) { + self.filter_inserts.push((field.to_string(), value, slot)); + } + fn filter_remove(&mut self, field: Arc, value: u64, slot: u32) { + self.filter_removes.push((field.to_string(), value, slot)); + } + fn sort_set(&mut self, field: Arc, bit_layer: usize, slot: u32) { + self.sort_sets.push((field.to_string(), bit_layer, slot)); + } + fn sort_clear(&mut self, field: Arc, bit_layer: usize, slot: u32) { + self.sort_clears.push((field.to_string(), bit_layer, slot)); + } + fn alive_insert(&mut self, slot: u32) { + self.alive_inserts.push(slot); + } + fn alive_remove(&mut self, slot: u32) { + self.alive_removes.push(slot); + } + fn flush(&mut self) -> crate::error::Result<()> { + Ok(()) + } + } + + fn test_config() -> Config { + let mut config = Config::default(); + config.filter_fields = vec![ + FilterFieldConfig { + name: "nsfwLevel".into(), + field_type: FilterFieldType::SingleValue, + behaviors: None, + eviction: None, + eager_load: false, + }, + FilterFieldConfig { + name: "type".into(), + field_type: FilterFieldType::SingleValue, + behaviors: None, + eviction: None, + eager_load: false, + }, + FilterFieldConfig { + name: "tagIds".into(), + field_type: FilterFieldType::MultiValue, + behaviors: None, + eviction: None, + eager_load: false, + }, + FilterFieldConfig { + name: "hasMeta".into(), + field_type: FilterFieldType::Boolean, + behaviors: None, + eviction: None, + eager_load: false, + }, ]; 
- let patch = build_patch_from_ops(&ops); - assert_eq!(patch.fields.len(), 1); - let field = &patch.fields["nsfwLevel"]; - assert_eq!(field.old, FieldValue::Single(QValue::Integer(8))); - assert_eq!(field.new, FieldValue::Single(QValue::Integer(16))); + config.sort_fields = vec![SortFieldConfig { + name: "existedAt".into(), + source_type: "uint32".into(), + encoding: "linear".into(), + bits: 32, + eager_load: false, + }]; + config } #[test] - fn test_build_patch_from_insert_no_old() { - let ops = vec![ - Op::Set { field: "nsfwLevel".into(), value: json!(16) }, - Op::Set { field: "type".into(), value: json!("image") }, - ]; - let patch = build_patch_from_ops(&ops); - assert_eq!(patch.fields.len(), 2); - assert_eq!(patch.fields["nsfwLevel"].old, FieldValue::Single(QValue::Integer(0))); - assert_eq!(patch.fields["nsfwLevel"].new, FieldValue::Single(QValue::Integer(16))); + fn test_set_op_filter_insert() { + let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + let mut batch = vec![EntityOps { + entity_id: 42, + creates_slot: true, + ops: vec![Op::Set { + field: "nsfwLevel".into(), + value: json!(16), + }], + }]; + + let (applied, skipped, errors) = apply_ops_batch(&mut sink, &meta, &mut batch, None); + assert_eq!(applied, 1); + assert_eq!(skipped, 0); + assert_eq!(errors, 0); + + assert_eq!(sink.filter_inserts.len(), 1); + assert_eq!(sink.filter_inserts[0], ("nsfwLevel".to_string(), 16, 42)); + assert_eq!(sink.alive_inserts, vec![42]); } #[test] - fn test_build_patch_from_add() { - let ops = vec![ - Op::Add { field: "tagIds".into(), value: json!(42) }, - Op::Add { field: "tagIds".into(), value: json!(99) }, - ]; - let patch = build_patch_from_ops(&ops); - assert_eq!(patch.fields.len(), 1); - if let FieldValue::Multi(ref vals) = patch.fields["tagIds"].new { - assert_eq!(vals.len(), 2); - } else { - panic!("Expected Multi"); - } + fn test_remove_then_set_scalar_update() { + let config = test_config(); + let 
meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + let mut batch = vec![EntityOps { + entity_id: 42, + creates_slot: true, + ops: vec![ + Op::Remove { + field: "nsfwLevel".into(), + value: json!(8), + }, + Op::Set { + field: "nsfwLevel".into(), + value: json!(16), + }, + ], + }]; + + let (applied, _, errors) = apply_ops_batch(&mut sink, &meta, &mut batch, None); + assert_eq!(applied, 1); + assert_eq!(errors, 0); + + // Should have one remove (old value 8) and one insert (new value 16) + assert_eq!(sink.filter_removes.len(), 1); + assert_eq!(sink.filter_removes[0], ("nsfwLevel".to_string(), 8, 42)); + assert_eq!(sink.filter_inserts.len(), 1); + assert_eq!(sink.filter_inserts[0], ("nsfwLevel".to_string(), 16, 42)); } #[test] - fn test_build_patch_from_multi_remove() { - let ops = vec![ - Op::Remove { field: "tagIds".into(), value: json!(42) }, - ]; - let patch = build_patch_from_ops(&ops); - assert_eq!(patch.fields.len(), 1); - if let FieldValue::Multi(ref vals) = patch.fields["tagIds"].old { - assert_eq!(vals.len(), 1); - assert_eq!(vals[0], QValue::Integer(42)); - } else { - panic!("Expected Multi for old"); - } + fn test_add_multi_value() { + let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + let mut batch = vec![EntityOps { + entity_id: 100, + creates_slot: false, + ops: vec![ + Op::Add { + field: "tagIds".into(), + value: json!(42), + }, + Op::Add { + field: "tagIds".into(), + value: json!(99), + }, + ], + }]; + + let (applied, _, errors) = apply_ops_batch(&mut sink, &meta, &mut batch, None); + assert_eq!(applied, 1); + assert_eq!(errors, 0); + + assert_eq!(sink.filter_inserts.len(), 2); + // Order after dedup is nondeterministic (HashMap iteration) + let mut values: Vec = sink.filter_inserts.iter().map(|(_, v, _)| *v).collect(); + values.sort(); + assert_eq!(values, vec![42, 99]); + // Add-only ops should NOT set alive (only Set ops do) + 
assert!(sink.alive_inserts.is_empty()); } #[test] - fn test_build_patch_skips_delete_and_query() { - let ops = vec![ - Op::Delete, - Op::QueryOpSet { query: "x eq 1".into(), ops: vec![] }, - Op::Set { field: "a".into(), value: json!(1) }, - ]; - let patch = build_patch_from_ops(&ops); - assert_eq!(patch.fields.len(), 1); - assert!(patch.fields.contains_key("a")); + fn test_sort_field_bit_decomposition() { + let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + // existedAt = 5 = 0b101 → bits 0 and 2 set + let mut batch = vec![EntityOps { + entity_id: 10, + creates_slot: true, + ops: vec![Op::Set { + field: "existedAt".into(), + value: json!(5), + }], + }]; + + apply_ops_batch(&mut sink, &meta, &mut batch, None); + + // Should have sort_set for bits 0 and 2 + let sort_bits: Vec = sink.sort_sets.iter().map(|(_, bit, _)| *bit).collect(); + assert!(sort_bits.contains(&0)); + assert!(sort_bits.contains(&2)); + assert!(!sort_bits.contains(&1)); // bit 1 not set for value 5 + } + + #[test] + fn test_sort_field_remove_clears_bits() { + let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + // Remove old sort value 5 = 0b101, set new value 6 = 0b110 + let mut batch = vec![EntityOps { + entity_id: 10, + creates_slot: true, + ops: vec![ + Op::Remove { + field: "existedAt".into(), + value: json!(5), + }, + Op::Set { + field: "existedAt".into(), + value: json!(6), + }, + ], + }]; + + apply_ops_batch(&mut sink, &meta, &mut batch, None); + + // Clears: bits 0, 2 (from value 5) + let clear_bits: Vec = sink.sort_clears.iter().map(|(_, bit, _)| *bit).collect(); + assert!(clear_bits.contains(&0)); + assert!(clear_bits.contains(&2)); + + // Sets: bits 1, 2 (from value 6) + let set_bits: Vec = sink.sort_sets.iter().map(|(_, bit, _)| *bit).collect(); + assert!(set_bits.contains(&1)); + assert!(set_bits.contains(&2)); + } + + #[test] + fn test_boolean_field() { + 
let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + let mut batch = vec![EntityOps { + entity_id: 50, + creates_slot: true, + ops: vec![Op::Set { + field: "hasMeta".into(), + value: json!(true), + }], + }]; + + apply_ops_batch(&mut sink, &meta, &mut batch, None); + + // true → bitmap key 1 + assert_eq!(sink.filter_inserts.len(), 1); + assert_eq!(sink.filter_inserts[0], ("hasMeta".to_string(), 1, 50)); + } + + #[test] + fn test_unknown_field_ignored() { + let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + let mut batch = vec![EntityOps { + entity_id: 1, + creates_slot: true, + ops: vec![Op::Set { + field: "unknownField".into(), + value: json!(42), + }], + }]; + + let (applied, _, errors) = apply_ops_batch(&mut sink, &meta, &mut batch, None); + assert_eq!(applied, 1); // still counts as applied (alive set) + assert_eq!(errors, 0); + + // No filter or sort ops emitted for unknown field + assert!(sink.filter_inserts.is_empty()); + assert!(sink.sort_sets.is_empty()); + } + + #[test] + fn test_delete_without_engine() { + let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + let mut batch = vec![EntityOps { + entity_id: 42, + creates_slot: false, + ops: vec![Op::Delete], + }]; + + let (applied, _, errors) = apply_ops_batch(&mut sink, &meta, &mut batch, None); + assert_eq!(applied, 1); + assert_eq!(errors, 0); + + // In dump mode (no engine), delete only clears alive + assert_eq!(sink.alive_removes, vec![42]); + } + + #[test] + fn test_image_insert_all_fields() { + let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + let mut batch = vec![EntityOps { + entity_id: 1000, + creates_slot: true, + ops: vec![ + Op::Set { + field: "nsfwLevel".into(), + value: json!(1), + }, + Op::Set { + field: "type".into(), + value: json!(0), 
// "image" mapped to 0 + }, + Op::Set { + field: "hasMeta".into(), + value: json!(true), + }, + Op::Set { + field: "existedAt".into(), + value: json!(1711234567u64), + }, + ], + }]; + + let (applied, _, errors) = apply_ops_batch(&mut sink, &meta, &mut batch, None); + assert_eq!(applied, 1); + assert_eq!(errors, 0); + + // 3 filter inserts (nsfwLevel, type, hasMeta) + sort bits for existedAt + assert_eq!(sink.filter_inserts.len(), 3); + assert!(!sink.sort_sets.is_empty()); // existedAt bit layers + assert_eq!(sink.alive_inserts, vec![1000]); + } + + #[test] + fn test_negative_entity_id_skipped() { + let config = test_config(); + let meta = FieldMeta::from_config(&config); + let mut sink = RecordingSink::new(); + + let mut batch = vec![EntityOps { + entity_id: -1, + creates_slot: true, + ops: vec![Op::Set { + field: "nsfwLevel".into(), + value: json!(1), + }], + }]; + + let (_, skipped, _) = apply_ops_batch(&mut sink, &meta, &mut batch, None); + assert_eq!(skipped, 1); + assert!(sink.filter_inserts.is_empty()); } #[test] fn test_parse_filter_eq() { let filters = parse_filter_from_query_str("modelVersionIds eq 456").unwrap(); assert_eq!(filters.len(), 1); - assert!(matches!(&filters[0], FilterClause::Eq(f, QValue::Integer(456)) if f == "modelVersionIds")); + assert!(matches!( + &filters[0], + FilterClause::Eq(f, QValue::Integer(456)) if f == "modelVersionIds" + )); } #[test] fn test_parse_filter_in() { - let filters = parse_filter_from_query_str("modelVersionIds in [101, 102, 103]").unwrap(); + let filters = + parse_filter_from_query_str("modelVersionIds in [101, 102, 103]").unwrap(); assert_eq!(filters.len(), 1); if let FilterClause::In(f, vals) = &filters[0] { assert_eq!(f, "modelVersionIds"); @@ -437,9 +1097,18 @@ mod tests { #[test] fn test_parse_query_value_types() { - assert!(matches!(parse_query_value("42").unwrap(), QValue::Integer(42))); - assert!(matches!(parse_query_value("true").unwrap(), QValue::Bool(true))); - 
assert!(matches!(parse_query_value("\"hello\"").unwrap(), QValue::String(s) if s == "hello")); + assert!(matches!( + parse_query_value("42").unwrap(), + QValue::Integer(42) + )); + assert!(matches!( + parse_query_value("true").unwrap(), + QValue::Bool(true) + )); + assert!(matches!( + parse_query_value("\"hello\"").unwrap(), + QValue::String(s) if s == "hello" + )); } #[test] diff --git a/src/ops_wal.rs b/src/ops_wal.rs index 2b9c63a0..b64132fb 100644 --- a/src/ops_wal.rs +++ b/src/ops_wal.rs @@ -3,8 +3,9 @@ //! Format per record: //! [4 bytes: payload_len (u32 LE)] //! [8 bytes: entity_id (i64 LE)] +//! [1 byte: flags (bit 0 = creates_slot)] //! [payload_len bytes: ops JSONB] -//! [4 bytes: CRC32 of entity_id + ops] +//! [4 bytes: CRC32 of entity_id + flags + ops] //! //! The writer appends records and fsyncs. The reader tails the file, //! reading batches of records and tracking a byte-offset cursor. @@ -16,7 +17,8 @@ use std::path::{Path, PathBuf}; use crate::pg_sync::ops::{EntityOps, Op}; -const HEADER_SIZE: usize = 4 + 8; // payload_len + entity_id +const HEADER_SIZE: usize = 4 + 8 + 1; // payload_len + entity_id + flags +const FLAG_CREATES_SLOT: u8 = 0x01; const CRC_SIZE: usize = 4; /// WAL writer — appends ops records to a file with CRC32 integrity. 
@@ -48,16 +50,19 @@ impl WalWriter { let payload_len = ops_json.len() as u32; let entity_id_bytes = entry.entity_id.to_le_bytes(); + let flags: u8 = if entry.creates_slot { FLAG_CREATES_SLOT } else { 0 }; - // CRC covers entity_id + ops (not the length prefix) - let mut crc_input = Vec::with_capacity(8 + ops_json.len()); + // CRC covers entity_id + flags + ops (not the length prefix) + let mut crc_input = Vec::with_capacity(9 + ops_json.len()); crc_input.extend_from_slice(&entity_id_bytes); + crc_input.push(flags); crc_input.extend_from_slice(&ops_json); let crc = crc32fast::hash(&crc_input); - // Write: [len][entity_id][ops][crc] + // Write: [len][entity_id][flags][ops][crc] file.write_all(&payload_len.to_le_bytes())?; file.write_all(&entity_id_bytes)?; + file.write_all(&[flags])?; file.write_all(&ops_json)?; file.write_all(&crc.to_le_bytes())?; @@ -126,11 +131,13 @@ impl WalReader { let start_pos = pos; while entries.len() < max_records && pos + HEADER_SIZE <= data.len() { - // Read header + // Read header: [4-byte len][8-byte entity_id][1-byte flags] let payload_len = u32::from_le_bytes(data[pos..pos + 4].try_into().unwrap()) as usize; let entity_id = i64::from_le_bytes(data[pos + 4..pos + 12].try_into().unwrap()); + let flags = data[pos + 12]; + let creates_slot = (flags & FLAG_CREATES_SLOT) != 0; let record_end = pos + HEADER_SIZE + payload_len + CRC_SIZE; if record_end > data.len() { @@ -138,8 +145,8 @@ impl WalReader { break; } - // Verify CRC - let crc_input = &data[pos + 4..pos + HEADER_SIZE + payload_len]; // entity_id + ops + // Verify CRC (covers entity_id + flags + ops) + let crc_input = &data[pos + 4..pos + HEADER_SIZE + payload_len]; let stored_crc = u32::from_le_bytes( data[pos + HEADER_SIZE + payload_len..record_end] .try_into() @@ -158,7 +165,7 @@ impl WalReader { let ops_data = &data[pos + HEADER_SIZE..pos + HEADER_SIZE + payload_len]; match serde_json::from_slice::>(ops_data) { Ok(ops) => { - entries.push(EntityOps { entity_id, ops }); + 
entries.push(EntityOps { entity_id, ops, creates_slot }); } Err(_) => { // Invalid JSON — skip @@ -212,7 +219,7 @@ mod tests { use tempfile::TempDir; fn make_ops(entity_id: i64, ops: Vec) -> EntityOps { - EntityOps { entity_id, ops } + EntityOps { entity_id, ops, creates_slot: false } } #[test] diff --git a/src/pg_sync/csv_ops.rs b/src/pg_sync/csv_ops.rs index 778441a0..0d1ebbe9 100644 --- a/src/pg_sync/csv_ops.rs +++ b/src/pg_sync/csv_ops.rs @@ -56,6 +56,7 @@ pub fn images_csv_to_wal(csv_path: &Path, writer: &WalWriter, batch_size: usize) batch.push(EntityOps { entity_id: row.id, ops, + creates_slot: true, // Image table creates alive slots }); if batch.len() >= batch_size { @@ -77,6 +78,11 @@ pub fn images_csv_to_wal(csv_path: &Path, writer: &WalWriter, batch_size: usize) Ok(stats) } +/// Convert a single image CSV row to ops (public for direct dump path). +pub fn image_row_to_ops_pub(row: &CopyImageRow) -> Vec { + image_row_to_ops(row) +} + /// Convert a single image CSV row to ops. fn image_row_to_ops(row: &CopyImageRow) -> Vec { let mut ops = Vec::with_capacity(12); @@ -170,6 +176,7 @@ fn multi_value_csv_to_wal( field: field_name.to_string(), value: json!(value), }], + creates_slot: false, // Join tables don't create alive slots }); if batch.len() >= batch_size { @@ -277,7 +284,7 @@ fn images_csv_to_wal_limited(csv_path: &Path, writer: &WalWriter, batch_size: us None => { stats.rows_skipped += 1; continue; } }; stats.rows_read += 1; - batch.push(EntityOps { entity_id: row.id, ops: image_row_to_ops(&row) }); + batch.push(EntityOps { entity_id: row.id, ops: image_row_to_ops(&row), creates_slot: true }); if batch.len() >= batch_size { let bytes = writer.append_batch(&batch)?; stats.ops_written += batch.len() as u64; @@ -323,6 +330,7 @@ fn multi_value_csv_to_wal_limited( batch.push(EntityOps { entity_id: slot_id, ops: vec![Op::Add { field: field_name.to_string(), value: json!(value) }], + creates_slot: false, }); if batch.len() >= batch_size { let bytes = 
writer.append_batch(&batch)?; diff --git a/src/pg_sync/op_dedup.rs b/src/pg_sync/op_dedup.rs index 17da67a9..0e889183 100644 --- a/src/pg_sync/op_dedup.rs +++ b/src/pg_sync/op_dedup.rs @@ -20,13 +20,18 @@ use super::ops::{EntityOps, Op}; /// Add/remove cancellation eliminates net-zero multi-value ops. /// A delete op absorbs all prior ops for that entity. pub fn dedup_ops(batch: &mut Vec) { - // Phase 1: Merge all ops per entity_id + // Phase 1: Merge all ops per entity_id, preserving creates_slot (OR across sources) let mut entity_map: HashMap> = HashMap::new(); + let mut creates_slot_map: HashMap = HashMap::new(); for entry in batch.drain(..) { entity_map .entry(entry.entity_id) .or_default() .extend(entry.ops); + // If ANY source for this entity sets creates_slot, preserve it + if entry.creates_slot { + creates_slot_map.insert(entry.entity_id, true); + } } // Phase 2: Dedup ops within each entity @@ -38,7 +43,11 @@ pub fn dedup_ops(batch: &mut Vec) { *batch = entity_map .into_iter() .filter(|(_, ops)| !ops.is_empty()) - .map(|(entity_id, ops)| EntityOps { entity_id, ops }) + .map(|(entity_id, ops)| EntityOps { + entity_id, + ops, + creates_slot: creates_slot_map.get(&entity_id).copied().unwrap_or(false), + }) .collect(); } @@ -139,7 +148,7 @@ mod tests { use serde_json::json; fn entity(id: i64, ops: Vec) -> EntityOps { - EntityOps { entity_id: id, ops } + EntityOps { entity_id: id, ops, creates_slot: false } } #[test] diff --git a/src/pg_sync/ops.rs b/src/pg_sync/ops.rs index 94ccef04..eadbf5cf 100644 --- a/src/pg_sync/ops.rs +++ b/src/pg_sync/ops.rs @@ -86,6 +86,23 @@ pub struct EntityOps { pub entity_id: i64, /// Operations to apply pub ops: Vec, + /// If true, this entity should have its alive bit set (creates the slot if new). + /// Only the primary entity table (e.g., Image with sets_alive: true) sets this. + /// Join tables (tags, tools) leave this false — they only add multi-value bitmaps. 
+ #[serde(default)] + pub creates_slot: bool, +} + +impl EntityOps { + /// Convenience constructor — creates_slot defaults to false. + pub fn new(entity_id: i64, ops: Vec) -> Self { + Self { entity_id, ops, creates_slot: false } + } + + /// Constructor for primary entity ops that should create alive slots. + pub fn with_alive(entity_id: i64, ops: Vec) -> Self { + Self { entity_id, ops, creates_slot: true } + } } /// Sync source metadata, bundled with ops payloads. @@ -203,6 +220,7 @@ mod tests { let batch = OpsBatch { ops: vec![EntityOps { entity_id: 123, + creates_slot: false, ops: vec![Op::Set { field: "nsfwLevel".into(), value: json!(16), diff --git a/src/pg_sync/ops_poller.rs b/src/pg_sync/ops_poller.rs index 090224d4..994f4f89 100644 --- a/src/pg_sync/ops_poller.rs +++ b/src/pg_sync/ops_poller.rs @@ -118,6 +118,7 @@ async fn poll_and_process( .map(|row| EntityOps { entity_id: row.entity_id, ops: row.ops.0, + creates_slot: false, // Determined by trigger config at source; override in pg-sync }) .collect(); diff --git a/src/server.rs b/src/server.rs index 60913e32..dca83926 100644 --- a/src/server.rs +++ b/src/server.rs @@ -1062,9 +1062,16 @@ impl BitdexServer { }; if let Some(engine) = engine { + // Build FieldMeta and CoalescerSink for the ops processor + let meta = crate::ops_processor::FieldMeta::from_config(engine.config()); + let sender = engine.mutation_sender(); + let mut sink = crate::ingester::CoalescerSink::new(sender); + let mut entries = batch.entries; let (applied, skipped, errors) = - crate::ops_processor::apply_ops_batch(&engine, &mut entries); + crate::ops_processor::apply_ops_batch( + &mut sink, &meta, &mut entries, Some(&engine), + ); if applied > 0 || errors > 0 { eprintln!( diff --git a/tests/sync_v2_integration.rs b/tests/sync_v2_integration.rs index 1ffb3083..7901af0c 100644 --- a/tests/sync_v2_integration.rs +++ b/tests/sync_v2_integration.rs @@ -27,18 +27,21 @@ fn test_ops_wal_roundtrip_with_dedup() { let batch = vec![ EntityOps { 
entity_id: 1, + creates_slot: false, ops: vec![ Op::Set { field: "nsfwLevel".into(), value: json!(8) }, ], }, EntityOps { entity_id: 1, + creates_slot: false, ops: vec![ Op::Set { field: "nsfwLevel".into(), value: json!(16) }, // Overwrites first ], }, EntityOps { entity_id: 2, + creates_slot: false, ops: vec![ Op::Add { field: "tagIds".into(), value: json!(42) }, ], @@ -81,6 +84,7 @@ fn test_delete_absorbs_prior_ops_through_wal() { // First batch: set some fields writer.append_batch(&[EntityOps { entity_id: 1, + creates_slot: false, ops: vec![ Op::Set { field: "nsfwLevel".into(), value: json!(16) }, Op::Add { field: "tagIds".into(), value: json!(42) }, @@ -90,6 +94,7 @@ fn test_delete_absorbs_prior_ops_through_wal() { // Second batch: delete the entity writer.append_batch(&[EntityOps { entity_id: 1, + creates_slot: false, ops: vec![Op::Delete], }]).unwrap(); @@ -115,10 +120,12 @@ fn test_add_remove_cancellation_through_wal() { writer.append_batch(&[ EntityOps { entity_id: 1, + creates_slot: false, ops: vec![Op::Add { field: "tagIds".into(), value: json!(42) }], }, EntityOps { entity_id: 1, + creates_slot: false, ops: vec![Op::Remove { field: "tagIds".into(), value: json!(42) }], }, ]).unwrap(); @@ -140,6 +147,7 @@ fn test_query_op_set_through_wal() { let writer = WalWriter::new(&wal_path); writer.append_batch(&[EntityOps { entity_id: 456, + creates_slot: false, ops: vec![Op::QueryOpSet { query: "modelVersionIds eq 456".into(), ops: vec![ @@ -175,6 +183,7 @@ fn test_wal_cursor_resume_across_appends() { // Batch 1 writer.append_batch(&[EntityOps { entity_id: 1, + creates_slot: false, ops: vec![Op::Set { field: "a".into(), value: json!(1) }], }]).unwrap(); @@ -187,6 +196,7 @@ fn test_wal_cursor_resume_across_appends() { // Batch 2 (appended after first read) writer.append_batch(&[EntityOps { entity_id: 2, + creates_slot: false, ops: vec![Op::Set { field: "b".into(), value: json!(2) }], }]).unwrap(); @@ -342,6 +352,7 @@ fn test_ops_batch_json_format() { ops: vec![ 
EntityOps { entity_id: 123, + creates_slot: false, ops: vec![ Op::Remove { field: "nsfwLevel".into(), value: json!(8) }, Op::Set { field: "nsfwLevel".into(), value: json!(16) }, @@ -349,6 +360,7 @@ fn test_ops_batch_json_format() { }, EntityOps { entity_id: 456, + creates_slot: false, ops: vec![Op::QueryOpSet { query: "modelVersionIds eq 456".into(), ops: vec![ From 4c472acb8cf13dc3714b75041ecc0993a681c51e Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 19:50:28 -0600 Subject: [PATCH 12/19] =?UTF-8?q?docs:=20update=20pg-sync-v2=20design=20?= =?UTF-8?q?=E2=80=94=20pg-sync=20as=20data=20mover,=20BitDex=20owns=20proc?= =?UTF-8?q?essing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reflects the architecture split validated by benchmarks: - pg-sync: thin data mover (COPY→CSV, cursor management, ops polling) - BitDex: all processing (CSV parse, ops→AccumSink, bitmap accumulation) - Dumps skip WAL entirely — CSV→AccumSink direct path at 367K/s - Updated throughput table with measured numbers - Boot sequence uses pre_dump_cursor for gap safety Co-Authored-By: Claude Opus 4.6 (1M context) --- docs/design/pg-sync-v2-final.md | 494 ++++++++++++++++++++++++++++++++ 1 file changed, 494 insertions(+) create mode 100644 docs/design/pg-sync-v2-final.md diff --git a/docs/design/pg-sync-v2-final.md b/docs/design/pg-sync-v2-final.md new file mode 100644 index 00000000..58ebb286 --- /dev/null +++ b/docs/design/pg-sync-v2-final.md @@ -0,0 +1,494 @@ +# BitDex Sync V2 — Final Design + +> Distilled from the [working design doc](pg-sync-v2.md) (Justin + Adam, 2026-03-25). + +## Problem + +The V1 outbox poller is 80M rows behind and can never catch up. Each cycle polls 5,000 rows from `BitdexOutbox`, then runs 5 enrichment queries per batch (images + tags + tools + techniques + resources) to assemble full JSON documents before PATCHing to BitDex. 
At ~2,500 changes/s with enrichment as the bottleneck, steady-state write volume exceeds processing capacity. + +## Solution + +Replace the "notify then re-fetch" pattern with **self-contained ops**. PG triggers encode the exact delta (old value, new value, field name) directly into a JSONB ops array. No enrichment queries, no full-document assembly. BitDex applies ops as direct bitmap mutations. + +--- + +## Architecture + +``` +PG trigger fires + → INSERT one row into BitdexOps (entity_id, JSONB ops array) + → pg-sync polls BitdexOps, deduplicates, POSTs batch to BitDex + → BitDex /ops endpoint appends to local WAL file, returns 200 + → WAL reader thread processes ops → bitmap mutations via coalescer +``` + +### BitdexOps Table + +```sql +CREATE TABLE IF NOT EXISTS "BitdexOps" ( + id BIGSERIAL PRIMARY KEY, + entity_id BIGINT NOT NULL, + ops JSONB NOT NULL, + created_at TIMESTAMPTZ DEFAULT now() +); +CREATE INDEX idx_bitdex_ops_id ON "BitdexOps" (id); +``` + +Each row contains a JSONB array of ops. Triggers include both old and new values so BitDex can update bitmaps without reading the docstore. + +### Op Types + +| Op | Example | Bitmap Action | +|----|---------|---------------| +| `set` | `{"op":"set","field":"nsfwLevel","value":16}` | Set bit in value bitmap | +| `remove` | `{"op":"remove","field":"nsfwLevel","value":8}` | Clear bit from value bitmap | +| `add` | `{"op":"add","field":"tagIds","value":42}` | Set bit in multi-value bitmap | +| `delete` | `{"op":"delete"}` | Clear all filter/sort bits + alive bit | +| `queryOpSet` | See [Fan-Out](#fan-out-via-queryopset) | Query-resolved bulk bitmap ops | + +**No `full` op type.** INSERTs emit individual `set` ops for each field (all additive, no `remove` since there's no prior state). One format for everything. 
+ +### Op Examples + +**Image UPDATE** (nsfwLevel 8→16, type stays same): +```json +[ + {"op": "remove", "field": "nsfwLevel", "value": 8}, + {"op": "set", "field": "nsfwLevel", "value": 16} +] +``` + +**Image INSERT** (new image): +```json +[ + {"op": "set", "field": "nsfwLevel", "value": 1}, + {"op": "set", "field": "type", "value": "image"}, + {"op": "set", "field": "userId", "value": 12345}, + {"op": "set", "field": "sortAt", "value": 1711234567} +] +``` + +**Tag added:** +```json +[{"op": "add", "field": "tagIds", "value": 42}] +``` + +**Image deleted:** +```json +[{"op": "delete"}] +``` + +--- + +## Fan-Out via queryOpSet + +Fan-out tables (ModelVersion, Post, Model) don't produce per-image ops in the trigger. Instead, they emit a single `queryOpSet` op that tells BitDex to resolve affected slots from its own bitmaps. + +**ModelVersion baseModel change:** +```json +[{"op": "queryOpSet", "query": "modelVersionIds eq 456", "ops": [ + {"op": "remove", "field": "baseModel", "value": "SD 1.5"}, + {"op": "set", "field": "baseModel", "value": "SDXL"} +]}] +``` + +BitDex looks up the `modelVersionIds=456` bitmap, gets all affected slots, applies two bulk bitmap operations (`andnot` old + `or` new). A 15M-image fan-out completes in microseconds — no per-image ops, no PG queries. + +**Model POI change** (needs MV ids from PG first): +```json +[{"op": "queryOpSet", "query": "modelVersionIds in [101, 102, 103]", "ops": [ + {"op": "set", "field": "poi", "value": true} +]}] +``` + +The trigger uses `jsonb_agg` to collect MV ids: `SELECT jsonb_agg(id) FROM ModelVersion WHERE modelId = NEW.id`. BitDex ORs the MV bitmaps together, then applies the ops. 
+ +**Post publishedAt change:** +```json +[{"op": "queryOpSet", "query": "postId eq 789", "ops": [ + {"op": "remove", "field": "publishedAt", "value": 1711000000}, + {"op": "set", "field": "publishedAt", "value": 1711234567} +]}] +``` + +### Fan-Out Scale (measured 2026-03-25) + +| Metric | Value | +|--------|-------| +| ImageResourceNew rows | ~375M | +| Top ModelVersion (290640) | ~15.1M images | +| Top 5 ModelVersions | 18.6% of all rows | +| p50 images/MV | 1 | +| p90 images/MV | 5 | +| p99 images/MV | 53 | + +The distribution is extremely heavy-tailed. 99% of fan-outs are trivial. The queryOpSet approach handles even the 15M-image worst case as two bitmap operations. + +--- + +## Trigger Configuration (YAML) + +pg-sync generates trigger SQL from a declarative YAML config. Two table types: + +### Direct Tables (slot = PG column) + +```yaml +sync_sources: + - table: Image + slot_field: id + track_fields: [nsfwLevel, type, userId, postId, minor, poi, hideMeta, meta, blockedFor] + on_delete: delete_slot + + - table: TagsOnImageNew + slot_field: imageId + field: tagIds + value_field: tagId + + - table: ImageTool + slot_field: imageId + field: toolIds + value_field: toolId + + - table: ImageTechnique + slot_field: imageId + field: techniqueIds + value_field: techniqueId + + - table: CollectionItem + slot_field: imageId + field: collectionIds + value_field: collectionId + filter: "status = 'ACCEPTED' AND \"imageId\" IS NOT NULL" +``` + +- `slot_field`: PG column that maps to the BitDex slot ID +- `track_fields`: Scalar columns — trigger emits `remove`/`set` pairs using `IS DISTINCT FROM` +- `field` + `value_field`: Multi-value join tables — INSERT = `add`, DELETE = `remove` +- `on_delete`: `delete_slot` emits a `{"op":"delete"}` op + +### Fan-Out Tables (slots resolved by BitDex query) + +```yaml + - table: ModelVersion + query: "modelVersionIds eq {id}" + track_fields: [baseModel] + + - table: Post + query: "postId eq {id}" + track_fields: [publishedAt, 
availability] + + - table: Model + query: "modelVersionIds in {modelVersionIds}" + query_source: "SELECT jsonb_agg(id) as \"modelVersionIds\" FROM \"ModelVersion\" WHERE \"modelId\" = {id}" + track_fields: [poi] +``` + +- `query`: BitDex query template. `{column}` placeholders are substituted from `NEW` columns. +- `query_source`: Optional PG subquery for values not on the triggering table. Returns named columns that feed into `query` placeholders. +- No `slot_field` — slots come from the BitDex query result. + +### Trigger Reconciliation + +Trigger naming: `bitdex_{table}_{hash8}` where `hash8` is the first 8 chars of SHA256 of the function body. On startup, pg-sync: + +1. Generates trigger SQL from config +2. Queries `pg_trigger WHERE tgname LIKE 'bitdex_%'` +3. Hash matches → skip. Hash differs → `CREATE OR REPLACE`. Table not in config → `DROP TRIGGER`. + +Config is the source of truth. pg-sync reconciles PG state to match. + +--- + +## WAL-Backed Ops Endpoint + +### Ingestion + +`POST /api/indexes/{name}/ops` receives ops from pg-sync, appends to a local WAL file, returns 200. Zero processing on the HTTP path — just fsync and acknowledge. + +```json +{ + "ops": [ + {"entity_id": 123, "ops": [{"op": "add", "field": "tagIds", "value": 42}]}, + {"entity_id": 456, "ops": [{"op": "set", "field": "nsfwLevel", "value": 16}]} + ], + "meta": { + "source": "pg-sync-default", + "cursor": 420000000, + "max_id": 500000000, + "lag_rows": 80000000 + } +} +``` + +No cursor management — pg-sync owns its cursor in PG (`bitdex_cursors` table). The `meta` field carries lag metrics for Prometheus exposition. + +### WAL Processing + +A dedicated reader thread tails the WAL file, reads batches, deduplicates, and submits mutations to the coalescer. + +- Append-only files, one per generation: `ops_000001.wal`, `ops_000002.wal`, ... 
+- Reader maintains a persisted byte-offset cursor +- Size-based rotation (e.g., 100MB), old generations deleted after processing +- Format: `[4-byte len][entity_id: i64][ops: JSONB bytes][CRC32]` — same pattern as ShardStore/BucketDiffLog +- Crash recovery: resume from persisted cursor in current generation + +### Op Deduplication + +Two-layer dedup using a shared `dedup_ops()` helper: + +1. **pg-sync side**: LIFO dedup per `(entity_id, field)` + add/remove cancellation. Reduces batch before sending. +2. **WAL reader side**: Same dedup on WAL batch. Catches cross-poll duplicates. + +`full` ops are decomposed into individual `set` ops by pg-sync before dedup — `full` is not a special case in the processing pipeline. + +BitDex skips ops for fields not in its index config. Stale triggers that emit ops for removed fields are harmless. + +--- + +## Observability + +### Prometheus Metrics + +Unified `bitdex_sync_*` namespace with `source` label: + +``` +bitdex_sync_cursor_position{source="pg-sync-default"} 420000000 +bitdex_sync_max_id{source="pg-sync-default"} 500000000 +bitdex_sync_lag_rows{source="pg-sync-default"} 80000000 +bitdex_sync_cycle_duration_seconds{source="pg-sync-default"} 0.05 +bitdex_sync_cycle_rows{source="pg-sync-default"} 4850 +bitdex_sync_wal_pending_bytes 1048576 +bitdex_sync_wal_generation 3 +``` + +### Lag Endpoint + +`GET /api/internal/sync-lag` — returns latest `meta` from each sync source. + +Metrics are bundled with the ops payload — no separate reporting call. + +--- + +## Deployment + +### Binary + +Rename `bitdex-pg-sync` → `bitdex-sync` with subcommands: +- `bitdex-sync pg --config sync.toml` — PG ops poller +- `bitdex-sync ch --config sync.toml` — ClickHouse metrics poller +- `bitdex-sync all --config sync.toml` — both (default for K8s sidecar) + +Single sidecar container, concurrent tokio tasks. + +### ClickHouse + +Stays separate and simple. 
Polls CH for aggregate counts (reactionCount, commentCount, collectedCount), pushes to BitDex ops endpoint. Not config-driven — the CH query is domain-specific. + +### Migration Plan + +1. Build V2: BitdexOps table, YAML-driven triggers, ops poller, WAL endpoint, queryOpSet, dump pipeline +2. Boot pod — pg-sync auto-detects empty BitDex, runs table dumps, transitions to steady-state +3. Done. No manual intervention. V1 code stays in repo, unused. + +No incremental migration, no shadow mode, no V1 fixes. No manual pod teardown/reload dance. + +--- + +## Unified Load Pipeline + +### Responsibility Split + +**pg-sync (sidecar)** is a thin data mover: +- `COPY FROM` PG → write CSV to shared volume +- Signal BitDex that a CSV is ready (`POST /dumps/{name}/loaded`) +- Poll BitdexOps outbox → `POST /ops` batches (steady-state) +- Manage cursor in PG (`bitdex_cursors` table) + +**BitDex (server)** owns all processing: +- On dump signal: read CSV → parse → ops → AccumSink → bitmap accumulation (direct path, ~367K images/s) +- On `/ops` POST: append to WAL → WAL reader → CoalescerSink → coalescer channel (steady-state) +- YAML sync config awareness: field mapping, value conversion, bit decomposition +- All indexing logic: BitmapSink trait, FieldMeta, value_to_bitmap_key, value_to_sort_u32 + +pg-sync never generates ops, never touches bitmaps, never writes WAL. The sync config (`sync.yaml`) is read by both: pg-sync uses it for `COPY` column selection and trigger generation, BitDex uses it for CSV→ops field mapping. + +### Boot Sequence + +``` +K8s starts pod (BitDex server + bitdex-sync sidecar) + → bitdex-sync waits for BitDex health check + → Capture max(BitdexOps.id) as pre_dump_cursor + → GET /api/indexes/{name}/dumps — check dump history + → For each sync_source not yet dumped: + 1. PUT /api/indexes/{name}/dumps — register dump + 2. COPY table from PG → write CSV to shared volume + 3. POST /api/indexes/{name}/dumps/{name}/loaded — "CSV is ready" + 4. 
BitDex reads CSV directly, parses → AccumSink → bitmaps + 5. BitDex saves bitmaps to ShardStore, unloads from memory + → Seed cursor at pre_dump_cursor (not current max — catches dump-window ops) + → Transition to steady-state ops polling + → K8s readiness probe flips to 200, traffic starts routing +``` + +No manual intervention. No WAL for dumps. No serialization overhead. Just boot and it works. + +### Dump Endpoints + +``` +GET /api/indexes/{name}/dumps — list dump history +PUT /api/indexes/{name}/dumps — register new dump → task ID +POST /api/indexes/{name}/dumps/{name}/loaded — signal dump file complete +DELETE /api/indexes/{name}/dumps/{name} — remove from history +DELETE /api/indexes/{name}/dumps — clear all history +GET /api/tasks/{task_id} — poll dump processing status (existing) +``` + +### Dump Identity and Change Detection + +Dump names include a config hash: `Image-a1b2c3d4`. pg-sync constructs the name from the table name + hash of that table's YAML config. If the config changes (add a field to `track_fields`), the hash changes, the name doesn't match existing dumps, and pg-sync auto-re-dumps. BitDex treats dump names as opaque strings. + +### Table Ordering + +No JOINs on large tables. Each table dumps flat. + +1. **Image** — flat COPY. Produces `existedAt` via `GREATEST(scannedAt, createdAt)` expression in `track_fields`. +2. **TagsOnImageNew, ImageTool, ImageTechnique, CollectionItem, ImageResourceNew** — flat COPYs, can run in parallel. +3. **Post** — flat COPY (id, publishedAt, availability). Depends on Image being loaded first. Uses `queryOpSet "postId eq {id}"` to set fields on image slots. +4. **ModelVersion** — flat COPY (small table, <1M rows, JOINs fine). Sets baseModel via `queryOpSet`. +5. **ClickHouse metrics** — separate dump via ch-sync. + +### Dump Processing Mode + +Dump processing bypasses the WAL, coalescer, and flush thread entirely. 
BitDex reads the CSV directly and processes via `AccumSink` → `BitmapAccum` → `apply_accum()`: + +1. CSV rows parsed in-process (`parse_image_row`, `parse_tag_row`, etc.) +2. Each row → ops → `BitmapSink::filter_insert()` / `sort_set()` / `alive_insert()` +3. `AccumSink` inserts directly into `BitmapAccum` (HashMap-backed bitmap accumulator) +4. After all rows: `engine.apply_accum(&accum)` merges bitmaps into staging via OR +5. Save bitmaps to ShardStore, unload from memory +6. Lazy load on first query (existing `ensure_fields_loaded()` path) + +This matches the single-pass loader's throughput: **367K images/s at 1M scale** (vs 345K/s single-pass baseline). No serialization, no WAL I/O, no channel overhead. + +The `creates_slot` flag on `EntityOps` controls alive bit management: +- Image table CSVs: `creates_slot: true` → sets alive bit +- Join table CSVs (tags, tools): `creates_slot: false` → only adds filter bitmaps + +Peak memory: one table's bitmaps at a time. K8s readiness probe returns 503 during dumps (health probe stays 200). Traffic routes only after all dumps complete. + +### Prerequisite: Computed Sort Fields + +`sortAt = GREATEST(existedAt, publishedAt)` requires BitDex to compute sort values from multiple source fields. `existedAt` comes from Image dumps, `publishedAt` comes from Post dumps — they arrive at different times. BitDex must recompute `sortAt` whenever either source changes. + +This is a separate feature tracked in [computed-sort-fields.md](computed-sort-fields.md). 
+ +--- + +## Throughput + +| | V1 | V2 (measured) | +|---|---|---| +| Enrichment queries | 5 per batch | 0 | +| Dump throughput (images) | ~70K/s (single-pass) | **367K/s** (direct AccumSink) | +| Dump throughput (tags) | — | **2.6M/s** (direct AccumSink) | +| Steady-state throughput | ~2,500 changes/s | 2,700 ops/s (CoalescerSink) | +| Fan-out cost (15M images) | 15M enrichment queries | 2 bitmap ops | +| WAL-backed dump (if needed) | — | 41K ops/s | + +Dump mode at 367K images/s processes 107M images in ~4.9 minutes (image table only). +Steady-state 2,700 ops/s provides 1.1x headroom over peak traffic (~2,500 changes/s). +The WAL path exists for steady-state durability; dumps skip it entirely for throughput. + +--- + +## Design Review Findings (2026-03-25) + +Architectural review identified 17 issues. Resolutions agreed with Justin: + +### Cursor Gap (Critical — C1) + +PG triggers fire into `BitdexOps` while dumps run. If we seed the cursor at `max(BitdexOps.id)` AFTER dumps, ops generated during the dump window are skipped. + +**Resolution:** Capture `max(BitdexOps.id)` BEFORE starting dumps. Seed cursor at that pre-dump value. pg-sync re-processes some overlapping ops (idempotent — set/remove are self-correcting). Updated boot sequence: + +``` +→ Capture max(BitdexOps.id) as pre_dump_cursor +→ Run dumps... +→ Seed cursor at pre_dump_cursor (not current max) +→ Start steady-state polling — catches all ops from dump window +``` + +### queryOpSet Race (Critical — C2) + +Between bitmap lookup and op application, concurrent mutations could change the resolved slot set. A new image gaining MV 456 during a baseModel cascade could be missed. + +**Resolution:** Snapshot-level isolation is acceptable. The next steady-state trigger on the missed image corrects the state. The consistency window is bounded by the poll interval (~2s). Document this as eventual consistency, not serializability. 
+ +### Delete Ops + Docstore Read (High — H1) + +Delete ops carry no old values, so BitDex must read the docstore to know which bitmaps to clear (clean delete principle). + +**Resolution:** Deletes are infrequent — docstore read is acceptable for this case. This is the one op type that requires a docstore read. Doc cache makes it <1μs in the common case. The trigger can't easily emit all field values from `OLD` because multi-value fields (tags, tools) come from join tables, not the Image row. + +### WAL Partial Records (High — H3) + +Crash mid-write leaves truncated WAL record. + +**Resolution:** `POST /ops` returns 200 only after all records are written and fsynced. If crash happens before response, pg-sync doesn't advance its cursor and resends the batch. LIFO dedup on the WAL reader handles re-delivered ops. For dump WAL files, same approach: pg-sync only calls `/loaded` after the full file is written. + +### Alive Bit Management (Medium — M1) + +No op type explicitly sets the alive bit for new slots. + +**Resolution:** The Image table config gets a new property: `sets_alive: true`. Only the table marked `sets_alive` triggers alive bit setting on first `set` op for a non-alive slot. This prevents tags/tools from accidentally creating alive entries for non-existent images. Other tables' ops on non-alive slots are silently dropped. + +```yaml +- table: Image + slot_field: id + sets_alive: true # only this table can create new alive slots + track_fields: [...] +``` + +### Dump Ordering Dependency (Medium — M4) + +ImageResourceNew must complete before ModelVersion dump starts (MV queryOpSet needs `modelVersionIds` bitmaps). + +**Resolution:** Explicit dump phases: +1. Image +2. ImageResourceNew + tags + tools + techniques + collections (parallel) +3. Post + ModelVersion (parallel, both depend on step 2) +4. 
ClickHouse metrics + +### Docstore Writes for V2 Ops (Medium — M5) + +Each op must also write to the docstore (not just bitmaps) for document serving and computed field lookups. + +**Resolution:** Each op appends to the docstore via V2 tuple format: `DocSink.append(slot_id, field_idx, value)`. For `queryOpSet`, each affected slot gets a docstore write per field. Slot ID is always available from `entity_id` (direct ops) or from the query result set (queryOpSet). + +### `meta` Field Write Amplification (Low — L5) + +**Non-issue.** `hasMeta` and `onSite` are already precomputed as bit flags on the Image table (`flags` column — bit 13 = hasPrompt, bit 14 = madeOnSite, bit 2 = hideMeta). The COPY loader reads these directly via `CopyImageRow.has_meta()` and `.on_site()`. No raw `meta` JSONB tracking needed — `hasMeta` and `onSite` are plain boolean fields in `track_fields`, derived from flag bit changes. + +### queryOpSet entity_id Dedup (Low — L2) + +Multiple queryOpSets with `entity_id=0` would incorrectly deduplicate. + +**Resolution:** Use the source entity's ID (ModelVersion ID, Post ID) as `entity_id`. Dedup logic treats `queryOpSet` ops separately — dedup by `(entity_id, query)` not `(entity_id, field)`. 
+ +--- + +## Files That Change + +| File | Change | +|------|--------| +| `src/pg_sync/queries.rs` | BitdexOps table SQL, `poll_ops_from_cursor()` | +| `src/pg_sync/ops_poller.rs` | **New** — V2 poller with dedup | +| `src/pg_sync/op_dedup.rs` | **New** — shared dedup helper | +| `src/pg_sync/trigger_gen.rs` | **New** — YAML config → trigger SQL generator | +| `src/pg_sync/dump.rs` | **New** — table dump pipeline (COPY → WAL writer) | +| `src/pg_sync/config.rs` | V2 config fields, YAML sync_sources, dump config | +| `src/bin/pg_sync.rs` | Rename to bitdex-sync, add subcommands | +| `src/server.rs` | `POST /ops` (WAL-backed), `GET /sync-lag`, dump endpoints | +| `src/ops_wal.rs` | **New** — WAL writer + reader thread (ops + dumps) | +| `src/pg_sync/bitdex_client.rs` | `post_ops()`, dump registration | +| `src/metrics.rs` | `bitdex_sync_*` Prometheus gauges | From 5df69b64279374d9f8d542a17ec242f22580cf58 Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 19:52:24 -0600 Subject: [PATCH 13/19] fix: add computed field to SortFieldConfig in tests (rebase fix) Co-Authored-By: Claude Opus 4.6 (1M context) --- src/ops_processor.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ops_processor.rs b/src/ops_processor.rs index e07202ee..ae7d391f 100644 --- a/src/ops_processor.rs +++ b/src/ops_processor.rs @@ -792,6 +792,7 @@ mod tests { encoding: "linear".into(), bits: 32, eager_load: false, + computed: None, }]; config } From bcc3cfe5e2c1b26e9b4b9af0fb8deb3e6d11ae2f Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 20:03:22 -0600 Subject: [PATCH 14/19] perf: rayon parallel CSV parsing in direct dump path Uses rayon fold+reduce for parallel CSV parsing, matching the single-pass loader pattern. Each rayon worker builds a thread-local BitmapAccum, merged at the end via bitmap OR. 1M benchmark: 2.7M ops/s total (images 2.0M/s, tags 5.4M/s, tools 8.8M/s). Previous single-threaded: 931K ops/s. Speedup: 2.9x. 
vs single-pass baseline (345K/s): 5.8x faster on images. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/ops_processor.rs | 263 +++++++++++++++++++++++++------------------ 1 file changed, 153 insertions(+), 110 deletions(-) diff --git a/src/ops_processor.rs b/src/ops_processor.rs index ae7d391f..14490723 100644 --- a/src/ops_processor.rs +++ b/src/ops_processor.rs @@ -518,24 +518,21 @@ pub fn process_wal_dump( (total_applied, total_errors, start.elapsed().as_secs_f64()) } -/// Direct dump pipeline: CSV → ops → AccumSink, bypassing WAL entirely. +/// Direct dump pipeline: CSV → rayon parallel parse → BitmapAccum → apply. /// -/// For bulk loading, the WAL roundtrip (JSON serialize → disk → read → deserialize) -/// adds ~8x overhead vs direct processing. This function goes straight from -/// CSV rows to bitmap accumulation, matching the single-pass loader's throughput. +/// Bypasses WAL entirely. Uses rayon fold+reduce for parallel CSV parsing, +/// matching the single-pass loader's throughput pattern. /// /// Returns (total_applied, total_errors, elapsed_secs). 
pub fn process_csv_dump_direct( engine: &ConcurrentEngine, csv_dir: &Path, - batch_size: usize, + _batch_size: usize, limit: Option, ) -> (u64, u64, f64) { - use crate::ingester::AccumSink; use crate::loader::BitmapAccum; use crate::pg_sync::copy_queries::{parse_image_row, parse_tag_row, parse_tool_row}; - use std::fs::File; - use std::io::{BufRead, BufReader}; + use rayon::prelude::*; use std::time::Instant; let config = engine.config(); @@ -543,144 +540,190 @@ pub fn process_csv_dump_direct( let filter_names: Vec = config.filter_fields.iter().map(|f| f.name.clone()).collect(); let sort_configs: Vec<(String, u8)> = config.sort_fields.iter().map(|s| (s.name.clone(), s.bits)).collect(); - let mut accum = BitmapAccum::new(&filter_names, &sort_configs); let start = Instant::now(); let mut total_applied = 0u64; let mut total_errors = 0u64; - // Phase 1: Images (creates alive slots) - let images_csv = csv_dir.join("images.csv"); - if images_csv.exists() { - let file = File::open(&images_csv).expect("open images.csv"); + // Helper: read file lines (respecting limit), return Vec> + let read_lines = |path: &Path, limit: Option| -> Vec> { + use std::io::{BufRead, BufReader}; + let file = std::fs::File::open(path).expect("open CSV"); let reader = BufReader::with_capacity(8 * 1024 * 1024, file); - let mut rows = 0u64; - let img_start = Instant::now(); - + let mut lines = Vec::new(); for line in reader.split(b'\n') { - let line = match line { - Ok(l) => l, - Err(_) => continue, - }; - if line.is_empty() { continue; } if let Some(max) = limit { - if rows >= max { break; } + if lines.len() as u64 >= max { break; } } + match line { + Ok(l) if !l.is_empty() => lines.push(l), + _ => {} + } + } + lines + }; - let row = match parse_image_row(&line) { - Some(r) => r, - None => continue, - }; - rows += 1; - - let slot = row.id as u32; - // Process each op directly into AccumSink - let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); - { - let mut sink = AccumSink::new(&mut 
accum); - for op in &ops { - match op { - Op::Set { field, value } => { - process_set_op(&mut sink, &meta, slot, field, value); - } - Op::Remove { field, value } => { - process_remove_op(&mut sink, &meta, slot, field, value); + // Phase 1: Images (rayon parallel parse → BitmapAccum) + let images_csv = csv_dir.join("images.csv"); + if images_csv.exists() { + let img_start = Instant::now(); + let lines = read_lines(&images_csv, limit); + let read_elapsed = img_start.elapsed(); + + let f_names = &filter_names; + let s_configs = &sort_configs; + let meta_ref = &meta; + + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let row = match parse_image_row(&line) { + Some(r) => r, + None => { acc.errors += 1; return acc; } + }; + let slot = row.id as u32; + acc.alive.insert(slot); + + // Process ops directly into accum (no BitmapSink indirection) + let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); + for op in &ops { + if let Op::Set { field, value } = op { + let qval = json_to_qvalue(value); + if let Some((_, _)) = meta_ref.filter_fields.get(field.as_str()) { + if let Some(key) = value_to_bitmap_key(&qval) { + acc.filter_maps + .get_mut(field.as_str()) + .map(|m| m.entry(key).or_insert_with(roaring::RoaringBitmap::new).insert(slot)); + } + } + if let Some((_, num_bits)) = meta_ref.sort_fields.get(field.as_str()) { + if let Some(sort_val) = value_to_sort_u32(&qval) { + for bit in 0..*num_bits { + if (sort_val >> bit) & 1 == 1 { + acc.sort_maps + .get_mut(field.as_str()) + .map(|m| m.entry(bit).or_insert_with(roaring::RoaringBitmap::new).insert(slot)); + } + } + } + } } - _ => {} } - } - sink.alive_insert(slot); - } - total_applied += 1; - } - eprintln!(" images: {rows} rows, {:.1}s ({:.0}/s)", + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + let rows = accum.count as u64; + total_applied += rows; + total_errors += accum.errors; + + 
eprintln!(" images: {rows} rows, {:.1}s read + {:.1}s parse = {:.1}s ({:.0}/s)", + read_elapsed.as_secs_f64(), + img_start.elapsed().as_secs_f64() - read_elapsed.as_secs_f64(), img_start.elapsed().as_secs_f64(), rows as f64 / img_start.elapsed().as_secs_f64().max(0.001)); + + engine.apply_accum(&accum); } - // Phase 2: Tags (multi-value, no alive) + // Phase 2: Tags (rayon parallel parse) let tags_csv = csv_dir.join("tags.csv"); if tags_csv.exists() { - let file = File::open(&tags_csv).expect("open tags.csv"); - let reader = BufReader::with_capacity(8 * 1024 * 1024, file); - let mut rows = 0u64; let tag_start = Instant::now(); + let lines = read_lines(&tags_csv, limit); + + let f_names = &filter_names; + let s_configs = &sort_configs; + + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let (tag_id, image_id) = match parse_tag_row(&line) { + Some(pair) => pair, + None => { acc.errors += 1; return acc; } + }; + let slot = image_id as u32; + if let Some(m) = acc.filter_maps.get_mut("tagIds") { + m.entry(tag_id as u64) + .or_insert_with(roaring::RoaringBitmap::new) + .insert(slot); + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); - for line in reader.split(b'\n') { - let line = match line { - Ok(l) => l, - Err(_) => continue, - }; - if line.is_empty() { continue; } - if let Some(max) = limit { - if rows >= max { break; } - } + let rows = accum.count as u64; + total_applied += rows; + total_errors += accum.errors; - let (tag_id, image_id) = match parse_tag_row(&line) { - Some(pair) => pair, - None => continue, - }; - rows += 1; - - let slot = image_id as u32; - let qval = QValue::Integer(tag_id); - if let Some((arc_name, _)) = meta.filter_fields.get("tagIds") { - if let Some(key) = value_to_bitmap_key(&qval) { - let mut sink = AccumSink::new(&mut accum); - sink.filter_insert(arc_name.clone(), key, slot); - } - } - total_applied += 1; - } 
eprintln!(" tags: {rows} rows, {:.1}s ({:.0}/s)", tag_start.elapsed().as_secs_f64(), rows as f64 / tag_start.elapsed().as_secs_f64().max(0.001)); + + engine.apply_accum(&accum); } - // Phase 3: Tools (multi-value, no alive) + // Phase 3: Tools (rayon parallel parse) let tools_csv = csv_dir.join("tools.csv"); if tools_csv.exists() { - let file = File::open(&tools_csv).expect("open tools.csv"); - let reader = BufReader::with_capacity(8 * 1024 * 1024, file); - let mut rows = 0u64; let tool_start = Instant::now(); + let lines = read_lines(&tools_csv, limit); + + let f_names = &filter_names; + let s_configs = &sort_configs; + + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let (tool_id, image_id) = match parse_tool_row(&line) { + Some(pair) => pair, + None => { acc.errors += 1; return acc; } + }; + let slot = image_id as u32; + if let Some(m) = acc.filter_maps.get_mut("toolIds") { + m.entry(tool_id as u64) + .or_insert_with(roaring::RoaringBitmap::new) + .insert(slot); + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); - for line in reader.split(b'\n') { - let line = match line { - Ok(l) => l, - Err(_) => continue, - }; - if line.is_empty() { continue; } - if let Some(max) = limit { - if rows >= max { break; } - } + let rows = accum.count as u64; + total_applied += rows; + total_errors += accum.errors; - let (tool_id, image_id) = match parse_tool_row(&line) { - Some(pair) => pair, - None => continue, - }; - rows += 1; - - let slot = image_id as u32; - let qval = QValue::Integer(tool_id); - if let Some((arc_name, _)) = meta.filter_fields.get("toolIds") { - if let Some(key) = value_to_bitmap_key(&qval) { - let mut sink = AccumSink::new(&mut accum); - sink.filter_insert(arc_name.clone(), key, slot); - } - } - total_applied += 1; - } eprintln!(" tools: {rows} rows, {:.1}s ({:.0}/s)", tool_start.elapsed().as_secs_f64(), rows as f64 / 
tool_start.elapsed().as_secs_f64().max(0.001)); + + engine.apply_accum(&accum); } - // Apply accumulated bitmaps to engine staging - eprintln!(" Applying accum to staging..."); - let apply_start = Instant::now(); - engine.apply_accum(&accum); - eprintln!(" Apply: {:.3}s", apply_start.elapsed().as_secs_f64()); + eprintln!(" Total: {total_applied} ops in {:.1}s ({:.0}/s)", + start.elapsed().as_secs_f64(), + total_applied as f64 / start.elapsed().as_secs_f64().max(0.001)); (total_applied, total_errors, start.elapsed().as_secs_f64()) } From 873549c478d1522883f54c420f4766f228b3b2d7 Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 20:12:43 -0600 Subject: [PATCH 15/19] =?UTF-8?q?fix:=20address=20Ivanna=20review=20?= =?UTF-8?q?=E2=80=94=20incremental=20WAL=20reads=20+=20loading=20mode=20in?= =?UTF-8?q?=20apply=5Faccum?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F1 (Critical): WAL reader was reading entire file every poll via fs::read(). Now uses seek(cursor) + read_exact() for incremental reads. O(new_data) per poll instead of O(file_size). F2 (Critical): apply_accum() cloned snapshot without loading mode, triggering Arc clone cascade (94s stalls at 105M). Now enters/exits loading mode automatically — staging refcount=1, no deep clones. Also: chunked block reader for direct dump path (300MB blocks via reader thread). Prevents OOM on 67GB tags.csv at full scale. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/concurrent_engine.rs | 21 ++- src/ops_processor.rs | 359 ++++++++++++++++++++++++--------------- src/ops_wal.rs | 33 ++-- 3 files changed, 258 insertions(+), 155 deletions(-) diff --git a/src/concurrent_engine.rs b/src/concurrent_engine.rs index fbe2c03a..cc510f5b 100644 --- a/src/concurrent_engine.rs +++ b/src/concurrent_engine.rs @@ -6492,14 +6492,22 @@ impl ConcurrentEngine { /// Apply a BitmapAccum's accumulated bitmaps directly to staging. 
/// /// Used by the dump pipeline (Sync V2) to apply ops-derived bitmaps - /// without going through the coalescer channel. Must be called while - /// in loading mode (enter_loading_mode → apply_accum → exit_loading_mode). + /// without going through the coalescer channel. + /// + /// Enters loading mode before cloning staging (ensures refcount=1, avoiding + /// the Arc clone cascade). Exits loading mode after to publish the snapshot. + /// Safe to call multiple times — each call is a self-contained load cycle. /// /// ORs filter bitmaps, sort layer bitmaps, and alive bitmap into staging. pub fn apply_accum(&self, accum: &crate::loader::BitmapAccum) { + // Enter loading mode: drops published snapshot's Arc refs so the staging + // clone below is the sole owner (refcount=1). This avoids the deep-clone + // cascade from Arc::make_mut() that caused 94s stalls in PR #78. + self.enter_loading_mode(); + let snap = self.inner.load_full(); - // Clone staging for mutation — loading mode means no snapshot publishing overhead let mut staging = (*snap).clone(); + drop(snap); // Release Arc ref before mutation // Apply filter bitmaps for (field_name, value_map) in &accum.filter_maps { @@ -6519,13 +6527,12 @@ impl ConcurrentEngine { } } - // Apply alive bitmap + // Apply alive bitmap (also updates slot counter) staging.slots.alive_or_bitmap(&accum.alive); - // alive_or_bitmap already updates the slot counter to max(alive) + 1 - - // Store back — in loading mode, flush thread won't publish intermediate snapshots + // Store and publish self.inner.store(Arc::new(staging)); + self.exit_loading_mode(); } /// Build all bitmap indexes from the docstore. diff --git a/src/ops_processor.rs b/src/ops_processor.rs index 14490723..832639c9 100644 --- a/src/ops_processor.rs +++ b/src/ops_processor.rs @@ -518,10 +518,11 @@ pub fn process_wal_dump( (total_applied, total_errors, start.elapsed().as_secs_f64()) } -/// Direct dump pipeline: CSV → rayon parallel parse → BitmapAccum → apply. 
+/// Direct dump pipeline: CSV → chunked reader → rayon parallel parse → BitmapAccum → apply. /// -/// Bypasses WAL entirely. Uses rayon fold+reduce for parallel CSV parsing, -/// matching the single-pass loader's throughput pattern. +/// Bypasses WAL entirely. Uses a reader thread + rayon fold+reduce for parallel +/// CSV parsing, matching the single-pass loader's throughput pattern. Memory-safe +/// at any scale — reads in ~300MB blocks, never loads the full file. /// /// Returns (total_applied, total_errors, elapsed_secs). pub fn process_csv_dump_direct( @@ -533,6 +534,8 @@ pub fn process_csv_dump_direct( use crate::loader::BitmapAccum; use crate::pg_sync::copy_queries::{parse_image_row, parse_tag_row, parse_tool_row}; use rayon::prelude::*; + use std::io::Read; + use std::thread; use std::time::Instant; let config = engine.config(); @@ -544,181 +547,261 @@ pub fn process_csv_dump_direct( let start = Instant::now(); let mut total_applied = 0u64; let mut total_errors = 0u64; - - // Helper: read file lines (respecting limit), return Vec> - let read_lines = |path: &Path, limit: Option| -> Vec> { - use std::io::{BufRead, BufReader}; - let file = std::fs::File::open(path).expect("open CSV"); - let reader = BufReader::with_capacity(8 * 1024 * 1024, file); - let mut lines = Vec::new(); - for line in reader.split(b'\n') { - if let Some(max) = limit { - if lines.len() as u64 >= max { break; } - } - match line { - Ok(l) if !l.is_empty() => lines.push(l), - _ => {} + let record_limit = limit.map(|l| l as usize).unwrap_or(usize::MAX); + + // Chunked reader: sends ~300MB blocks of complete lines through a channel. + // Same pattern as the single-pass loader. Never loads full file into memory. 
+ fn spawn_block_reader( + path: std::path::PathBuf, + target_bytes: usize, + ) -> (std::sync::mpsc::Receiver>, thread::JoinHandle<()>) { + let (tx, rx) = std::sync::mpsc::sync_channel::>(2); + let handle = thread::spawn(move || { + let file = std::fs::File::open(&path).expect("open CSV"); + let mut reader = std::io::BufReader::with_capacity(16 * 1024 * 1024, file); + let mut buf = vec![0u8; 4 * 1024 * 1024]; + let mut accum = Vec::::with_capacity(target_bytes + 4 * 1024 * 1024); + + loop { + let n = reader.read(&mut buf).unwrap_or(0); + if n == 0 { + if !accum.is_empty() { + let _ = tx.send(accum); + } + break; + } + accum.extend_from_slice(&buf[..n]); + + if accum.len() >= target_bytes { + // Split at last newline to keep lines intact + if let Some(last_nl) = accum.iter().rposition(|&b| b == b'\n') { + let remainder = accum[last_nl + 1..].to_vec(); + accum.truncate(last_nl + 1); + let batch = std::mem::replace( + &mut accum, + Vec::with_capacity(target_bytes + 4 * 1024 * 1024), + ); + accum = remainder; + if tx.send(batch).is_err() { + break; + } + } + } } - } - lines - }; + }); + (rx, handle) + } + + let target_block_bytes = 300 * 1024 * 1024; // ~300MB per block - // Phase 1: Images (rayon parallel parse → BitmapAccum) + // Phase 1: Images (creates alive slots) let images_csv = csv_dir.join("images.csv"); if images_csv.exists() { let img_start = Instant::now(); - let lines = read_lines(&images_csv, limit); - let read_elapsed = img_start.elapsed(); + let (rx, reader_handle) = spawn_block_reader(images_csv, target_block_bytes); let f_names = &filter_names; let s_configs = &sort_configs; let meta_ref = &meta; + let mut phase_total = 0usize; + let mut phase_errors = 0u64; + + while let Ok(block) = rx.recv() { + if phase_total >= record_limit { break; } + + let block_str = match std::str::from_utf8(&block) { + Ok(s) => s, + Err(_) => continue, + }; + let mut lines: Vec<&[u8]> = block_str + .split('\n') + .filter(|l| !l.is_empty()) + .map(|l| l.as_bytes()) + 
.collect(); + let remaining = record_limit.saturating_sub(phase_total); + if lines.len() > remaining { + lines.truncate(remaining); + } - let accum = lines - .into_par_iter() - .fold( - || BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let row = match parse_image_row(&line) { - Some(r) => r, - None => { acc.errors += 1; return acc; } - }; - let slot = row.id as u32; - acc.alive.insert(slot); - - // Process ops directly into accum (no BitmapSink indirection) - let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); - for op in &ops { - if let Op::Set { field, value } = op { - let qval = json_to_qvalue(value); - if let Some((_, _)) = meta_ref.filter_fields.get(field.as_str()) { - if let Some(key) = value_to_bitmap_key(&qval) { - acc.filter_maps - .get_mut(field.as_str()) - .map(|m| m.entry(key).or_insert_with(roaring::RoaringBitmap::new).insert(slot)); + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let row = match parse_image_row(line) { + Some(r) => r, + None => { acc.errors += 1; return acc; } + }; + let slot = row.id as u32; + acc.alive.insert(slot); + + let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); + for op in &ops { + if let Op::Set { field, value } = op { + let qval = json_to_qvalue(value); + if let Some((_, _)) = meta_ref.filter_fields.get(field.as_str()) { + if let Some(key) = value_to_bitmap_key(&qval) { + acc.filter_maps + .get_mut(field.as_str()) + .map(|m| m.entry(key).or_insert_with(roaring::RoaringBitmap::new).insert(slot)); + } } - } - if let Some((_, num_bits)) = meta_ref.sort_fields.get(field.as_str()) { - if let Some(sort_val) = value_to_sort_u32(&qval) { - for bit in 0..*num_bits { - if (sort_val >> bit) & 1 == 1 { - acc.sort_maps - .get_mut(field.as_str()) - .map(|m| m.entry(bit).or_insert_with(roaring::RoaringBitmap::new).insert(slot)); + if let Some((_, num_bits)) = meta_ref.sort_fields.get(field.as_str()) { + if let Some(sort_val) = 
value_to_sort_u32(&qval) { + for bit in 0..*num_bits { + if (sort_val >> bit) & 1 == 1 { + acc.sort_maps + .get_mut(field.as_str()) + .map(|m| m.entry(bit).or_insert_with(roaring::RoaringBitmap::new).insert(slot)); + } } } } } } - } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - let rows = accum.count as u64; - total_applied += rows; - total_errors += accum.errors; + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + phase_total += accum.count; + phase_errors += accum.errors; + engine.apply_accum(&accum); + } + let _ = reader_handle.join(); + total_applied += phase_total as u64; + total_errors += phase_errors; - eprintln!(" images: {rows} rows, {:.1}s read + {:.1}s parse = {:.1}s ({:.0}/s)", - read_elapsed.as_secs_f64(), - img_start.elapsed().as_secs_f64() - read_elapsed.as_secs_f64(), + eprintln!(" images: {} rows, {:.1}s ({:.0}/s)", + phase_total, img_start.elapsed().as_secs_f64(), - rows as f64 / img_start.elapsed().as_secs_f64().max(0.001)); - - engine.apply_accum(&accum); + phase_total as f64 / img_start.elapsed().as_secs_f64().max(0.001)); } - // Phase 2: Tags (rayon parallel parse) + // Phase 2: Tags (chunked reader + rayon) let tags_csv = csv_dir.join("tags.csv"); if tags_csv.exists() { let tag_start = Instant::now(); - let lines = read_lines(&tags_csv, limit); + let (rx, reader_handle) = spawn_block_reader(tags_csv, target_block_bytes); let f_names = &filter_names; let s_configs = &sort_configs; + let mut phase_total = 0usize; + let mut phase_errors = 0u64; + + while let Ok(block) = rx.recv() { + if phase_total >= record_limit { break; } + + let mut lines: Vec<&[u8]> = block + .split(|&b| b == b'\n') + .filter(|l| !l.is_empty()) + .collect(); + let remaining = record_limit.saturating_sub(phase_total); + if lines.len() > remaining { + lines.truncate(remaining); + } - let accum = lines - .into_par_iter() - .fold( - || 
BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let (tag_id, image_id) = match parse_tag_row(&line) { - Some(pair) => pair, - None => { acc.errors += 1; return acc; } - }; - let slot = image_id as u32; - if let Some(m) = acc.filter_maps.get_mut("tagIds") { - m.entry(tag_id as u64) - .or_insert_with(roaring::RoaringBitmap::new) - .insert(slot); - } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - let rows = accum.count as u64; - total_applied += rows; - total_errors += accum.errors; + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let (tag_id, image_id) = match parse_tag_row(line) { + Some(pair) => pair, + None => { acc.errors += 1; return acc; } + }; + let slot = image_id as u32; + if let Some(m) = acc.filter_maps.get_mut("tagIds") { + m.entry(tag_id as u64) + .or_insert_with(roaring::RoaringBitmap::new) + .insert(slot); + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + phase_total += accum.count; + phase_errors += accum.errors; + engine.apply_accum(&accum); + } + let _ = reader_handle.join(); + total_applied += phase_total as u64; + total_errors += phase_errors; - eprintln!(" tags: {rows} rows, {:.1}s ({:.0}/s)", + eprintln!(" tags: {} rows, {:.1}s ({:.0}/s)", + phase_total, tag_start.elapsed().as_secs_f64(), - rows as f64 / tag_start.elapsed().as_secs_f64().max(0.001)); - - engine.apply_accum(&accum); + phase_total as f64 / tag_start.elapsed().as_secs_f64().max(0.001)); } - // Phase 3: Tools (rayon parallel parse) + // Phase 3: Tools (chunked reader + rayon) let tools_csv = csv_dir.join("tools.csv"); if tools_csv.exists() { let tool_start = Instant::now(); - let lines = read_lines(&tools_csv, limit); + let (rx, reader_handle) = spawn_block_reader(tools_csv, target_block_bytes); let f_names = &filter_names; let s_configs = &sort_configs; + let mut 
phase_total = 0usize; + let mut phase_errors = 0u64; + + while let Ok(block) = rx.recv() { + if phase_total >= record_limit { break; } + + let mut lines: Vec<&[u8]> = block + .split(|&b| b == b'\n') + .filter(|l| !l.is_empty()) + .collect(); + let remaining = record_limit.saturating_sub(phase_total); + if lines.len() > remaining { + lines.truncate(remaining); + } - let accum = lines - .into_par_iter() - .fold( - || BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let (tool_id, image_id) = match parse_tool_row(&line) { - Some(pair) => pair, - None => { acc.errors += 1; return acc; } - }; - let slot = image_id as u32; - if let Some(m) = acc.filter_maps.get_mut("toolIds") { - m.entry(tool_id as u64) - .or_insert_with(roaring::RoaringBitmap::new) - .insert(slot); - } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - let rows = accum.count as u64; - total_applied += rows; - total_errors += accum.errors; + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let (tool_id, image_id) = match parse_tool_row(line) { + Some(pair) => pair, + None => { acc.errors += 1; return acc; } + }; + let slot = image_id as u32; + if let Some(m) = acc.filter_maps.get_mut("toolIds") { + m.entry(tool_id as u64) + .or_insert_with(roaring::RoaringBitmap::new) + .insert(slot); + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + phase_total += accum.count; + phase_errors += accum.errors; + engine.apply_accum(&accum); + } + let _ = reader_handle.join(); + total_applied += phase_total as u64; + total_errors += phase_errors; - eprintln!(" tools: {rows} rows, {:.1}s ({:.0}/s)", + eprintln!(" tools: {} rows, {:.1}s ({:.0}/s)", + phase_total, tool_start.elapsed().as_secs_f64(), - rows as f64 / tool_start.elapsed().as_secs_f64().max(0.001)); - - engine.apply_accum(&accum); + phase_total as f64 / 
tool_start.elapsed().as_secs_f64().max(0.001)); } eprintln!(" Total: {total_applied} ops in {:.1}s ({:.0}/s)", diff --git a/src/ops_wal.rs b/src/ops_wal.rs index b64132fb..cc90ac57 100644 --- a/src/ops_wal.rs +++ b/src/ops_wal.rs @@ -12,7 +12,7 @@ //! Partial records at EOF are skipped (crash recovery). use std::fs::{self, File, OpenOptions}; -use std::io::{self, Read, Write}; +use std::io::{self, Read, Seek, Write}; use std::path::{Path, PathBuf}; use crate::pg_sync::ops::{EntityOps, Op}; @@ -112,8 +112,8 @@ impl WalReader { } /// Read up to `max_records` from the WAL starting at the current cursor. - /// Advances the cursor past successfully read records. - /// Stops at EOF or on partial/corrupted records. + /// Uses incremental seek+read — only reads new data from the cursor position, + /// not the entire file. Safe for large WAL files. pub fn read_batch(&mut self, max_records: usize) -> io::Result { if !self.path.exists() { return Ok(WalBatch { @@ -124,11 +124,26 @@ impl WalReader { }); } - let data = fs::read(&self.path)?; + let file_len = fs::metadata(&self.path)?.len(); + if self.cursor >= file_len { + return Ok(WalBatch { + entries: Vec::new(), + new_cursor: self.cursor, + bytes_read: 0, + crc_failures: 0, + }); + } + + // Read only from cursor to EOF (incremental, not full-file read) + let mut file = File::open(&self.path)?; + file.seek(std::io::SeekFrom::Start(self.cursor))?; + let remaining = (file_len - self.cursor) as usize; + let mut data = vec![0u8; remaining]; + file.read_exact(&mut data)?; + let mut entries = Vec::new(); - let mut pos = self.cursor as usize; + let mut pos = 0usize; let mut crc_failures = 0u64; - let start_pos = pos; while entries.len() < max_records && pos + HEADER_SIZE <= data.len() { // Read header: [4-byte len][8-byte entity_id][1-byte flags] @@ -155,7 +170,6 @@ impl WalReader { let computed_crc = crc32fast::hash(crc_input); if stored_crc != computed_crc { - // CRC failure — skip this record crc_failures += 1; pos = record_end; 
continue; @@ -168,7 +182,6 @@ impl WalReader { entries.push(EntityOps { entity_id, ops, creates_slot }); } Err(_) => { - // Invalid JSON — skip crc_failures += 1; } } @@ -176,8 +189,8 @@ impl WalReader { pos = record_end; } - let bytes_read = (pos - start_pos) as u64; - self.cursor = pos as u64; + let bytes_read = pos as u64; + self.cursor += bytes_read; Ok(WalBatch { entries, From f0c296824b16784e7237bf07bccd220e529193dd Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 20:22:03 -0600 Subject: [PATCH 16/19] feat: wire ClickHouse metrics poller into V2 ops pipeline MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CH poller now emits Op::Set for reactionCount/commentCount/collectedCount instead of V1 full-document patches. Removes PG dependency (metrics are self-contained sort values from ClickHouse). Batched at 5K entities per POST /ops request. creates_slot: false (sort-only, no alive bit changes). 6 unit tests for metrics→ops conversion. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/bin/pg_sync.rs | 1 - src/pg_sync/metrics_poller.rs | 270 ++++++++++++++++++++++++++++------ 2 files changed, 227 insertions(+), 44 deletions(-) diff --git a/src/bin/pg_sync.rs b/src/bin/pg_sync.rs index 1107e2b8..b30f08cb 100644 --- a/src/bin/pg_sync.rs +++ b/src/bin/pg_sync.rs @@ -352,7 +352,6 @@ async fn main() { password: sync_config.clickhouse_password.clone(), }; let metrics_fut = metrics_poller::run_metrics_poller( - &pool, &ch_config, &bitdex_client, sync_config.metrics_poll_interval_secs, diff --git a/src/pg_sync/metrics_poller.rs b/src/pg_sync/metrics_poller.rs index fe91f9fb..b7059d53 100644 --- a/src/pg_sync/metrics_poller.rs +++ b/src/pg_sync/metrics_poller.rs @@ -1,18 +1,20 @@ //! ClickHouse metrics poller: polls for recent metric events, fetches aggregate -//! counts, rebuilds full docs from PG, and pushes to Bitdex. +//! counts, and pushes sort-field ops to BitDex via the V2 ops pipeline. //! //! 
ClickHouse is queried via its HTTP interface (POST with SQL). +//! Metrics (reactionCount, commentCount, collectedCount) are sort-only fields, +//! so ops are sent with `creates_slot: false` — they update existing slots +//! without touching the alive bitmap. use std::collections::HashMap; use std::time::{SystemTime, UNIX_EPOCH}; use reqwest::Client; -use sqlx::PgPool; +use serde_json::json; use tokio::time::{Duration, interval}; use super::bitdex_client::BitdexClient; -use super::queries; -use super::row_assembler::{assemble_batch, EnrichmentData, MetricInfo}; +use super::ops::{EntityOps, Op, OpsBatch, SyncMeta}; /// ClickHouse connection config. pub struct ClickHouseConfig { @@ -21,9 +23,19 @@ pub struct ClickHouseConfig { pub password: Option, } +/// Aggregate metric counts for a single image from ClickHouse. +struct MetricInfo { + reaction_count: i64, + comment_count: i64, + collected_count: i64, +} + /// Run the ClickHouse metrics poller loop. Runs forever until cancelled. +/// +/// V2 pipeline: fetches aggregate counts from ClickHouse, converts them to +/// `Op::Set` ops for sort fields, and POSTs via the `/ops` endpoint. +/// No PG round-trip needed — metrics are self-contained sort-field updates. pub async fn run_metrics_poller( - pool: &PgPool, ch_config: &ClickHouseConfig, bitdex_client: &BitdexClient, poll_interval_secs: u64, @@ -41,7 +53,7 @@ pub async fn run_metrics_poller( loop { ticker.tick().await; - // Health gate: skip ClickHouse + PG fetch if BitDex is unreachable. + // Health gate: skip ClickHouse fetch if BitDex is unreachable. 
if !bitdex_client.is_healthy().await { if !bitdex_was_down { eprintln!("Metrics: BitDex is unreachable, pausing until healthy"); @@ -56,10 +68,10 @@ pub async fn run_metrics_poller( let now = current_epoch_secs(); - match poll_metrics_and_push(pool, &http, ch_config, bitdex_client, last_poll_ts).await { + match poll_metrics_and_push(&http, ch_config, bitdex_client, last_poll_ts).await { Ok(count) => { if count > 0 { - eprintln!("Metrics: updated {count} documents"); + eprintln!("Metrics: pushed {count} ops batches"); } last_poll_ts = now; } @@ -78,9 +90,12 @@ fn current_epoch_secs() -> i64 { .as_secs() as i64 } -/// Single poll + push cycle. +/// Maximum number of entity ops per HTTP request to `/ops`. +/// Keeps request bodies reasonable and avoids timeouts. +const OPS_BATCH_SIZE: usize = 5_000; + +/// Single poll + push cycle. Fetches CH metrics, converts to V2 ops, POSTs to BitDex. async fn poll_metrics_and_push( - pool: &PgPool, http: &Client, ch_config: &ClickHouseConfig, bitdex_client: &BitdexClient, @@ -93,41 +108,25 @@ async fn poll_metrics_and_push( return Ok(0); } - let image_ids: Vec = metrics.keys().copied().collect(); - - // Fetch full documents from PG (same enrichment pipeline as outbox) - let images = queries::fetch_images_by_ids(pool, &image_ids) - .await - .map_err(|e| format!("fetch_images_by_ids: {e}"))?; - - if images.is_empty() { - return Ok(0); - } - - let fetched_ids: Vec = images.iter().map(|r| r.id).collect(); - - let (tags, tools, techniques, resources) = tokio::try_join!( - queries::fetch_tags(pool, &fetched_ids), - queries::fetch_tools(pool, &fetched_ids), - queries::fetch_techniques(pool, &fetched_ids), - queries::fetch_resources(pool, &fetched_ids), - ) - .map_err(|e| format!("enrichment queries: {e}"))?; - - let mut enrichment = EnrichmentData::from_rows(tags, tools, techniques, resources); - - // Merge ClickHouse metrics into enrichment - enrichment.metrics = metrics; - - let docs = assemble_batch(&images, &enrichment); - let count 
= docs.len(); - - // Use PATCH for metrics updates — preserves fields not included in this update. - if !docs.is_empty() { - bitdex_client.patch_batch(&docs, None).await?; + let entity_ops = metrics_to_entity_ops(metrics); + + let total = entity_ops.len(); + + // Send in batches to keep request sizes manageable. + for chunk in entity_ops.chunks(OPS_BATCH_SIZE) { + let batch = OpsBatch { + ops: chunk.to_vec(), + meta: Some(SyncMeta { + source: "clickhouse-metrics".into(), + cursor: None, + max_id: None, + lag_rows: None, + }), + }; + bitdex_client.post_ops(&batch).await?; } - Ok(count) + Ok(total) } /// Query ClickHouse HTTP interface for aggregate metrics. @@ -214,3 +213,188 @@ async fn fetch_metrics_from_clickhouse( Ok(metrics) } + +/// Convert a map of CH metrics into V2 EntityOps. +/// +/// Each image gets three `Op::Set` ops (reactionCount, commentCount, collectedCount). +/// `creates_slot` is false because these are sort-only field updates — they should +/// never create new alive slots. 
+fn metrics_to_entity_ops(metrics: HashMap) -> Vec { + metrics + .into_iter() + .map(|(image_id, info)| { + EntityOps::new( + image_id, + vec![ + Op::Set { + field: "reactionCount".into(), + value: json!(info.reaction_count), + }, + Op::Set { + field: "commentCount".into(), + value: json!(info.comment_count), + }, + Op::Set { + field: "collectedCount".into(), + value: json!(info.collected_count), + }, + ], + ) + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_metrics_to_entity_ops_single() { + let mut metrics = HashMap::new(); + metrics.insert( + 42, + MetricInfo { + reaction_count: 100, + comment_count: 5, + collected_count: 3, + }, + ); + + let ops = metrics_to_entity_ops(metrics); + assert_eq!(ops.len(), 1); + + let entity = &ops[0]; + assert_eq!(entity.entity_id, 42); + assert!(!entity.creates_slot, "metrics ops must not create slots"); + assert_eq!(entity.ops.len(), 3); + + // Verify all three sort fields are present as Set ops + let fields: Vec<&str> = entity + .ops + .iter() + .filter_map(|op| match op { + Op::Set { field, .. 
} => Some(field.as_str()), + _ => None, + }) + .collect(); + assert!(fields.contains(&"reactionCount")); + assert!(fields.contains(&"commentCount")); + assert!(fields.contains(&"collectedCount")); + } + + #[test] + fn test_metrics_to_entity_ops_values() { + let mut metrics = HashMap::new(); + metrics.insert( + 99, + MetricInfo { + reaction_count: 1234, + comment_count: 56, + collected_count: 78, + }, + ); + + let ops = metrics_to_entity_ops(metrics); + let entity = &ops[0]; + + for op in &entity.ops { + match op { + Op::Set { field, value } => match field.as_str() { + "reactionCount" => assert_eq!(value, &json!(1234)), + "commentCount" => assert_eq!(value, &json!(56)), + "collectedCount" => assert_eq!(value, &json!(78)), + other => panic!("unexpected field: {other}"), + }, + other => panic!("expected Op::Set, got {other:?}"), + } + } + } + + #[test] + fn test_metrics_to_entity_ops_empty() { + let metrics = HashMap::new(); + let ops = metrics_to_entity_ops(metrics); + assert!(ops.is_empty()); + } + + #[test] + fn test_metrics_to_entity_ops_multiple_images() { + let mut metrics = HashMap::new(); + for id in 1..=100 { + metrics.insert( + id, + MetricInfo { + reaction_count: id * 10, + comment_count: id, + collected_count: id / 2, + }, + ); + } + + let ops = metrics_to_entity_ops(metrics); + assert_eq!(ops.len(), 100); + + // Every entry should have creates_slot = false and 3 ops + for entity in &ops { + assert!(!entity.creates_slot); + assert_eq!(entity.ops.len(), 3); + } + } + + #[test] + fn test_metrics_ops_batch_serialization() { + let mut metrics = HashMap::new(); + metrics.insert( + 42, + MetricInfo { + reaction_count: 100, + comment_count: 5, + collected_count: 3, + }, + ); + + let entity_ops = metrics_to_entity_ops(metrics); + let batch = OpsBatch { + ops: entity_ops, + meta: Some(SyncMeta { + source: "clickhouse-metrics".into(), + cursor: None, + max_id: None, + lag_rows: None, + }), + }; + + // Verify it serializes to valid JSON matching the expected ops 
format + let json = serde_json::to_value(&batch).unwrap(); + assert_eq!(json["meta"]["source"], "clickhouse-metrics"); + assert_eq!(json["ops"].as_array().unwrap().len(), 1); + + let first = &json["ops"][0]; + assert_eq!(first["entity_id"], 42); + assert_eq!(first["creates_slot"], false); + assert_eq!(first["ops"].as_array().unwrap().len(), 3); + } + + #[test] + fn test_metrics_zero_counts() { + let mut metrics = HashMap::new(); + metrics.insert( + 1, + MetricInfo { + reaction_count: 0, + comment_count: 0, + collected_count: 0, + }, + ); + + let ops = metrics_to_entity_ops(metrics); + assert_eq!(ops.len(), 1); + // Zero counts should still produce Set ops (correct cumulative value) + assert_eq!(ops[0].ops.len(), 3); + for op in &ops[0].ops { + if let Op::Set { value, .. } = op { + assert_eq!(value, &json!(0)); + } + } + } +} From 52529f689bc8fd80adc3db1b671d9b0b5081960d Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 20:24:19 -0600 Subject: [PATCH 17/19] fix: skip WAL phase in direct mode validation harness MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Direct mode reads CSVs directly — no need to write WAL first. Eliminates ~40s wasted I/O per run. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/validate_ops_pipeline.rs | 43 ++++++++++++++++++------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/examples/validate_ops_pipeline.rs b/examples/validate_ops_pipeline.rs index a16b235d..a040d9b8 100644 --- a/examples/validate_ops_pipeline.rs +++ b/examples/validate_ops_pipeline.rs @@ -48,25 +48,32 @@ fn main() { std::fs::create_dir_all(&bitmap_dir).ok(); std::fs::create_dir_all(&docs_dir).ok(); - // Phase 1: CSV → WAL - eprintln!("\n--- Phase 1: CSV → WAL ---"); - let csv_start = Instant::now(); - let csv_results = run_csv_dump( - &PathBuf::from(&csv_dir), - &wal_path, - 10_000, - Some(limit), - ) - .expect("CSV dump failed"); - - let csv_elapsed = csv_start.elapsed(); - let total_ops: u64 = csv_results.iter().map(|(_, s)| s.ops_written).sum(); + // Phase 1: CSV → WAL (skip for direct mode — it reads CSVs directly) + let csv_results: Vec<(String, bitdex_v2::pg_sync::csv_ops::CsvOpsStats)> = if direct { + eprintln!("\n--- Phase 1: Skipped (direct mode) ---"); + Vec::new() + } else { + eprintln!("\n--- Phase 1: CSV → WAL ---"); + let csv_start = Instant::now(); + let results = run_csv_dump( + &PathBuf::from(&csv_dir), + &wal_path, + 10_000, + Some(limit), + ) + .expect("CSV dump failed"); + + let csv_elapsed = csv_start.elapsed(); + let total_ops: u64 = results.iter().map(|(_, s)| s.ops_written).sum(); + let total_rows: u64 = results.iter().map(|(_, s)| s.rows_read).sum(); + eprintln!("\nCSV → WAL complete:"); + eprintln!(" Total rows: {total_rows}"); + eprintln!(" Total ops: {total_ops}"); + eprintln!(" Time: {:.2}s", csv_elapsed.as_secs_f64()); + eprintln!(" Throughput: {:.0} rows/s", total_rows as f64 / csv_elapsed.as_secs_f64().max(0.001)); + results + }; let total_rows: u64 = csv_results.iter().map(|(_, s)| s.rows_read).sum(); - eprintln!("\nCSV → WAL complete:"); - eprintln!(" Total rows: {total_rows}"); - eprintln!(" Total ops: {total_ops}"); - eprintln!(" Time: {:.2}s", 
csv_elapsed.as_secs_f64()); - eprintln!(" Throughput: {:.0} rows/s", total_rows as f64 / csv_elapsed.as_secs_f64().max(0.001)); // Phase 2: WAL → Engine eprintln!("\n--- Phase 2: WAL → Engine ---"); From 761a044134919c009f258bb104013f737c2fdffb Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 20:51:49 -0600 Subject: [PATCH 18/19] =?UTF-8?q?fix:=20simplify=20direct=20dump=20?= =?UTF-8?q?=E2=80=94=20read=5Flines=20+=20loading=20mode,=20fix=20chunked?= =?UTF-8?q?=20reader=20hang?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts to simple read_lines approach (no chunked block reader thread). The chunked reader deadlocked due to sync_channel backpressure with rayon. Loading mode entered once for the entire dump, exited after all tables. Headless engines get a harmless timeout warning (no flush thread). 1M benchmark: 2.5M ops/s (images 1.35M/s, tags 5.1M/s, tools 6.8M/s). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/concurrent_engine.rs | 19 +-- src/ops_processor.rs | 342 +++++++++++++++------------------------ 2 files changed, 140 insertions(+), 221 deletions(-) diff --git a/src/concurrent_engine.rs b/src/concurrent_engine.rs index cc510f5b..25fe5726 100644 --- a/src/concurrent_engine.rs +++ b/src/concurrent_engine.rs @@ -6494,20 +6494,18 @@ impl ConcurrentEngine { /// Used by the dump pipeline (Sync V2) to apply ops-derived bitmaps /// without going through the coalescer channel. /// - /// Enters loading mode before cloning staging (ensures refcount=1, avoiding - /// the Arc clone cascade). Exits loading mode after to publish the snapshot. - /// Safe to call multiple times — each call is a self-contained load cycle. + /// **Caller must be in loading mode** (`enter_loading_mode()` before first call, + /// `exit_loading_mode()` after all accums are applied). This avoids the Arc clone + /// cascade — in loading mode, staging refcount=1 so clone is cheap. 
/// /// ORs filter bitmaps, sort layer bitmaps, and alive bitmap into staging. pub fn apply_accum(&self, accum: &crate::loader::BitmapAccum) { - // Enter loading mode: drops published snapshot's Arc refs so the staging - // clone below is the sole owner (refcount=1). This avoids the deep-clone - // cascade from Arc::make_mut() that caused 94s stalls in PR #78. - self.enter_loading_mode(); - + // In loading mode, the flush thread doesn't publish snapshots, so the + // ArcSwap holds the sole reference. Clone is O(num_fields) — just Arc + // pointer copies, no deep bitmap clones. let snap = self.inner.load_full(); let mut staging = (*snap).clone(); - drop(snap); // Release Arc ref before mutation + drop(snap); // Apply filter bitmaps for (field_name, value_map) in &accum.filter_maps { @@ -6530,9 +6528,8 @@ impl ConcurrentEngine { // Apply alive bitmap (also updates slot counter) staging.slots.alive_or_bitmap(&accum.alive); - // Store and publish + // Store back — in loading mode, no snapshot publish overhead self.inner.store(Arc::new(staging)); - self.exit_loading_mode(); } /// Build all bitmap indexes from the docstore. diff --git a/src/ops_processor.rs b/src/ops_processor.rs index 832639c9..5284fa89 100644 --- a/src/ops_processor.rs +++ b/src/ops_processor.rs @@ -534,8 +534,7 @@ pub fn process_csv_dump_direct( use crate::loader::BitmapAccum; use crate::pg_sync::copy_queries::{parse_image_row, parse_tag_row, parse_tool_row}; use rayon::prelude::*; - use std::io::Read; - use std::thread; + use std::io::BufRead; use std::time::Instant; let config = engine.config(); @@ -549,261 +548,184 @@ pub fn process_csv_dump_direct( let mut total_errors = 0u64; let record_limit = limit.map(|l| l as usize).unwrap_or(usize::MAX); - // Chunked reader: sends ~300MB blocks of complete lines through a channel. - // Same pattern as the single-pass loader. Never loads full file into memory. 
- fn spawn_block_reader( - path: std::path::PathBuf, - target_bytes: usize, - ) -> (std::sync::mpsc::Receiver>, thread::JoinHandle<()>) { - let (tx, rx) = std::sync::mpsc::sync_channel::>(2); - let handle = thread::spawn(move || { - let file = std::fs::File::open(&path).expect("open CSV"); - let mut reader = std::io::BufReader::with_capacity(16 * 1024 * 1024, file); - let mut buf = vec![0u8; 4 * 1024 * 1024]; - let mut accum = Vec::::with_capacity(target_bytes + 4 * 1024 * 1024); - - loop { - let n = reader.read(&mut buf).unwrap_or(0); - if n == 0 { - if !accum.is_empty() { - let _ = tx.send(accum); - } - break; - } - accum.extend_from_slice(&buf[..n]); - - if accum.len() >= target_bytes { - // Split at last newline to keep lines intact - if let Some(last_nl) = accum.iter().rposition(|&b| b == b'\n') { - let remainder = accum[last_nl + 1..].to_vec(); - accum.truncate(last_nl + 1); - let batch = std::mem::replace( - &mut accum, - Vec::with_capacity(target_bytes + 4 * 1024 * 1024), - ); - accum = remainder; - if tx.send(batch).is_err() { - break; - } - } - } + // Enter loading mode ONCE for the entire dump — avoids Arc clone cascade. + engine.enter_loading_mode(); + + // Read CSV lines into memory (respecting limit). At 107M scale, images.csv + // is 14GB — fits in memory on 32GB+ machines. For the 67GB tags.csv, + // we process in chunks via the limit parameter. 
+ let read_lines = |path: &std::path::Path, max: usize| -> Vec> { + let file = std::fs::File::open(path).expect("open CSV"); + let reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, file); + let mut lines = Vec::new(); + for line in reader.split(b'\n') { + if lines.len() >= max { break; } + match line { + Ok(l) if !l.is_empty() => lines.push(l), + _ => {} } - }); - (rx, handle) - } - - let target_block_bytes = 300 * 1024 * 1024; // ~300MB per block + } + lines + }; // Phase 1: Images (creates alive slots) let images_csv = csv_dir.join("images.csv"); if images_csv.exists() { let img_start = Instant::now(); - let (rx, reader_handle) = spawn_block_reader(images_csv, target_block_bytes); + let lines = read_lines(&images_csv, record_limit); + eprintln!(" images: read {} lines in {:.1}s", lines.len(), img_start.elapsed().as_secs_f64()); let f_names = &filter_names; let s_configs = &sort_configs; let meta_ref = &meta; - let mut phase_total = 0usize; - let mut phase_errors = 0u64; - - while let Ok(block) = rx.recv() { - if phase_total >= record_limit { break; } - - let block_str = match std::str::from_utf8(&block) { - Ok(s) => s, - Err(_) => continue, - }; - let mut lines: Vec<&[u8]> = block_str - .split('\n') - .filter(|l| !l.is_empty()) - .map(|l| l.as_bytes()) - .collect(); - let remaining = record_limit.saturating_sub(phase_total); - if lines.len() > remaining { - lines.truncate(remaining); - } - let accum = lines - .into_par_iter() - .fold( - || BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let row = match parse_image_row(line) { - Some(r) => r, - None => { acc.errors += 1; return acc; } - }; - let slot = row.id as u32; - acc.alive.insert(slot); - - let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); - for op in &ops { - if let Op::Set { field, value } = op { - let qval = json_to_qvalue(value); - if let Some((_, _)) = meta_ref.filter_fields.get(field.as_str()) { - if let Some(key) = value_to_bitmap_key(&qval) { - acc.filter_maps - 
.get_mut(field.as_str()) - .map(|m| m.entry(key).or_insert_with(roaring::RoaringBitmap::new).insert(slot)); + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let row = match parse_image_row(&line) { + Some(r) => r, + None => { acc.errors += 1; return acc; } + }; + let slot = row.id as u32; + acc.alive.insert(slot); + + let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); + for op in &ops { + if let Op::Set { field, value } = op { + let qval = json_to_qvalue(value); + if let Some((_, _)) = meta_ref.filter_fields.get(field.as_str()) { + if let Some(key) = value_to_bitmap_key(&qval) { + if let Some(m) = acc.filter_maps.get_mut(field.as_str()) { + m.entry(key).or_insert_with(roaring::RoaringBitmap::new).insert(slot); } } - if let Some((_, num_bits)) = meta_ref.sort_fields.get(field.as_str()) { - if let Some(sort_val) = value_to_sort_u32(&qval) { + } + if let Some((_, num_bits)) = meta_ref.sort_fields.get(field.as_str()) { + if let Some(sort_val) = value_to_sort_u32(&qval) { + if let Some(m) = acc.sort_maps.get_mut(field.as_str()) { for bit in 0..*num_bits { if (sort_val >> bit) & 1 == 1 { - acc.sort_maps - .get_mut(field.as_str()) - .map(|m| m.entry(bit).or_insert_with(roaring::RoaringBitmap::new).insert(slot)); + m.entry(bit).or_insert_with(roaring::RoaringBitmap::new).insert(slot); } } } } } } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - phase_total += accum.count; - phase_errors += accum.errors; - engine.apply_accum(&accum); - } - let _ = reader_handle.join(); - total_applied += phase_total as u64; - total_errors += phase_errors; + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + total_applied += accum.count as u64; + total_errors += accum.errors; + engine.apply_accum(&accum); - eprintln!(" images: {} rows, {:.1}s ({:.0}/s)", - phase_total, + eprintln!(" 
images: {} rows, {:.1}s total ({:.0}/s)", + accum.count, img_start.elapsed().as_secs_f64(), - phase_total as f64 / img_start.elapsed().as_secs_f64().max(0.001)); + accum.count as f64 / img_start.elapsed().as_secs_f64().max(0.001)); } - // Phase 2: Tags (chunked reader + rayon) + // Phase 2: Tags (rayon parallel) let tags_csv = csv_dir.join("tags.csv"); if tags_csv.exists() { let tag_start = Instant::now(); - let (rx, reader_handle) = spawn_block_reader(tags_csv, target_block_bytes); + let lines = read_lines(&tags_csv, record_limit); + eprintln!(" tags: read {} lines in {:.1}s", lines.len(), tag_start.elapsed().as_secs_f64()); let f_names = &filter_names; let s_configs = &sort_configs; - let mut phase_total = 0usize; - let mut phase_errors = 0u64; - - while let Ok(block) = rx.recv() { - if phase_total >= record_limit { break; } - - let mut lines: Vec<&[u8]> = block - .split(|&b| b == b'\n') - .filter(|l| !l.is_empty()) - .collect(); - let remaining = record_limit.saturating_sub(phase_total); - if lines.len() > remaining { - lines.truncate(remaining); - } - let accum = lines - .into_par_iter() - .fold( - || BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let (tag_id, image_id) = match parse_tag_row(line) { - Some(pair) => pair, - None => { acc.errors += 1; return acc; } - }; - let slot = image_id as u32; - if let Some(m) = acc.filter_maps.get_mut("tagIds") { - m.entry(tag_id as u64) - .or_insert_with(roaring::RoaringBitmap::new) - .insert(slot); - } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - phase_total += accum.count; - phase_errors += accum.errors; - engine.apply_accum(&accum); - } - let _ = reader_handle.join(); - total_applied += phase_total as u64; - total_errors += phase_errors; + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let (tag_id, image_id) = match parse_tag_row(&line) { + Some(pair) => pair, + None => { 
acc.errors += 1; return acc; } + }; + let slot = image_id as u32; + if let Some(m) = acc.filter_maps.get_mut("tagIds") { + m.entry(tag_id as u64) + .or_insert_with(roaring::RoaringBitmap::new) + .insert(slot); + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); - eprintln!(" tags: {} rows, {:.1}s ({:.0}/s)", - phase_total, + total_applied += accum.count as u64; + total_errors += accum.errors; + engine.apply_accum(&accum); + + eprintln!(" tags: {} rows, {:.1}s total ({:.0}/s)", + accum.count, tag_start.elapsed().as_secs_f64(), - phase_total as f64 / tag_start.elapsed().as_secs_f64().max(0.001)); + accum.count as f64 / tag_start.elapsed().as_secs_f64().max(0.001)); } - // Phase 3: Tools (chunked reader + rayon) + // Phase 3: Tools (rayon parallel) let tools_csv = csv_dir.join("tools.csv"); if tools_csv.exists() { let tool_start = Instant::now(); - let (rx, reader_handle) = spawn_block_reader(tools_csv, target_block_bytes); + let lines = read_lines(&tools_csv, record_limit); + eprintln!(" tools: read {} lines in {:.1}s", lines.len(), tool_start.elapsed().as_secs_f64()); let f_names = &filter_names; let s_configs = &sort_configs; - let mut phase_total = 0usize; - let mut phase_errors = 0u64; - - while let Ok(block) = rx.recv() { - if phase_total >= record_limit { break; } - - let mut lines: Vec<&[u8]> = block - .split(|&b| b == b'\n') - .filter(|l| !l.is_empty()) - .collect(); - let remaining = record_limit.saturating_sub(phase_total); - if lines.len() > remaining { - lines.truncate(remaining); - } - let accum = lines - .into_par_iter() - .fold( - || BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let (tool_id, image_id) = match parse_tool_row(line) { - Some(pair) => pair, - None => { acc.errors += 1; return acc; } - }; - let slot = image_id as u32; - if let Some(m) = acc.filter_maps.get_mut("toolIds") { - m.entry(tool_id as u64) - .or_insert_with(roaring::RoaringBitmap::new) - .insert(slot); 
- } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - phase_total += accum.count; - phase_errors += accum.errors; - engine.apply_accum(&accum); - } - let _ = reader_handle.join(); - total_applied += phase_total as u64; - total_errors += phase_errors; + let accum = lines + .into_par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let (tool_id, image_id) = match parse_tool_row(&line) { + Some(pair) => pair, + None => { acc.errors += 1; return acc; } + }; + let slot = image_id as u32; + if let Some(m) = acc.filter_maps.get_mut("toolIds") { + m.entry(tool_id as u64) + .or_insert_with(roaring::RoaringBitmap::new) + .insert(slot); + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + total_applied += accum.count as u64; + total_errors += accum.errors; + engine.apply_accum(&accum); - eprintln!(" tools: {} rows, {:.1}s ({:.0}/s)", - phase_total, + eprintln!(" tools: {} rows, {:.1}s total ({:.0}/s)", + accum.count, tool_start.elapsed().as_secs_f64(), - phase_total as f64 / tool_start.elapsed().as_secs_f64().max(0.001)); + accum.count as f64 / tool_start.elapsed().as_secs_f64().max(0.001)); } + // Exit loading mode. On headless, this will timeout (no flush thread) but + // that's OK — the warning is harmless and the flag gets cleared. + engine.exit_loading_mode(); + eprintln!(" Total: {total_applied} ops in {:.1}s ({:.0}/s)", start.elapsed().as_secs_f64(), total_applied as f64 / start.elapsed().as_secs_f64().max(0.001)); From ce9178b283e428948d00088e70f070cb0e5b51de Mon Sep 17 00:00:00 2001 From: Justin Maier Date: Wed, 25 Mar 2026 21:14:20 -0600 Subject: [PATCH 19/19] fix: chunked CSV processing to prevent OOM on 67GB tags.csv Reads 10M lines per chunk instead of entire file. Each chunk is rayon processed and applied to staging, then freed. Caps memory at ~2GB per chunk instead of 67GB for the full tags file. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/ops_processor.rs | 308 +++++++++++++++++++++++++------------------ 1 file changed, 182 insertions(+), 126 deletions(-) diff --git a/src/ops_processor.rs b/src/ops_processor.rs index 5284fa89..b7550eb8 100644 --- a/src/ops_processor.rs +++ b/src/ops_processor.rs @@ -551,175 +551,231 @@ pub fn process_csv_dump_direct( // Enter loading mode ONCE for the entire dump — avoids Arc clone cascade. engine.enter_loading_mode(); - // Read CSV lines into memory (respecting limit). At 107M scale, images.csv - // is 14GB — fits in memory on 32GB+ machines. For the 67GB tags.csv, - // we process in chunks via the limit parameter. - let read_lines = |path: &std::path::Path, max: usize| -> Vec> { - let file = std::fs::File::open(path).expect("open CSV"); - let reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, file); - let mut lines = Vec::new(); - for line in reader.split(b'\n') { - if lines.len() >= max { break; } - match line { - Ok(l) if !l.is_empty() => lines.push(l), - _ => {} + // Chunk size for reading CSV lines. 10M lines per chunk keeps memory bounded + // (~1-2GB per chunk) while giving rayon enough work for parallelism. + const CHUNK_SIZE: usize = 10_000_000; + + /// Helper: read up to `chunk` lines from a BufReader, returns lines read. 
+ fn read_chunk( + reader: &mut impl BufRead, + chunk: usize, + buf: &mut Vec>, + ) -> usize { + buf.clear(); + let mut count = 0; + let mut line_buf = Vec::new(); + while count < chunk { + line_buf.clear(); + match reader.read_until(b'\n', &mut line_buf) { + Ok(0) => break, // EOF + Ok(_) => { + // Trim trailing newline + if line_buf.last() == Some(&b'\n') { line_buf.pop(); } + if line_buf.last() == Some(&b'\r') { line_buf.pop(); } + if !line_buf.is_empty() { + buf.push(std::mem::take(&mut line_buf)); + line_buf = Vec::new(); + count += 1; + } + } + Err(_) => break, } } - lines - }; + count + } - // Phase 1: Images (creates alive slots) + // Phase 1: Images (creates alive slots) — chunked let images_csv = csv_dir.join("images.csv"); if images_csv.exists() { let img_start = Instant::now(); - let lines = read_lines(&images_csv, record_limit); - eprintln!(" images: read {} lines in {:.1}s", lines.len(), img_start.elapsed().as_secs_f64()); + let file = std::fs::File::open(&images_csv).expect("open images.csv"); + let mut reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, file); + let mut phase_total = 0usize; + let mut phase_errors = 0u64; + let mut chunk_buf = Vec::with_capacity(CHUNK_SIZE); let f_names = &filter_names; let s_configs = &sort_configs; let meta_ref = &meta; - let accum = lines - .into_par_iter() - .fold( - || BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let row = match parse_image_row(&line) { - Some(r) => r, - None => { acc.errors += 1; return acc; } - }; - let slot = row.id as u32; - acc.alive.insert(slot); - - let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); - for op in &ops { - if let Op::Set { field, value } = op { - let qval = json_to_qvalue(value); - if let Some((_, _)) = meta_ref.filter_fields.get(field.as_str()) { - if let Some(key) = value_to_bitmap_key(&qval) { - if let Some(m) = acc.filter_maps.get_mut(field.as_str()) { - m.entry(key).or_insert_with(roaring::RoaringBitmap::new).insert(slot); + loop { + 
let remaining = record_limit.saturating_sub(phase_total); + if remaining == 0 { break; } + let n = read_chunk(&mut reader, remaining.min(CHUNK_SIZE), &mut chunk_buf); + if n == 0 { break; } + + let accum = chunk_buf + .par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let row = match parse_image_row(line) { + Some(r) => r, + None => { acc.errors += 1; return acc; } + }; + let slot = row.id as u32; + acc.alive.insert(slot); + + let ops = crate::pg_sync::csv_ops::image_row_to_ops_pub(&row); + for op in &ops { + if let Op::Set { field, value } = op { + let qval = json_to_qvalue(value); + if let Some((_, _)) = meta_ref.filter_fields.get(field.as_str()) { + if let Some(key) = value_to_bitmap_key(&qval) { + if let Some(m) = acc.filter_maps.get_mut(field.as_str()) { + m.entry(key).or_insert_with(roaring::RoaringBitmap::new).insert(slot); + } } } - } - if let Some((_, num_bits)) = meta_ref.sort_fields.get(field.as_str()) { - if let Some(sort_val) = value_to_sort_u32(&qval) { - if let Some(m) = acc.sort_maps.get_mut(field.as_str()) { - for bit in 0..*num_bits { - if (sort_val >> bit) & 1 == 1 { - m.entry(bit).or_insert_with(roaring::RoaringBitmap::new).insert(slot); + if let Some((_, num_bits)) = meta_ref.sort_fields.get(field.as_str()) { + if let Some(sort_val) = value_to_sort_u32(&qval) { + if let Some(m) = acc.sort_maps.get_mut(field.as_str()) { + for bit in 0..*num_bits { + if (sort_val >> bit) & 1 == 1 { + m.entry(bit).or_insert_with(roaring::RoaringBitmap::new).insert(slot); + } } } } } } } - } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - total_applied += accum.count as u64; - total_errors += accum.errors; - engine.apply_accum(&accum); + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + phase_total += accum.count; + phase_errors += accum.errors; + engine.apply_accum(&accum); + + eprintln!(" images: 
chunk {}..{} ({:.0}/s)", + phase_total - accum.count, phase_total, + accum.count as f64 / img_start.elapsed().as_secs_f64().max(0.001)); + } + total_applied += phase_total as u64; + total_errors += phase_errors; - eprintln!(" images: {} rows, {:.1}s total ({:.0}/s)", - accum.count, + eprintln!(" images: {} rows total, {:.1}s ({:.0}/s)", + phase_total, img_start.elapsed().as_secs_f64(), - accum.count as f64 / img_start.elapsed().as_secs_f64().max(0.001)); + phase_total as f64 / img_start.elapsed().as_secs_f64().max(0.001)); } - // Phase 2: Tags (rayon parallel) + // Phase 2: Tags (chunked rayon) let tags_csv = csv_dir.join("tags.csv"); if tags_csv.exists() { let tag_start = Instant::now(); - let lines = read_lines(&tags_csv, record_limit); - eprintln!(" tags: read {} lines in {:.1}s", lines.len(), tag_start.elapsed().as_secs_f64()); + let file = std::fs::File::open(&tags_csv).expect("open tags.csv"); + let mut reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, file); + let mut phase_total = 0usize; + let mut phase_errors = 0u64; + let mut chunk_buf = Vec::with_capacity(CHUNK_SIZE); let f_names = &filter_names; let s_configs = &sort_configs; - let accum = lines - .into_par_iter() - .fold( - || BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let (tag_id, image_id) = match parse_tag_row(&line) { - Some(pair) => pair, - None => { acc.errors += 1; return acc; } - }; - let slot = image_id as u32; - if let Some(m) = acc.filter_maps.get_mut("tagIds") { - m.entry(tag_id as u64) - .or_insert_with(roaring::RoaringBitmap::new) - .insert(slot); - } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - total_applied += accum.count as u64; - total_errors += accum.errors; - engine.apply_accum(&accum); + loop { + let remaining = record_limit.saturating_sub(phase_total); + if remaining == 0 { break; } + let n = read_chunk(&mut reader, remaining.min(CHUNK_SIZE), &mut chunk_buf); + if n == 0 { break; } 
+ + let accum = chunk_buf + .par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let (tag_id, image_id) = match parse_tag_row(line) { + Some(pair) => pair, + None => { acc.errors += 1; return acc; } + }; + let slot = image_id as u32; + if let Some(m) = acc.filter_maps.get_mut("tagIds") { + m.entry(tag_id as u64) + .or_insert_with(roaring::RoaringBitmap::new) + .insert(slot); + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + phase_total += accum.count; + phase_errors += accum.errors; + engine.apply_accum(&accum); + } + total_applied += phase_total as u64; + total_errors += phase_errors; - eprintln!(" tags: {} rows, {:.1}s total ({:.0}/s)", - accum.count, + eprintln!(" tags: {} rows, {:.1}s ({:.0}/s)", + phase_total, tag_start.elapsed().as_secs_f64(), - accum.count as f64 / tag_start.elapsed().as_secs_f64().max(0.001)); + phase_total as f64 / tag_start.elapsed().as_secs_f64().max(0.001)); } - // Phase 3: Tools (rayon parallel) + // Phase 3: Tools (chunked rayon) let tools_csv = csv_dir.join("tools.csv"); if tools_csv.exists() { let tool_start = Instant::now(); - let lines = read_lines(&tools_csv, record_limit); - eprintln!(" tools: read {} lines in {:.1}s", lines.len(), tool_start.elapsed().as_secs_f64()); + let file = std::fs::File::open(&tools_csv).expect("open tools.csv"); + let mut reader = std::io::BufReader::with_capacity(8 * 1024 * 1024, file); + let mut phase_total = 0usize; + let mut phase_errors = 0u64; + let mut chunk_buf = Vec::with_capacity(CHUNK_SIZE); let f_names = &filter_names; let s_configs = &sort_configs; - let accum = lines - .into_par_iter() - .fold( - || BitmapAccum::new(f_names, s_configs), - |mut acc, line| { - let (tool_id, image_id) = match parse_tool_row(&line) { - Some(pair) => pair, - None => { acc.errors += 1; return acc; } - }; - let slot = image_id as u32; - if let Some(m) = acc.filter_maps.get_mut("toolIds") { - m.entry(tool_id as 
u64) - .or_insert_with(roaring::RoaringBitmap::new) - .insert(slot); - } - acc.count += 1; - acc - }, - ) - .reduce( - || BitmapAccum::new(f_names, s_configs), - |a, b| a.merge(b), - ); - - total_applied += accum.count as u64; - total_errors += accum.errors; - engine.apply_accum(&accum); + loop { + let remaining = record_limit.saturating_sub(phase_total); + if remaining == 0 { break; } + let n = read_chunk(&mut reader, remaining.min(CHUNK_SIZE), &mut chunk_buf); + if n == 0 { break; } + + let accum = chunk_buf + .par_iter() + .fold( + || BitmapAccum::new(f_names, s_configs), + |mut acc, line| { + let (tool_id, image_id) = match parse_tool_row(line) { + Some(pair) => pair, + None => { acc.errors += 1; return acc; } + }; + let slot = image_id as u32; + if let Some(m) = acc.filter_maps.get_mut("toolIds") { + m.entry(tool_id as u64) + .or_insert_with(roaring::RoaringBitmap::new) + .insert(slot); + } + acc.count += 1; + acc + }, + ) + .reduce( + || BitmapAccum::new(f_names, s_configs), + |a, b| a.merge(b), + ); + + phase_total += accum.count; + phase_errors += accum.errors; + engine.apply_accum(&accum); + } + total_applied += phase_total as u64; + total_errors += phase_errors; - eprintln!(" tools: {} rows, {:.1}s total ({:.0}/s)", - accum.count, + eprintln!(" tools: {} rows, {:.1}s ({:.0}/s)", + phase_total, tool_start.elapsed().as_secs_f64(), - accum.count as f64 / tool_start.elapsed().as_secs_f64().max(0.001)); + phase_total as f64 / tool_start.elapsed().as_secs_f64().max(0.001)); } // Exit loading mode. On headless, this will timeout (no flush thread) but