diff --git a/.claude/settings.local.json b/.claude/settings.local.json index afaa7ae..612cf9d 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -1,4 +1,4 @@ { - "outputStyle": "Justin", + "outputStyle": "default", "prefersReducedMotion": true } diff --git a/Cargo.toml b/Cargo.toml index 14c77a2..8261c15 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -154,10 +154,6 @@ harness = false name = "bound_store_bench" harness = false -[[bench]] -name = "parse_alloc_bench" -harness = false - [[bin]] name = "bitdex-benchmark" path = "src/bin/benchmark.rs" @@ -182,9 +178,6 @@ name = "bitdex-replay" path = "src/bin/replay.rs" required-features = ["replay"] -[[bin]] -name = "rebuild_bench" -path = "src/bin/rebuild_bench.rs" [[example]] name = "load_from_csv" diff --git a/src/bin/pg_sync.rs b/src/bin/pg_sync.rs index dc010ac..10000dc 100644 --- a/src/bin/pg_sync.rs +++ b/src/bin/pg_sync.rs @@ -366,15 +366,8 @@ async fn run_boot_sequence( if let Some(config) = full_sync_config { run_streaming_pipeline(pool, sync_config, bitdex_client, config, stage_dir).await; } else { - // V1 fallback: download all then process manually - bulk_loader::download_all_tables(pool, stage_dir) - .await - .unwrap_or_else(|e| { - eprintln!("CSV download failed: {e}"); - std::process::exit(1); - }); - eprintln!("No sync config YAML — skipping dump pipeline."); - eprintln!("CSVs staged at: {}. Use /dumps endpoint manually.", stage_dir.display()); + eprintln!("No sync config YAML — cannot run dump pipeline. Provide --sync-config."); + std::process::exit(1); } // Step 10: Seed cursor at pre_dump_cursor diff --git a/src/bin/rebuild_bench.rs b/src/bin/rebuild_bench.rs deleted file mode 100644 index 64550dd..0000000 --- a/src/bin/rebuild_bench.rs +++ /dev/null @@ -1,906 +0,0 @@ -//! Microbenchmarks for docstore → bitmap rebuild pipeline. -//! -//! Measures individual stages of the rebuild pipeline to identify bottlenecks: -//! 1. Raw shard I/O: read + zstd decompress -//! 2. 
Decode: msgpack → StoredDoc -//! 3. Bitmap extraction: StoredDoc → filter/sort bitmaps -//! 4. Full pipeline: read → decode → extract → merge -//! -//! Usage: -//! cargo run --release --bin rebuild_bench -- --data-dir ./data --index civitai [--shards 1000] - -use std::collections::HashMap; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::time::Instant; - -use rayon::prelude::*; -use roaring::RoaringBitmap; - -use bitdex_v2::silos::doc_format::{PackedValue, StoredDoc}; -use bitdex_v2::silos::doc_silo_adapter::DocSiloAdapter; -use bitdex_v2::mutation::{value_to_bitmap_key, value_to_sort_u32}; -use bitdex_v2::query::Value; - -#[global_allocator] -static ALLOC: rpmalloc::RpMalloc = rpmalloc::RpMalloc; - -struct BenchConfig { - data_dir: PathBuf, - index_name: String, - max_shards: Option, - full_build: bool, - add_field: Option, -} - -fn parse_args() -> BenchConfig { - let args: Vec = std::env::args().collect(); - let mut data_dir = PathBuf::from("./data"); - let mut index_name = "civitai".to_string(); - let mut max_shards: Option = None; - let mut full_build = false; - let mut add_field: Option = None; - - let mut i = 1; - while i < args.len() { - match args[i].as_str() { - "--data-dir" => { data_dir = PathBuf::from(&args[i + 1]); i += 2; } - "--index" => { index_name = args[i + 1].clone(); i += 2; } - "--shards" => { max_shards = Some(args[i + 1].parse().unwrap()); i += 2; } - "--full" => { full_build = true; i += 1; } - "--add-field" => { add_field = Some(args[i + 1].clone()); i += 2; } - _ => { i += 1; } - } - } - - BenchConfig { data_dir, index_name, max_shards, full_build, add_field } -} - -/// Count total shards by scanning the shard directory. 
-fn count_shards(docs_path: &Path) -> u32 { - let shards_dir = docs_path.join("shards"); - let mut count = 0u32; - if let Ok(entries) = std::fs::read_dir(&shards_dir) { - for entry in entries.flatten() { - if entry.file_type().map(|t| t.is_dir()).unwrap_or(false) { - if let Ok(sub_entries) = std::fs::read_dir(entry.path()) { - count += sub_entries - .filter(|e| e.as_ref().map(|e| { - e.path().extension().map(|ext| ext == "bin").unwrap_or(false) - }).unwrap_or(false)) - .count() as u32; - } - } - } - } - count -} - -/// Find the maximum shard ID by scanning shard files. -fn find_max_shard(docs_path: &Path) -> u32 { - let shards_dir = docs_path.join("shards"); - let mut max_id = 0u32; - if let Ok(entries) = std::fs::read_dir(&shards_dir) { - for entry in entries.flatten() { - if entry.file_type().map(|t| t.is_dir()).unwrap_or(false) { - if let Ok(sub_entries) = std::fs::read_dir(entry.path()) { - for sub in sub_entries.flatten() { - if let Some(stem) = sub.path().file_stem() { - if let Ok(id) = stem.to_string_lossy().parse::() { - max_id = max_id.max(id); - } - } - } - } - } - } - } - max_id -} - -/// Stage 1: Raw shard I/O — read files + zstd decompress, no decode. 
-fn bench_raw_io(docs_path: &Path, num_shards: u32) -> (f64, u64, u64) { - eprintln!("\n=== Stage 1: Raw shard I/O (read + zstd decompress) ==="); - let bytes_read = AtomicU64::new(0); - let bytes_decompressed = AtomicU64::new(0); - let shards_read = AtomicU64::new(0); - - let t0 = Instant::now(); - - (0..num_shards).into_par_iter().for_each(|shard_id| { - let dir_byte = ((shard_id >> 8) & 0xFF) as u8; - let path = docs_path - .join("shards") - .join(format!("{:02x}", dir_byte)) - .join(format!("{:06}.bin", shard_id)); - - match std::fs::read(&path) { - Ok(data) => { - bytes_read.fetch_add(data.len() as u64, Ordering::Relaxed); - // Decompress to measure decompression throughput - // BitmapSilo format — count bytes as decompressed (no separate compression layer) - bytes_decompressed.fetch_add(data.len() as u64, Ordering::Relaxed); - shards_read.fetch_add(1, Ordering::Relaxed); - } - Err(_) => {} - } - }); - - let elapsed = t0.elapsed().as_secs_f64(); - let total_read = bytes_read.load(Ordering::Relaxed); - let total_decompressed = bytes_decompressed.load(Ordering::Relaxed); - let total_shards = shards_read.load(Ordering::Relaxed); - - eprintln!(" Shards read: {}", total_shards); - eprintln!(" Compressed bytes: {:.2} GB", total_read as f64 / 1e9); - eprintln!(" Decompressed: {:.2} GB", total_decompressed as f64 / 1e9); - eprintln!(" Time: {:.2}s", elapsed); - eprintln!(" Read throughput: {:.0} MB/s (compressed)", total_read as f64 / elapsed / 1e6); - eprintln!(" Decomp throughput: {:.0} MB/s (decompressed)", total_decompressed as f64 / elapsed / 1e6); - - (elapsed, total_read, total_decompressed) -} - -/// Stage 2: Read + decode to StoredDoc. 
-fn bench_decode(docs_path: &Path, num_shards: u32) -> (f64, u64) { - eprintln!("\n=== Stage 2: Read + Decode (→ StoredDoc) ==="); - let docs_decoded = AtomicU64::new(0); - - let reader = DocSiloAdapter::open(docs_path).expect("open docstore"); - - let t0 = Instant::now(); - - (0..num_shards).into_par_iter().for_each(|shard_id| { - match reader.get_shard(shard_id) { - Ok(docs) => { - docs_decoded.fetch_add(docs.len() as u64, Ordering::Relaxed); - } - Err(_) => {} - } - }); - - let elapsed = t0.elapsed().as_secs_f64(); - let total_docs = docs_decoded.load(Ordering::Relaxed); - - eprintln!(" Docs decoded: {}", total_docs); - eprintln!(" Time: {:.2}s", elapsed); - eprintln!(" Throughput: {:.0} docs/s", total_docs as f64 / elapsed); - eprintln!(" Per-doc avg: {:.2} µs", elapsed * 1e6 / total_docs as f64); - - (elapsed, total_docs) -} - -/// Stage 3: Full rebuild pipeline — read + decode + extract filter/sort bitmaps + merge. -fn bench_full_rebuild( - docs_path: &Path, - num_shards: u32, - filter_names: &[&str], - sort_names: &[&str], - sort_bits: &[usize], -) -> (f64, u64) { - eprintln!("\n=== Stage 3: Full Rebuild Pipeline ==="); - eprintln!(" Filter fields: {:?}", filter_names); - eprintln!(" Sort fields: {:?}", sort_names); - - let reader = DocSiloAdapter::open(docs_path).expect("open docstore"); - - type FilterMap = HashMap<(usize, u64), RoaringBitmap>; - struct Accum { - sort_layers: Vec>, - filter_map: FilterMap, - alive: RoaringBitmap, - count: u64, - } - - let make_accum = || Accum { - sort_layers: sort_bits.iter().map(|&b| { - (0..b).map(|_| RoaringBitmap::new()).collect() - }).collect(), - filter_map: FilterMap::new(), - alive: RoaringBitmap::new(), - count: 0, - }; - - let chunk_size = 500u32; - let num_chunks = (num_shards + chunk_size - 1) / chunk_size; - - let t0 = Instant::now(); - - let merged = (0..num_chunks) - .into_par_iter() - .fold(make_accum, |mut acc, chunk_idx| { - let shard_start = chunk_idx * chunk_size; - let shard_end = 
std::cmp::min(shard_start + chunk_size, num_shards); - - for shard_id in shard_start..shard_end { - let docs = match reader.get_shard(shard_id) { - Ok(d) => d, - Err(_) => continue, - }; - - for (slot_id, doc) in &docs { - acc.alive.insert(*slot_id); - - // Filter bitmap extraction - for (fi, &fname) in filter_names.iter().enumerate() { - if let Some(fv) = doc.fields.get(fname) { - match fv { - bitdex_v2::mutation::FieldValue::Single(v) => { - if let Some(key) = value_to_bitmap_key(v) { - acc.filter_map - .entry((fi, key)) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - } - bitdex_v2::mutation::FieldValue::Multi(vals) => { - for v in vals { - if let Some(key) = value_to_bitmap_key(v) { - acc.filter_map - .entry((fi, key)) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - } - } - } - } - } - - // Sort bitmap extraction - for (si, &sname) in sort_names.iter().enumerate() { - if let Some(fv) = doc.fields.get(sname) { - if let bitdex_v2::mutation::FieldValue::Single(ref v) = fv { - if let Some(value) = value_to_sort_u32(v) { - let num_bits = sort_bits[si]; - for bit in 0..num_bits { - if (value >> bit) & 1 == 1 { - acc.sort_layers[si][bit].insert(*slot_id); - } - } - } - } - } - } - - acc.count += 1; - } - } - acc - }) - .reduce(make_accum, |mut a, b| { - for (si, b_layers) in b.sort_layers.into_iter().enumerate() { - for (bit, bm) in b_layers.into_iter().enumerate() { - a.sort_layers[si][bit] |= bm; - } - } - for (key, bm) in b.filter_map { - a.filter_map.entry(key) - .and_modify(|existing| *existing |= &bm) - .or_insert(bm); - } - a.alive |= b.alive; - a.count += b.count; - a - }); - - let elapsed = t0.elapsed().as_secs_f64(); - - let total_filter_bitmaps: usize = merged.filter_map.len(); - let total_sort_layers: usize = merged.sort_layers.iter() - .map(|layers| layers.iter().filter(|bm| !bm.is_empty()).count()) - .sum(); - - eprintln!(" Docs processed: {}", merged.count); - eprintln!(" Alive bitmap: {} bits", merged.alive.len()); - 
eprintln!(" Filter bitmaps: {} distinct (field,value) pairs", total_filter_bitmaps); - eprintln!(" Sort layers: {} non-empty layers", total_sort_layers); - eprintln!(" Time: {:.2}s", elapsed); - eprintln!(" Throughput: {:.0} docs/s", merged.count as f64 / elapsed); - - (elapsed, merged.count) -} - -/// Stage 4: Rebuild a single field — measures per-field rebuild cost. -fn bench_single_field_rebuild( - docs_path: &Path, - num_shards: u32, - field_name: &str, - is_sort: bool, - bits: usize, -) -> (f64, u64) { - eprintln!("\n=== Stage 4: Single Field Rebuild — {} ({}) ===", - field_name, if is_sort { "sort" } else { "filter" }); - - let reader = DocSiloAdapter::open(docs_path).expect("open docstore"); - let docs_processed = AtomicU64::new(0); - - let chunk_size = 500u32; - let num_chunks = (num_shards + chunk_size - 1) / chunk_size; - - let t0 = Instant::now(); - - if is_sort { - // Sort field rebuild - struct SortAccum { - layers: Vec, - count: u64, - } - - let make_accum = || SortAccum { - layers: (0..bits).map(|_| RoaringBitmap::new()).collect(), - count: 0, - }; - - let merged = (0..num_chunks) - .into_par_iter() - .fold(make_accum, |mut acc, chunk_idx| { - let shard_start = chunk_idx * chunk_size; - let shard_end = std::cmp::min(shard_start + chunk_size, num_shards); - - for shard_id in shard_start..shard_end { - let docs = match reader.get_shard(shard_id) { - Ok(d) => d, - Err(_) => continue, - }; - for (slot_id, doc) in &docs { - if let Some(fv) = doc.fields.get(field_name) { - if let bitdex_v2::mutation::FieldValue::Single(ref v) = fv { - if let Some(value) = value_to_sort_u32(v) { - for bit in 0..bits { - if (value >> bit) & 1 == 1 { - acc.layers[bit].insert(*slot_id); - } - } - } - } - } - acc.count += 1; - } - } - acc - }) - .reduce(make_accum, |mut a, b| { - for (bit, bm) in b.layers.into_iter().enumerate() { - a.layers[bit] |= bm; - } - a.count += b.count; - a - }); - - let elapsed = t0.elapsed().as_secs_f64(); - let non_empty = 
merged.layers.iter().filter(|l| !l.is_empty()).count(); - eprintln!(" Docs: {}", merged.count); - eprintln!(" Layers: {}/{} non-empty", non_empty, bits); - eprintln!(" Time: {:.2}s", elapsed); - eprintln!(" Rate: {:.0} docs/s", merged.count as f64 / elapsed); - (elapsed, merged.count) - } else { - // Filter field rebuild - type FMap = HashMap; - struct FilterAccum { - map: FMap, - count: u64, - } - - let make_accum = || FilterAccum { - map: FMap::new(), - count: 0, - }; - - let merged = (0..num_chunks) - .into_par_iter() - .fold(make_accum, |mut acc, chunk_idx| { - let shard_start = chunk_idx * chunk_size; - let shard_end = std::cmp::min(shard_start + chunk_size, num_shards); - - for shard_id in shard_start..shard_end { - let docs = match reader.get_shard(shard_id) { - Ok(d) => d, - Err(_) => continue, - }; - for (slot_id, doc) in &docs { - if let Some(fv) = doc.fields.get(field_name) { - match fv { - bitdex_v2::mutation::FieldValue::Single(v) => { - if let Some(key) = value_to_bitmap_key(v) { - acc.map.entry(key) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - } - bitdex_v2::mutation::FieldValue::Multi(vals) => { - for v in vals { - if let Some(key) = value_to_bitmap_key(v) { - acc.map.entry(key) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - } - } - } - } - acc.count += 1; - } - } - acc - }) - .reduce(make_accum, |mut a, b| { - for (key, bm) in b.map { - a.map.entry(key) - .and_modify(|existing| *existing |= &bm) - .or_insert(bm); - } - a.count += b.count; - a - }); - - let elapsed = t0.elapsed().as_secs_f64(); - eprintln!(" Docs: {}", merged.count); - eprintln!(" Distinct: {} values", merged.map.len()); - eprintln!(" Time: {:.2}s", elapsed); - eprintln!(" Rate: {:.0} docs/s", merged.count as f64 / elapsed); - (elapsed, merged.count) - } -} - -/// Stage 5: Split-phase — pre-read all shards into memory, then benchmark -/// bitmap construction with zero I/O. This isolates CPU cost of bitmap ops. 
-fn bench_bitmap_only( - docs_path: &Path, - num_shards: u32, - filter_names: &[&str], - sort_names: &[&str], - sort_bits: &[usize], -) -> (f64, f64, u64) { - eprintln!("\n=== Stage 5: Split-Phase (pre-read → bitmap-only) ==="); - - let reader = DocSiloAdapter::open(docs_path).expect("open docstore"); - - // Phase A: Read all shards into memory (decoded StoredDocs) - let t_read = Instant::now(); - let all_docs: Vec> = (0..num_shards) - .into_par_iter() - .filter_map(|shard_id| { - reader.get_shard(shard_id).ok().filter(|d| !d.is_empty()) - }) - .collect(); - let read_time = t_read.elapsed().as_secs_f64(); - - let total_docs: u64 = all_docs.iter().map(|s| s.len() as u64).sum(); - eprintln!(" Read phase: {:.2}s ({} docs, {:.0} docs/s)", - read_time, total_docs, total_docs as f64 / read_time); - - // Phase B: Build bitmaps from in-memory docs (no I/O) - type FilterMap = HashMap<(usize, u64), RoaringBitmap>; - struct Accum { - sort_layers: Vec>, - filter_map: FilterMap, - alive: RoaringBitmap, - count: u64, - } - - let make_accum = || Accum { - sort_layers: sort_bits.iter().map(|&b| { - (0..b).map(|_| RoaringBitmap::new()).collect() - }).collect(), - filter_map: FilterMap::new(), - alive: RoaringBitmap::new(), - count: 0, - }; - - let t_bitmap = Instant::now(); - - let merged = all_docs - .par_iter() - .fold(make_accum, |mut acc, shard_docs| { - for (slot_id, doc) in shard_docs { - acc.alive.insert(*slot_id); - - for (fi, &fname) in filter_names.iter().enumerate() { - if let Some(fv) = doc.fields.get(fname) { - match fv { - bitdex_v2::mutation::FieldValue::Single(v) => { - if let Some(key) = value_to_bitmap_key(v) { - acc.filter_map - .entry((fi, key)) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - } - bitdex_v2::mutation::FieldValue::Multi(vals) => { - for v in vals { - if let Some(key) = value_to_bitmap_key(v) { - acc.filter_map - .entry((fi, key)) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - } - } - } - } - } - - for (si, 
&sname) in sort_names.iter().enumerate() { - if let Some(fv) = doc.fields.get(sname) { - if let bitdex_v2::mutation::FieldValue::Single(ref v) = fv { - if let Some(value) = value_to_sort_u32(v) { - let num_bits = sort_bits[si]; - for bit in 0..num_bits { - if (value >> bit) & 1 == 1 { - acc.sort_layers[si][bit].insert(*slot_id); - } - } - } - } - } - } - - acc.count += 1; - } - acc - }) - .reduce(make_accum, |mut a, b| { - for (si, b_layers) in b.sort_layers.into_iter().enumerate() { - for (bit, bm) in b_layers.into_iter().enumerate() { - a.sort_layers[si][bit] |= bm; - } - } - for (key, bm) in b.filter_map { - a.filter_map.entry(key) - .and_modify(|existing| *existing |= &bm) - .or_insert(bm); - } - a.alive |= b.alive; - a.count += b.count; - a - }); - - let bitmap_time = t_bitmap.elapsed().as_secs_f64(); - - eprintln!(" Bitmap phase: {:.2}s ({:.0} docs/s)", - bitmap_time, merged.count as f64 / bitmap_time); - eprintln!(" Filter bitmaps: {} distinct pairs", merged.filter_map.len()); - eprintln!(" Total: {:.2}s", read_time + bitmap_time); - - (read_time, bitmap_time, merged.count) -} - -/// Stage 6: Raw bytes → bitmap extraction WITHOUT full StoredDoc decode. -/// Decodes msgpack pairs directly, only extracting fields we need. 
-fn bench_selective_decode( - docs_path: &Path, - num_shards: u32, - target_fields: &[&str], -) -> (f64, u64) { - eprintln!("\n=== Stage 6: Selective Decode (skip full StoredDoc) ==="); - eprintln!(" Target fields: {:?}", target_fields); - - let reader = DocSiloAdapter::open(docs_path).expect("open docstore"); - let field_to_idx = &reader; - - // We'll read raw shard bytes and decode only needed fields - let docs_processed = AtomicU64::new(0); - - let t0 = Instant::now(); - - // For now, just measure the difference by reading shards and only looking up target fields - // This shows the cost of HashMap::get vs iterating all fields - let chunk_size = 500u32; - let num_chunks = (num_shards + chunk_size - 1) / chunk_size; - - let total: u64 = (0..num_chunks) - .into_par_iter() - .map(|chunk_idx| { - let shard_start = chunk_idx * chunk_size; - let shard_end = std::cmp::min(shard_start + chunk_size, num_shards); - let mut count = 0u64; - - for shard_id in shard_start..shard_end { - let docs = match reader.get_shard(shard_id) { - Ok(d) => d, - Err(_) => continue, - }; - for (slot_id, doc) in &docs { - // Only access target fields — simulates selective decode - for &fname in target_fields { - let _ = doc.fields.get(fname); - } - count += 1; - } - } - count - }) - .sum(); - - let elapsed = t0.elapsed().as_secs_f64(); - eprintln!(" Docs: {}", total); - eprintln!(" Time: {:.2}s", elapsed); - eprintln!(" Rate: {:.0} docs/s", total as f64 / elapsed); - - (elapsed, total) -} - -/// Stage 7: Zero-alloc packed rebuild — decode to Vec<(u16, PackedValue)> directly, -/// use field dictionary u16 indices instead of String HashMap lookups. -/// This is the "what if we skip StoredDoc entirely" benchmark. 
-fn bench_packed_rebuild( - docs_path: &Path, - num_shards: u32, - filter_names: &[&str], - sort_names: &[&str], - sort_bits: &[usize], -) -> (f64, u64) { - eprintln!("\n=== Stage 7: Packed Rebuild (skip StoredDoc) ==="); - - let reader = DocSiloAdapter::open(docs_path).expect("open docstore"); - - // Build u16 index → (role, position) lookup table from field dictionary - // role: 0 = filter, 1 = sort, 2 = both - let field_dict = reader.field_to_idx(); - let mut filter_idx_map: HashMap = HashMap::new(); // dict_idx → filter position - let mut sort_idx_map: HashMap = HashMap::new(); // dict_idx → (sort position, bits) - - for (fi, &fname) in filter_names.iter().enumerate() { - if let Some(&idx) = field_dict.get(fname) { - filter_idx_map.insert(idx, fi); - } - } - for (si, &sname) in sort_names.iter().enumerate() { - if let Some(&idx) = field_dict.get(sname) { - sort_idx_map.insert(idx, (si, sort_bits[si])); - } - } - - eprintln!(" Filter fields mapped: {}/{}", filter_idx_map.len(), filter_names.len()); - eprintln!(" Sort fields mapped: {}/{}", sort_idx_map.len(), sort_names.len()); - - type FilterMap = HashMap<(usize, u64), RoaringBitmap>; - struct Accum { - sort_layers: Vec>, - filter_map: FilterMap, - alive: RoaringBitmap, - count: u64, - } - - let make_accum = || Accum { - sort_layers: sort_bits.iter().map(|&b| { - (0..b).map(|_| RoaringBitmap::new()).collect() - }).collect(), - filter_map: FilterMap::new(), - alive: RoaringBitmap::new(), - count: 0, - }; - - let chunk_size = 500u32; - let num_chunks = (num_shards + chunk_size - 1) / chunk_size; - - let t0 = Instant::now(); - - let merged = (0..num_chunks) - .into_par_iter() - .fold(make_accum, |mut acc, chunk_idx| { - let shard_start = chunk_idx * chunk_size; - let shard_end = std::cmp::min(shard_start + chunk_size, num_shards); - - for shard_id in shard_start..shard_end { - let packed_docs = match reader.get_shard_packed(shard_id) { - Ok(d) => d, - Err(_) => continue, - }; - - for (slot_id, pairs) in 
&packed_docs { - acc.alive.insert(*slot_id); - - for (field_idx, pv) in pairs { - // Filter extraction — direct u16 lookup, no String - if let Some(&fi) = filter_idx_map.get(field_idx) { - match pv { - PackedValue::I(v) => { - acc.filter_map - .entry((fi, *v as u64)) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - PackedValue::B(b) => { - acc.filter_map - .entry((fi, if *b { 1 } else { 0 })) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - PackedValue::Mi(vals) => { - for v in vals { - acc.filter_map - .entry((fi, *v as u64)) - .or_insert_with(RoaringBitmap::new) - .insert(*slot_id); - } - } - _ => {} - } - } - - // Sort extraction — direct u16 lookup - if let Some(&(si, bits)) = sort_idx_map.get(field_idx) { - if let PackedValue::I(v) = pv { - let value = (*v).max(0) as u32; - for bit in 0..bits { - if (value >> bit) & 1 == 1 { - acc.sort_layers[si][bit].insert(*slot_id); - } - } - } - } - } - - acc.count += 1; - } - } - acc - }) - .reduce(make_accum, |mut a, b| { - for (si, b_layers) in b.sort_layers.into_iter().enumerate() { - for (bit, bm) in b_layers.into_iter().enumerate() { - a.sort_layers[si][bit] |= bm; - } - } - for (key, bm) in b.filter_map { - a.filter_map.entry(key) - .and_modify(|existing| *existing |= &bm) - .or_insert(bm); - } - a.alive |= b.alive; - a.count += b.count; - a - }); - - let elapsed = t0.elapsed().as_secs_f64(); - - eprintln!(" Docs processed: {}", merged.count); - eprintln!(" Filter bitmaps: {} distinct pairs", merged.filter_map.len()); - eprintln!(" Time: {:.2}s", elapsed); - eprintln!(" Throughput: {:.0} docs/s", merged.count as f64 / elapsed); - - (elapsed, merged.count) -} - -/// Full-scale build: not yet implemented — DataSilo bulk scan API pending. 
-fn run_full_build(_data_dir: &Path, _index_name: &str) { - eprintln!("ERROR: build_all_from_docstore is not yet implemented (DataSilo bulk scan API pending)."); - std::process::exit(1); -} - -/// --add-field mode: not yet implemented — DataSilo bulk scan API pending. -fn run_add_field(_data_dir: &Path, _index_name: &str, _field_name: &str) { - eprintln!("ERROR: add_fields_from_docstore is not yet implemented (DataSilo bulk scan API pending)."); - std::process::exit(1); -} - -fn main() { - let config = parse_args(); - let index_dir = config.data_dir.join("indexes").join(&config.index_name); - let docs_path = index_dir.join("docs"); - - // --add-field mode: benchmark hot-adding a single field - if let Some(ref field_name) = config.add_field { - run_add_field(&config.data_dir, &config.index_name, field_name); - return; - } - - // --full mode: run the engine-level build_all_from_docstore - if config.full_build { - run_full_build(&config.data_dir, &config.index_name); - return; - } - - eprintln!("Rebuild Benchmark — index: {}", config.index_name); - eprintln!("Docs path: {}", docs_path.display()); - - // Count shards - let t0 = Instant::now(); - let max_shard = find_max_shard(&docs_path); - let total_shards = max_shard + 1; - eprintln!("Found max shard ID {} ({} total) in {:.1}s", - max_shard, total_shards, t0.elapsed().as_secs_f64()); - - let num_shards = config.max_shards.unwrap_or(total_shards).min(total_shards); - eprintln!("Benchmarking {} shards (~{} docs)", - num_shards, num_shards as u64 * 512); - - let num_threads = rayon::current_num_threads(); - eprintln!("Rayon threads: {}", num_threads); - - // Stage 1: Raw I/O - let (io_time, compressed_bytes, decompressed_bytes) = bench_raw_io(&docs_path, num_shards); - - // Stage 2: Read + Decode - let (decode_time, total_docs) = bench_decode(&docs_path, num_shards); - - // Stage 3: Full rebuild (all filter + sort fields) - let filter_names: Vec<&str> = vec![ - "nsfwLevel", "userId", "postId", "postedToId", "type", 
"baseModel", - "availability", "blockedFor", "remixOfId", "hasMeta", "onSite", "poi", "minor", - "tagIds", "modelVersionIds", "modelVersionIdsManual", "toolIds", "techniqueIds", - ]; - let sort_names: Vec<&str> = vec!["reactionCount", "sortAt", "commentCount", "collectedCount", "id"]; - let sort_bits: Vec = vec![32, 32, 32, 32, 32]; - - let (full_time, full_docs) = bench_full_rebuild( - &docs_path, num_shards, - &filter_names, &sort_names, &sort_bits, - ); - - // Stage 4: Single field rebuilds (interesting ones) - eprintln!("\n--- Per-field rebuild times ---"); - - let (nsfw_time, _) = bench_single_field_rebuild(&docs_path, num_shards, "nsfwLevel", false, 0); - let (tags_time, _) = bench_single_field_rebuild(&docs_path, num_shards, "tagIds", false, 0); - let (sort_time, _) = bench_single_field_rebuild(&docs_path, num_shards, "sortAt", true, 32); - let (reaction_time, _) = bench_single_field_rebuild(&docs_path, num_shards, "reactionCount", true, 32); - - // Stage 5: Split-phase (isolate I/O from CPU) - let (split_read, split_bitmap, split_docs) = bench_bitmap_only( - &docs_path, num_shards, - &filter_names, &sort_names, &sort_bits, - ); - - // Stage 6: Selective decode (only target fields) - let (selective_1_time, _) = bench_selective_decode( - &docs_path, num_shards, &["nsfwLevel"], - ); - let (selective_all_time, _) = bench_selective_decode( - &docs_path, num_shards, - &["nsfwLevel", "userId", "tagIds", "reactionCount", "sortAt"], - ); - - // Stage 7: Packed rebuild (skip StoredDoc entirely) - let (packed_time, packed_docs) = bench_packed_rebuild( - &docs_path, num_shards, - &filter_names, &sort_names, &sort_bits, - ); - - // Summary - eprintln!("\n========================================"); - eprintln!(" SUMMARY ({} docs, {} shards, {} threads)", total_docs, num_shards, num_threads); - eprintln!("========================================"); - eprintln!(" Raw I/O: {:.2}s ({:.0} MB/s compressed, {:.0} MB/s decompressed)", - io_time, - compressed_bytes as f64 / 
io_time / 1e6, - decompressed_bytes as f64 / io_time / 1e6); - eprintln!(" Read + Decode: {:.2}s ({:.0} docs/s)", - decode_time, total_docs as f64 / decode_time); - eprintln!(" Full Rebuild: {:.2}s ({:.0} docs/s) [current: StoredDoc path]", - full_time, full_docs as f64 / full_time); - eprintln!(" Packed Rebuild: {:.2}s ({:.0} docs/s) [new: skip StoredDoc]", - packed_time, packed_docs as f64 / packed_time); - if packed_time < full_time { - eprintln!(" >>> Packed is {:.1}x FASTER than current <<<", - full_time / packed_time); - } - eprintln!(" ---"); - eprintln!(" Split-phase read: {:.2}s ({:.0} docs/s)", - split_read, split_docs as f64 / split_read); - eprintln!(" Split-phase bmap: {:.2}s ({:.0} docs/s)", - split_bitmap, split_docs as f64 / split_bitmap); - eprintln!(" ---"); - eprintln!(" nsfwLevel only: {:.2}s", nsfw_time); - eprintln!(" tagIds only: {:.2}s", tags_time); - eprintln!(" sortAt only: {:.2}s", sort_time); - eprintln!(" reactionCount: {:.2}s", reaction_time); - eprintln!(" ---"); - eprintln!(" Selective (1): {:.2}s (decode + 1 field lookup)", selective_1_time); - eprintln!(" Selective (5): {:.2}s (decode + 5 field lookups)", selective_all_time); - eprintln!(" ---"); - eprintln!(" Decode overhead: {:.1}x vs raw I/O", decode_time / io_time); - eprintln!(" Bitmap overhead: {:.1}x vs decode-only", full_time / decode_time); - eprintln!(" I/O vs CPU split: {:.0}% I/O, {:.0}% bitmap", - split_read / (split_read + split_bitmap) * 100.0, - split_bitmap / (split_read + split_bitmap) * 100.0); - eprintln!(" ---"); - eprintln!(" 105M extrapolation:"); - eprintln!(" Current: {:.0}s ({:.1} min)", total_docs as f64 / (full_docs as f64 / full_time) * (105e6 / total_docs as f64), - total_docs as f64 / (full_docs as f64 / full_time) * (105e6 / total_docs as f64) / 60.0); - eprintln!(" Packed: {:.0}s ({:.1} min)", total_docs as f64 / (packed_docs as f64 / packed_time) * (105e6 / total_docs as f64), - total_docs as f64 / (packed_docs as f64 / packed_time) * (105e6 / 
total_docs as f64) / 60.0); -} diff --git a/src/engine/concurrent_engine.rs b/src/engine/concurrent_engine.rs index c6027da..c88b6bf 100644 --- a/src/engine/concurrent_engine.rs +++ b/src/engine/concurrent_engine.rs @@ -554,17 +554,6 @@ impl ConcurrentEngine { } Ok(()) } - /// Persist dirty dictionaries to disk. Call after upserts that may have - /// created new LowCardinalityString values. Only writes dictionaries that - /// have new entries since the last persist, and clears their dirty flags. - /// - /// This ensures dictionary mappings survive crashes even before the next - /// full `save_snapshot()`. Dictionaries are small (typically < 1 KB), so - /// the I/O cost is negligible. - pub fn persist_dirty_dictionaries(&self) -> Result<()> { - // No-op: BitmapSilo saves dictionaries at save_snapshot time. - Ok(()) - } /// Load dictionaries from disk for all LowCardinalityString fields in the schema. pub fn load_dictionaries( schema: &crate::config::DataSchema, @@ -739,20 +728,11 @@ impl ConcurrentEngine { // Read directly from DataSilo (no separate doc cache — DataSilo uses mmap). Ok(self.docstore.lock().get(slot_id)?) } - /// Compact the docstore, reclaiming space from old write transactions. - pub fn compact_docstore(&self) -> Result { - Ok(self.docstore.lock().compact()?) - } /// Configure docstore field defaults from a DataSchema. /// Must be called before `prepare_bulk_writer()` so the BulkWriter inherits the defaults. pub fn set_docstore_defaults(&self, schema: &crate::config::DataSchema) { self.docstore.lock().set_field_defaults(schema); } - /// Get the current schema version from the docstore. - pub fn docstore_schema_version(&self) -> u8 { - self.docstore.lock().schema_version() - } - /// Get a clone of the Arc> for external writers. 
pub fn docstore_arc(&self) -> Arc> { Arc::clone(&self.docstore) @@ -771,19 +751,6 @@ impl ConcurrentEngine { self.docstore.lock().prepare_field_names(field_names) .map_err(|e| crate::error::BitdexError::Storage(format!("prepare_field_names: {e}"))) } - /// Return the set of indexed field names (filter + sort + "id"). - /// Used by the loader to strip doc-only fields from the bitmap accumulator. - pub fn indexed_field_names(&self) -> std::collections::HashSet { - let mut s = std::collections::HashSet::new(); - for f in &self.config.filter_fields { - s.insert(f.name.clone()); - } - for f in &self.config.sort_fields { - s.insert(f.name.clone()); - } - s.insert("id".to_string()); - s - } /// Get the current pending buffer depth. Always 0 (tier 2 removed). pub fn pending_depth(&self) -> usize { 0 diff --git a/src/engine/executor.rs b/src/engine/executor.rs index 3136d64..95cc831 100644 --- a/src/engine/executor.rs +++ b/src/engine/executor.rs @@ -90,30 +90,7 @@ impl<'a> QueryExecutor<'a> { bitmap_silo, } } - /// Attach string maps for MappedString field reverse lookup. - /// Enables querying with `Value::String("SD 1.5")` on MappedString fields. - pub fn with_string_maps(mut self, maps: &'a StringMaps) -> Self { - self.string_maps = Some(maps); - self - } - /// Attach case-sensitive field set for string matching control. - pub fn with_case_sensitive_fields(mut self, fields: &'a CaseSensitiveFields) -> Self { - self.case_sensitive_fields = Some(fields); - self - } - /// Attach live dictionaries for LowCardinalityString field query resolution. - /// Used as fallback when the string_maps snapshot doesn't have a recently-added value. - pub fn with_dictionaries(mut self, dicts: &'a HashMap) -> Self { - self.dictionaries = Some(dicts); - self - } - /// Attach a BitmapSilo for frozen bitmap reads. - /// When filter/sort bitmaps are unloaded, the executor reads frozen data - /// directly from the silo's mmap (zero-copy, near-zero heap). 
- pub fn with_bitmap_silo(mut self, silo: &'a BitmapSilo) -> Self { - self.bitmap_silo = Some(silo); - self - } + /// Attach a time bucket manager for in-executor bucket snapping (C3). /// Range filters on the bucketed field will be snapped to pre-computed bitmaps. pub fn with_time_buckets(mut self, tb: &'a crate::time_buckets::TimeBucketManager, now: u64) -> Self { @@ -265,17 +242,6 @@ impl<'a> QueryExecutor<'a> { total_matched, }) } - /// Check if a single slot matches all the given filter clauses. - /// Used by post-validation to revalidate slots that overlap with in-flight writes. - pub fn slot_matches_filters(&self, slot: u32, clauses: &[FilterClause]) -> Result { - for clause in clauses { - let bitmap = self.evaluate_clause(clause)?; - if !bitmap.contains(slot) { - return Ok(false); - } - } - Ok(true) - } /// Execute from a pre-computed filter bitmap: alive AND + sort + paginate. /// Used when the caller handles cache interaction separately. pub fn execute_from_bitmap( diff --git a/src/engine/filter.rs b/src/engine/filter.rs index b3b8b4c..7a17535 100644 --- a/src/engine/filter.rs +++ b/src/engine/filter.rs @@ -173,10 +173,6 @@ impl FilterField { pub fn cardinality(&self, value: u64) -> u64 { self.bitmaps.get(&value).map_or(0, |vb| vb.base_len()) } - /// Get the number of distinct values tracked. - pub fn distinct_count(&self) -> usize { - self.bitmaps.len() - } /// Compute the union of bitmaps for multiple values (OR). pub fn union(&self, values: &[u64]) -> RoaringBitmap { let mut result = RoaringBitmap::new(); diff --git a/src/engine/sort.rs b/src/engine/sort.rs index 34ab993..02f0eec 100644 --- a/src/engine/sort.rs +++ b/src/engine/sort.rs @@ -376,6 +376,7 @@ impl SortField { /// Iterates every slot in `universe` and reconstructs its value from the /// bit layers. O(universe_size * num_bits) — acceptable when the matching /// fraction is small (e.g. a 300-second window out of 86400 seconds). 
+ #[cfg(test)] pub fn slots_in_range( &self, universe: &RoaringBitmap, @@ -429,25 +430,6 @@ impl SortField { } } - /// Get base bitmap references for all layers (for persistence). - /// Only valid when layers are clean (merged). - pub fn layer_bases(&self) -> Vec<&RoaringBitmap> { - self.bit_layers - .iter() - .map(|vb| { - debug_assert!(!vb.is_dirty(), "persisting dirty sort layer"); - vb.base() - }) - .collect() - } - - /// Get fused bitmap references for all layers (for zero-copy persistence). - /// Returns `Cow::Borrowed` when the layer is clean (zero copy), - /// `Cow::Owned` when the layer has pending diffs. - pub fn layer_bases_fused(&self) -> Vec> { - self.bit_layers.iter().map(|vb| vb.fused_cow()).collect() - } - /// Return the serialized byte size of all bit layer bitmaps. pub fn bitmap_bytes(&self) -> usize { self.bit_layers.iter().map(|bm| bm.bitmap_bytes()).sum() diff --git a/src/mutation.rs b/src/mutation.rs index 198625e..b666cb7 100644 --- a/src/mutation.rs +++ b/src/mutation.rs @@ -95,12 +95,14 @@ pub enum FieldValue { } /// A partial update payload for PATCH operations. /// Contains only the changed fields with old and new values. +#[cfg(test)] #[derive(Debug, Clone)] pub struct PatchPayload { pub fields: HashMap, } /// A single field change in a PATCH operation. /// Both old and new values come from the WAL event -- we never look up stored state. +#[cfg(test)] #[derive(Debug, Clone)] pub struct PatchField { pub old: FieldValue, @@ -333,6 +335,7 @@ pub fn diff_document( /// but ONLY processes fields present in new_doc. Missing fields are skipped /// entirely — they are NOT treated as deletions. This is the key difference /// from diff_document which treats missing fields as "change to None." +#[cfg(test)] pub fn diff_document_partial( slot: u32, old_doc: Option<&StoredDoc>, @@ -453,6 +456,7 @@ pub fn diff_document_partial( ops } /// Pure diff for PATCH: given old/new field values, returns MutationOps. 
+#[cfg(test)] pub fn diff_patch( slot: u32, patch: &PatchPayload, @@ -709,6 +713,7 @@ fn emit_sort_diff_ops( } } /// The core mutation engine. Applies PUT/PATCH/DELETE/DELETE WHERE to bitmaps. +#[cfg(test)] pub struct MutationEngine<'a> { slots: &'a mut SlotAllocator, filters: &'a mut FilterIndex, @@ -716,6 +721,7 @@ pub struct MutationEngine<'a> { config: &'a Config, docstore: &'a mut DocSiloAdapter, } +#[cfg(test)] impl<'a> MutationEngine<'a> { pub fn new( slots: &'a mut SlotAllocator, diff --git a/src/ops_processor.rs b/src/ops_processor.rs index 0a08179..8db7885 100644 --- a/src/ops_processor.rs +++ b/src/ops_processor.rs @@ -12,9 +12,8 @@ //! into BitmapSink calls using the engine Config for field awareness and //! `value_to_bitmap_key()` / `value_to_sort_u32()` for value conversion. use std::collections::HashMap; -use std::path::{Path, PathBuf}; +use std::path::Path; use std::sync::Arc; -use std::time::Duration; use serde_json::Value as JsonValue; use crate::engine::ConcurrentEngine; use crate::config::Config; @@ -180,140 +179,6 @@ fn json_to_field_value(v: &JsonValue) -> Option { _ => None, } } -// --------------------------------------------------------------------------- -// Document → Ops decomposition (for PUT/PATCH → WAL refactor, task 2.7) -// --------------------------------------------------------------------------- -/// Convert a FieldValue to a serde_json::Value for Op serialization. -pub fn field_value_to_json(fv: &crate::mutation::FieldValue) -> JsonValue { - match fv { - crate::mutation::FieldValue::Single(v) => qvalue_to_json(v), - crate::mutation::FieldValue::Multi(vals) => { - JsonValue::Array(vals.iter().map(qvalue_to_json).collect()) - } - } -} -/// Convert a query::Value to a serde_json::Value. 
-fn qvalue_to_json(v: &QValue) -> JsonValue { - match v { - QValue::Integer(i) => JsonValue::Number(serde_json::Number::from(*i)), - QValue::Float(f) => { - serde_json::Number::from_f64(*f) - .map(JsonValue::Number) - .unwrap_or(JsonValue::Null) - } - QValue::Bool(b) => JsonValue::Bool(*b), - QValue::String(s) => JsonValue::String(s.clone()), - } -} -/// Decompose a Document into `Vec` for WAL writing. -/// -/// For fresh inserts (old_doc is None): emits Op::Set for each field. -/// For upserts (old_doc is Some): emits Op::Remove for old values + Op::Set for -/// new values on changed fields. Unchanged fields are skipped. -/// -/// Multi-value fields are decomposed into individual Op::Add/Op::Remove per value. -/// -/// `is_patch`: when true (PATCH semantics), fields absent from new_doc are left -/// untouched — no Op::Remove emitted. When false (PUT semantics), absent fields -/// are treated as deletions and their old bitmap bits are cleared. -pub fn document_to_ops( - new_doc: &crate::mutation::Document, - old_doc: Option<&crate::silos::doc_format::StoredDoc>, - config: &crate::config::Config, - is_patch: bool, -) -> Vec { - let mut ops = Vec::new(); - let empty_fields = std::collections::HashMap::new(); - let old_fields = old_doc.map_or(&empty_fields, |d| &d.fields); - // Process all fields in the new document - for (field_name, new_val) in &new_doc.fields { - let old_val = old_fields.get(field_name); - // Check if this is a multi-value field (tagIds, toolIds, etc.) 
- let is_multi_value = config.filter_fields.iter() - .any(|f| f.name == *field_name && f.field_type == crate::engine::filter::FilterFieldType::MultiValue); - if is_multi_value { - // Multi-value: compute add/remove sets - let old_ints = extract_multi_ints(old_val); - let new_ints = extract_multi_ints(Some(new_val)); - // Remove values that were in old but not in new - for v in &old_ints { - if !new_ints.contains(v) { - ops.push(Op::Remove { - field: field_name.clone(), - value: JsonValue::Number(serde_json::Number::from(*v)), - }); - } - } - // Add values that are in new but not in old - for v in &new_ints { - if !old_ints.contains(v) { - ops.push(Op::Add { - field: field_name.clone(), - value: JsonValue::Number(serde_json::Number::from(*v)), - }); - } - } - } else { - // Single-value field: remove old + set new if changed - if let Some(old) = old_val { - if old != new_val { - ops.push(Op::Remove { - field: field_name.clone(), - value: field_value_to_json(old), - }); - ops.push(Op::Set { - field: field_name.clone(), - value: field_value_to_json(new_val), - }); - } - // else: unchanged, skip - } else { - // New field (not in old doc) - ops.push(Op::Set { - field: field_name.clone(), - value: field_value_to_json(new_val), - }); - } - } - } - // For PUT upsert: handle fields that were in old doc but removed in new doc. - // PATCH skips this — absent fields are left untouched (partial update semantics). 
- if old_doc.is_some() && !is_patch { - for (field_name, old_val) in old_fields { - if !new_doc.fields.contains_key(field_name) { - // Field was removed - let is_multi_value = config.filter_fields.iter() - .any(|f| f.name == *field_name && f.field_type == crate::engine::filter::FilterFieldType::MultiValue); - if is_multi_value { - for v in extract_multi_ints(Some(old_val)) { - ops.push(Op::Remove { - field: field_name.clone(), - value: JsonValue::Number(serde_json::Number::from(v)), - }); - } - } else { - ops.push(Op::Remove { - field: field_name.clone(), - value: field_value_to_json(old_val), - }); - } - } - } - } - ops -} -/// Extract integer values from a multi-value FieldValue. -fn extract_multi_ints(fv: Option<&crate::mutation::FieldValue>) -> Vec { - match fv { - Some(crate::mutation::FieldValue::Multi(vals)) => { - vals.iter().filter_map(|v| { - if let QValue::Integer(i) = v { Some(*i) } else { None } - }).collect() - } - Some(crate::mutation::FieldValue::Single(QValue::Integer(i))) => vec![*i], - _ => Vec::new(), - } -} /// Convert a JSON value to a PackedValue for docstore storage. fn json_to_packed(v: &JsonValue) -> Option { match v { @@ -362,24 +227,6 @@ fn json_to_qvalue(v: &JsonValue) -> QValue { _ => QValue::String(v.to_string()), } } -/// Configuration for the ops processor. -pub struct OpsProcessorConfig { - /// Max records to read per WAL batch - pub batch_size: usize, - /// How long to sleep when no new records are available - pub poll_interval: Duration, - /// Path to persist the cursor position - pub cursor_path: PathBuf, -} -impl Default for OpsProcessorConfig { - fn default() -> Self { - Self { - batch_size: 10_000, - poll_interval: Duration::from_millis(50), - cursor_path: PathBuf::from("wal_cursor"), - } - } -} /// Info about a computed sort field: which source fields feed it and the operation. 
#[derive(Clone)] struct ComputedSortInfo { @@ -1882,87 +1729,6 @@ mod tests { assert_eq!(json_to_packed(&json!(null)), None); assert_eq!(json_to_packed(&json!([1, 2, 3])), Some(PackedValue::Mi(vec![1, 2, 3]))); } - // ----------------------------------------------------------------------- - // document_to_ops tests (2.7) - // ----------------------------------------------------------------------- - #[test] - fn test_document_to_ops_fresh_insert() { - use crate::mutation::{Document, FieldValue}; - use crate::query::Value as QValue; - let config = test_config(); - let mut fields = std::collections::HashMap::new(); - fields.insert("nsfwLevel".into(), FieldValue::Single(QValue::Integer(16))); - let doc = Document { fields }; - let ops = document_to_ops(&doc, None, &config, false); - // Should have a Set op for nsfwLevel - assert_eq!(ops.len(), 1); - match &ops[0] { - Op::Set { field, value } => { - assert_eq!(field, "nsfwLevel"); - assert_eq!(value, &json!(16)); - } - other => panic!("expected Set, got {:?}", other), - } - } - #[test] - fn test_document_to_ops_upsert_changed_field() { - use crate::mutation::{Document, FieldValue}; - use crate::query::Value as QValue; - let config = test_config(); - // Old doc: nsfwLevel=8 - let mut old_fields = std::collections::HashMap::new(); - old_fields.insert("nsfwLevel".into(), FieldValue::Single(QValue::Integer(8))); - let old_doc = crate::silos::doc_format::StoredDoc { fields: old_fields, schema_version: 0 }; - - // New doc: nsfwLevel=16 - let mut new_fields = std::collections::HashMap::new(); - new_fields.insert("nsfwLevel".into(), FieldValue::Single(QValue::Integer(16))); - let new_doc = Document { fields: new_fields }; - let ops = document_to_ops(&new_doc, Some(&old_doc), &config, false); - // Should have Remove(old=8) + Set(new=16) - assert_eq!(ops.len(), 2); - assert!(ops.iter().any(|op| matches!(op, Op::Remove { field, value } if field == "nsfwLevel" && value == &json!(8)))); - assert!(ops.iter().any(|op| matches!(op, 
Op::Set { field, value } if field == "nsfwLevel" && value == &json!(16)))); - } - #[test] - fn test_document_to_ops_unchanged_field_skipped() { - use crate::mutation::{Document, FieldValue}; - use crate::query::Value as QValue; - let config = test_config(); - let mut fields = std::collections::HashMap::new(); - fields.insert("nsfwLevel".into(), FieldValue::Single(QValue::Integer(8))); - - let old_doc = crate::silos::doc_format::StoredDoc { fields: fields.clone(), schema_version: 0 }; - let new_doc = Document { fields }; - let ops = document_to_ops(&new_doc, Some(&old_doc), &config, false); - assert!(ops.is_empty(), "unchanged fields should produce no ops"); - } - #[test] - fn test_document_to_ops_patch_preserves_absent_fields() { - use crate::mutation::{Document, FieldValue}; - use crate::query::Value as QValue; - let config = test_config(); - // Old doc has nsfwLevel=8 AND reactionCount sort field - let mut old_fields = std::collections::HashMap::new(); - old_fields.insert("nsfwLevel".into(), FieldValue::Single(QValue::Integer(8))); - let old_doc = crate::silos::doc_format::StoredDoc { fields: old_fields, schema_version: 0 }; - - // PATCH only sends userId=42 (nsfwLevel absent from patch) - let mut new_fields = std::collections::HashMap::new(); - new_fields.insert("userId".into(), FieldValue::Single(QValue::Integer(42))); - let new_doc = Document { fields: new_fields }; - // is_patch=true: absent fields should NOT generate Remove ops - let ops = document_to_ops(&new_doc, Some(&old_doc), &config, true); - let has_remove_nsfw = ops.iter().any(|op| matches!(op, Op::Remove { field, .. } if field == "nsfwLevel")); - assert!(!has_remove_nsfw, "PATCH should NOT remove absent fields (nsfwLevel)"); - // Should have Set for userId (new field) - let has_set_user = ops.iter().any(|op| matches!(op, Op::Set { field, .. 
} if field == "userId")); - assert!(has_set_user, "PATCH should set provided fields (userId)"); - // is_patch=false (PUT): absent fields SHOULD generate Remove ops - let ops_put = document_to_ops(&new_doc, Some(&old_doc), &config, false); - let has_remove_nsfw_put = ops_put.iter().any(|op| matches!(op, Op::Remove { field, .. } if field == "nsfwLevel")); - assert!(has_remove_nsfw_put, "PUT should remove absent fields (nsfwLevel)"); - } fn test_config_with_nullable() -> Config { let mut config = test_config(); config.filter_fields.push(FilterFieldConfig { diff --git a/src/server.rs b/src/server.rs index 59ab90e..2b6fdf0 100644 --- a/src/server.rs +++ b/src/server.rs @@ -24,7 +24,6 @@ use crate::engine::ConcurrentEngine; use crate::config::{Config, DataSchema, FieldValueType, FilterFieldConfig, SortFieldConfig}; use crate::silos::doc_format::StoredDoc; use crate::engine::executor::{CaseSensitiveFields, StringMaps}; -use crate::sync::loader; use crate::metrics::Metrics; use crate::mutation::FieldValue; use crate::query::{BitdexQuery, Value}; @@ -546,43 +545,6 @@ struct CreateIndexRequest { data_schema: DataSchema, } -#[derive(Deserialize)] -struct LoadRequest { - path: String, - #[serde(default)] - limit: Option, - #[serde(default = "default_threads")] - threads: usize, - #[serde(default = "default_chunk_size")] - chunk_size: usize, - #[serde(default = "default_docstore_batch_size")] - docstore_batch_size: usize, - #[serde(default = "default_max_writer_threads")] - max_writer_threads: usize, - #[serde(default)] - save_snapshot: bool, -} - -fn default_threads() -> usize { - // Unused by fused parse+bitmap loader (rayon manages parallelism), - // kept for API compat. 
- let logical = std::thread::available_parallelism() - .map(|n| n.get()) - .unwrap_or(8); - (logical / 2).clamp(4, 8) -} - -fn default_chunk_size() -> usize { - 500_000 -} - -fn default_docstore_batch_size() -> usize { - 100_000 -} - -fn default_max_writer_threads() -> usize { - 4 -} #[derive(Deserialize)] struct DocumentRequest { @@ -878,24 +840,6 @@ struct AddFieldsRequest { skip_validation: bool, } -/// Sync filter values for a filter_only multi-value field. -/// Replaces all bitmap memberships for the given slots on the named field. -#[derive(Deserialize)] -struct FilterSyncRequest { - /// The filter field name (must be a multi_value field). - field: String, - /// List of (slot, values) pairs to sync. - documents: Vec, -} - -#[derive(Deserialize)] -struct FilterSyncEntry { - /// The document/slot ID. - id: u32, - /// The complete set of values this slot should have for the field. - values: Vec, -} - #[derive(Deserialize)] struct RemoveFieldsRequest { #[serde(default)] @@ -1343,12 +1287,9 @@ impl BitdexServer { .route("/api/indexes", post(handle_create_index)) .route("/api/indexes/{name}", delete(handle_delete_index)) .route("/api/indexes/{name}/config", patch(handle_patch_config)) - .route("/api/indexes/{name}/load", post(handle_load)) .route("/api/indexes/{name}/documents", post(handle_documents_batch).delete(handle_delete_docs)) .route("/api/indexes/{name}/documents/{slot_id}", get(handle_get_document)) .route("/api/indexes/{name}/documents/upsert", post(handle_upsert)) - .route("/api/indexes/{name}/documents/patch", patch(handle_patch_documents)) - .route("/api/indexes/{name}/documents/filter-sync", post(handle_filter_sync)) .route("/api/indexes/{name}/cache", delete(handle_clear_cache)) .route("/api/indexes/{name}/cache/persistent", delete(handle_purge_cache)) .route("/api/indexes/{name}/warm", post(handle_warm_cache)) @@ -2278,104 +2219,6 @@ async fn handle_delete_index( Json(serde_json::json!({"status": "deleted"})).into_response() } -// 
--------------------------------------------------------------------------- -// Handlers: Data loading -// --------------------------------------------------------------------------- - -async fn handle_load( - State(state): State, - AxumPath(name): AxumPath, - Json(req): Json, -) -> impl IntoResponse { - let (engine, schema, tasks) = { - let guard = state.index.lock(); - match guard.as_ref() { - Some(idx) if idx.definition.name == name => ( - Arc::clone(&idx.engine), - idx.definition.data_schema.clone(), - Arc::clone(&idx.tasks), - ), - _ => { - return ( - StatusCode::NOT_FOUND, - Json(serde_json::json!({"error": format!("Index '{}' not found", name)})), - ).into_response(); - } - } - }; - - let path = PathBuf::from(&req.path); - if !path.exists() { - return ( - StatusCode::BAD_REQUEST, - Json(serde_json::json!({"error": format!("File not found: {}", req.path)})), - ).into_response(); - } - - let (task_id, progress) = match tasks.try_start(TaskType::Load) { - Ok(v) => v, - Err(active_info) => { - return ( - StatusCode::CONFLICT, - Json(serde_json::json!({ - "error": "A task is already running", - "active_task": serde_json::to_value(&active_info).unwrap(), - })), - ).into_response(); - } - }; - - let limit = req.limit; - let threads = req.threads; - let chunk_size = req.chunk_size; - let docstore_batch_size = req.docstore_batch_size; - let max_writer_threads = req.max_writer_threads; - let save_snapshot = req.save_snapshot; - - // Spawn blocking loading task with TaskGuard for panic safety - let tasks_clone = Arc::clone(&tasks); - tokio::task::spawn_blocking(move || { - let mut guard = TaskGuard { tasks: tasks_clone, task_id: Some(task_id) }; - - match loader::load_ndjson(&engine, &schema, &path, limit, threads, chunk_size, docstore_batch_size, max_writer_threads, progress.clone()) { - Ok(stats) => { - let alive; - - if save_snapshot { - guard.tasks.set_saving(task_id); - - let snap_start = Instant::now(); - if let Err(e) = engine.save_and_unload() { - 
eprintln!("Warning: failed to save_and_unload: {e}"); - } else { - eprintln!("save_and_unload complete in {:.1}s", snap_start.elapsed().as_secs_f64()); - } - // Alive bitmap is always preserved during unload - alive = engine.alive_count(); - } else { - alive = engine.alive_count(); - } - - eprintln!("Load complete: {} records alive", alive); - - guard.tasks.set_complete(task_id, Some(serde_json::json!({ - "records_loaded": stats.records_loaded, - "elapsed_secs": stats.elapsed.as_secs_f64(), - }))); - guard.defuse(); - } - Err(e) => { - guard.tasks.set_error(task_id, e.to_string()); - guard.defuse(); - } - } - }); - - ( - StatusCode::ACCEPTED, - Json(serde_json::json!({"task_id": task_id})), - ).into_response() -} // --------------------------------------------------------------------------- // Handlers: Query & documents @@ -2785,38 +2628,6 @@ async fn handle_upsert( ).into_response() } -/// PATCH /api/indexes/{name}/documents/patch -/// -/// Not implemented — use upsert (PUT) for all document writes. -async fn handle_patch_documents( - State(_state): State, - AxumPath(name): AxumPath, - Json(_req): Json, -) -> impl IntoResponse { - ( - StatusCode::NOT_IMPLEMENTED, - Json(serde_json::json!({ - "error": format!("PATCH is not implemented for index '{}'; use PUT upsert instead", name) - })), - ) -} - -/// Sync filter values — not implemented. -/// -/// This endpoint is no longer supported. Use upsert (PUT) for all document writes. 
-async fn handle_filter_sync( - State(_state): State, - AxumPath(name): AxumPath, - Json(_req): Json, -) -> impl IntoResponse { - ( - StatusCode::NOT_IMPLEMENTED, - Json(serde_json::json!({ - "error": format!("filter_sync is not implemented for index '{}'; use PUT upsert instead", name) - })), - ) -} - async fn handle_delete_docs( State(state): State, AxumPath(name): AxumPath, diff --git a/src/sync/bulk_loader.rs b/src/sync/bulk_loader.rs index e9eac91..5224fc0 100644 --- a/src/sync/bulk_loader.rs +++ b/src/sync/bulk_loader.rs @@ -1,218 +1,25 @@ //! Bulk loader utilities: PG CSV download + ClickHouse metrics download. //! -//! The V1 in-process bulk load pipeline (run_bulk_load / run_bulk_load_copy) has been -//! removed. Use the single-pass V2 loader via the pg-sync binary instead. +//! The V1 in-process bulk load pipeline has been removed. +//! Use the config-driven dump processor via the pg-sync binary instead. //! //! Remaining functionality: -//! - `download_all_tables` / `download_single_table`: Stream PG tables to local CSVs +//! - `download_phase_csvs`: Stream phase CSVs from PG to local files +//! - `download_from_sync_config`: Download all phases from sync config //! - `download_metrics_from_clickhouse`: Fetch aggregate metrics from ClickHouse -//! - `finalize_from_bitmaps` / `scalars_to_json`: Docstore finalization helpers (used by tests) +//! - `clear_done_markers`: Clear stale .done markers at boot -use std::collections::HashMap; use std::time::Instant; -use roaring::RoaringBitmap; use sqlx::PgPool; -use super::loader::BitmapAccum; - -use super::copy_queries; - // --------------------------------------------------------------------------- -// Compact per-image scalar storage (replaces 512-byte arena slots) +// PG CSV download // --------------------------------------------------------------------------- -/// Compact per-image scalar data stored during CSV processing. 
-/// -/// Only stores fields needed for docstore finalization that cannot be -/// reconstructed from filter/sort bitmaps. Multi-value fields (tagIds, -/// toolIds, etc.) are reconstructed from their filter bitmaps. -/// -/// At ~80 bytes avg per image (including heap strings), 107M images ≈ 8.5 GB. -/// This replaces the 60GB memory-mapped SlotArena. -#[derive(Debug)] -struct ImageScalars { - url: Option>, // Box instead of String saves 8 bytes/entry (no capacity field) - hash: Option>, - nsfw_level: u8, - user_id: u64, - image_type: u8, // encoded via encode_image_type - sort_at: u64, // epoch seconds - poi: bool, // image-level poi (OR'd with resource_poi at finalization) - minor: bool, - has_meta: bool, - on_site: bool, - post_id: u64, - posted_to_id: u64, - availability: u8, // encoded via encode_availability - blocked_for: u8, // encoded via encode_blocked_for - published_at_ms: u64, // milliseconds -} - -/// Per-slot resource enrichment data, written by the resources stream. -/// Stored separately because it arrives from a different CSV file. -#[derive(Debug, Default)] -struct ResourceEnrichment { - base_model: u8, // encoded via encode_base_model - resource_poi: bool, -} - -/// Statistics from a completed bulk load. -#[derive(Debug)] -pub struct BulkLoadStats { - pub records_loaded: u64, - pub errors: u64, - pub elapsed: std::time::Duration, -} - -// --------------------------------------------------------------------------- -// Phase 1: Download tables to local CSV files -// --------------------------------------------------------------------------- - -/// Table descriptor for the download phase. 
-struct TableDownload { - name: &'static str, - file: &'static str, -} - -const TABLES: &[TableDownload] = &[ - TableDownload { name: "images", file: "images.csv" }, - TableDownload { name: "posts", file: "posts.csv" }, - TableDownload { name: "tags", file: "tags.csv" }, - TableDownload { name: "tools", file: "tools.csv" }, - TableDownload { name: "techniques", file: "techniques.csv" }, - TableDownload { name: "resources", file: "resources.csv" }, - TableDownload { name: "model_versions", file: "model_versions.csv" }, - TableDownload { name: "models", file: "models.csv" }, - TableDownload { name: "collection_items", file: "collection_items.csv" }, -]; - -/// Download a single named table from PG to a CSV file. -/// Public wrapper for use by backfill module. -pub async fn download_single_table( - pool: &PgPool, - stage_dir: &std::path::Path, - name: &'static str, - file: &'static str, -) -> Result { - let table = TableDownload { name, file }; - download_table(pool, stage_dir, &table).await -} - -/// Download a single table from PG to a CSV file on the PVC. -/// Returns the number of bytes written. -/// Skips if the .done marker already exists. 
-async fn download_table( - pool: &PgPool, - stage_dir: &std::path::Path, - table: &TableDownload, -) -> Result { - use futures_util::TryStreamExt; - use tokio::io::AsyncWriteExt; - - let csv_path = stage_dir.join(table.file); - let done_path = stage_dir.join(format!("{}.done", table.file)); - - // Skip if already downloaded - if done_path.exists() { - let size = std::fs::metadata(&csv_path).map(|m| m.len()).unwrap_or(0); - eprintln!(" {}: already downloaded ({:.1} MB), skipping", table.name, size as f64 / 1048576.0); - return Ok(size); - } - - // Get the COPY stream for this table - let mut stream = match table.name { - "images" => copy_queries::copy_images(pool).await, - "posts" => copy_queries::copy_posts(pool).await, - "tags" => copy_queries::copy_tags(pool).await, - "tools" => copy_queries::copy_tools(pool).await, - "techniques" => copy_queries::copy_techniques(pool).await, - "resources" => copy_queries::copy_resources(pool).await, - "model_versions" => copy_queries::copy_model_versions(pool).await, - "models" => copy_queries::copy_models(pool).await, - "collection_items" => copy_queries::copy_collection_items(pool).await, - _ => return Err(format!("unknown table: {}", table.name)), - }.map_err(|e| format!("{}: COPY start failed: {e}", table.name))?; - - // Stream to file - let file = tokio::fs::File::create(&csv_path) - .await - .map_err(|e| format!("{}: create file: {e}", table.name))?; - let mut writer = tokio::io::BufWriter::with_capacity(1024 * 1024, file); - let mut bytes_written = 0u64; - let start = Instant::now(); - - while let Some(chunk) = stream - .try_next() - .await - .map_err(|e| format!("{}: COPY stream: {e}", table.name))? 
- { - writer - .write_all(&chunk) - .await - .map_err(|e| format!("{}: write: {e}", table.name))?; - bytes_written += chunk.len() as u64; - } - writer.flush().await.map_err(|e| format!("{}: flush: {e}", table.name))?; - - // Write .done marker - std::fs::write(&done_path, b"ok") - .map_err(|e| format!("{}: write done marker: {e}", table.name))?; - - let elapsed = start.elapsed(); - eprintln!( - " {}: {:.1} MB in {:.1}s ({:.0} MB/s)", - table.name, - bytes_written as f64 / 1048576.0, - elapsed.as_secs_f64(), - bytes_written as f64 / 1048576.0 / elapsed.as_secs_f64().max(0.001), - ); - - Ok(bytes_written) -} - -/// Download all tables from PG to CSV files on the PVC. -/// Each table runs concurrently. Completed tables are skipped on retry. -pub async fn download_all_tables( - pool: &PgPool, - stage_dir: &std::path::Path, -) -> Result<(), String> { - std::fs::create_dir_all(stage_dir) - .map_err(|e| format!("create stage dir: {e}"))?; - - eprintln!("\n=== Phase 1: Downloading tables to {} ===", stage_dir.display()); - let start = Instant::now(); - - // Download all tables concurrently - let results = tokio::join!( - download_table(pool, stage_dir, &TABLES[0]), // images - download_table(pool, stage_dir, &TABLES[1]), // posts - download_table(pool, stage_dir, &TABLES[2]), // tags - download_table(pool, stage_dir, &TABLES[3]), // tools - download_table(pool, stage_dir, &TABLES[4]), // techniques - download_table(pool, stage_dir, &TABLES[5]), // resources - download_table(pool, stage_dir, &TABLES[6]), // model_versions - download_table(pool, stage_dir, &TABLES[7]), // models - ); - - // Check all results - let mut total_bytes = 0u64; - for (i, result) in [results.0, results.1, results.2, results.3, results.4, results.5, results.6, results.7].into_iter().enumerate() { - total_bytes += result.map_err(|e| format!("download {} failed: {e}", TABLES[i].name))?; - } - - eprintln!( - "Phase 1 complete: {:.1} GB in {:.1}s", - total_bytes as f64 / (1024.0 * 1024.0 * 1024.0), - 
start.elapsed().as_secs_f64(), - ); - - Ok(()) -} - /// Download CSVs using copy_query from sync config dump phases. /// -/// Config-driven replacement for download_all_tables — uses the exact COPY SQL +/// Config-driven replacement for the old download_all_tables — uses the exact COPY SQL /// from each DumpPhase (and its enrichment lookups) instead of hardcoded queries. /// This ensures the CSVs match what the dump processor expects. pub async fn download_from_sync_config( @@ -404,331 +211,6 @@ async fn download_copy_query( Ok(bytes_written) } -// --------------------------------------------------------------------------- -// Arena-free docstore finalization (used by V1 bulk loader, kept for tests) -// --------------------------------------------------------------------------- - -/// Block size for chunked bitmap reconstruction. -/// Aligned with roaring bitmap container boundaries (65,536 = 2^16). -const FINALIZE_CHUNK_SIZE: u32 = 65_536; - -/// Finalize alive slots to the docstore by reconstructing multi-value fields -/// from filter bitmaps and combining with stored scalars. -/// -/// Processes alive slots in 65K-block chunks aligned to roaring container -/// boundaries for efficient `bitmap.range()` iteration. -fn finalize_from_bitmaps( - _schema: &crate::config::DataSchema, - _alive: &RoaringBitmap, - _image_scalars: &HashMap, - _resource_enrichments: &HashMap, - _tag_bitmaps: &HashMap, - _tool_bitmaps: &HashMap, - _technique_bitmaps: &HashMap, - _mv_bitmaps: &HashMap, -) -> Result<(u64, u64), String> { - // TODO: Rewrite for DataSilo when V1 bulk loader is needed - Err("finalize_from_bitmaps: not yet ported to DataSilo".to_string()) -} - -// V2 dump pipeline (dump_processor.rs) handles doc finalization via DataSilo - -/// Convert compact ImageScalars + reconstructed multi-value fields to a -/// JSON document matching the Bitdex data schema. -/// -/// Produces the same output as `slot_data_to_json` in slot_arena.rs. 
-fn scalars_to_json( - slot: u32, - s: &ImageScalars, - enrichment: Option<&ResourceEnrichment>, - tag_ids: &[u32], - tool_ids: &[u32], - technique_ids: &[u32], - model_version_ids: &[u32], -) -> serde_json::Value { - use super::slot_arena::{decode_image_type, decode_availability, decode_base_model}; - - let base_model_enum = enrichment.map(|e| e.base_model).unwrap_or(0); - let resource_poi = enrichment.map(|e| e.resource_poi).unwrap_or(false); - let poi = s.poi || resource_poi; - - let mut doc = serde_json::json!({ - "id": slot as i64, - "nsfwLevel": s.nsfw_level as i64, - "userId": s.user_id as i64, - "postId": s.post_id as i64, - "postedToId": s.posted_to_id as i64, - "type": decode_image_type(s.image_type), - "baseModel": decode_base_model(base_model_enum), - "availability": decode_availability(s.availability), - "tagIds": tag_ids.iter().map(|&t| t as i64).collect::>(), - "modelVersionIds": model_version_ids.iter().map(|&t| t as i64).collect::>(), - "modelVersionIdsManual": serde_json::json!([]), - "toolIds": tool_ids.iter().map(|&t| t as i64).collect::>(), - "techniqueIds": technique_ids.iter().map(|&t| t as i64).collect::>(), - "reactionCount": 0i64, - "commentCount": 0i64, - "collectedCount": 0i64, - "sortAt": s.sort_at as i64, - "publishedAt": (s.published_at_ms / 1000) as i64, - }); - - if let Some(obj) = doc.as_object_mut() { - // Exists-boolean: isPublished = publishedAt is non-zero (matches outbox row_assembler) - if s.published_at_ms > 0 { - obj.insert("isPublished".into(), serde_json::json!(true)); - } - if s.has_meta { - obj.insert("hasMeta".into(), serde_json::json!(true)); - } - if s.on_site { - obj.insert("onSite".into(), serde_json::json!(true)); - } - if poi { - obj.insert("poi".into(), serde_json::json!(true)); - } - if s.minor { - obj.insert("minor".into(), serde_json::json!(true)); - } - if let Some(ref url) = s.url { - obj.insert("url".into(), serde_json::json!(url.as_ref())); - } - if let Some(ref hash) = s.hash { - 
obj.insert("hash".into(), serde_json::json!(hash.as_ref())); - } - if s.blocked_for > 0 { - obj.insert("blockedFor".into(), serde_json::json!("blocked")); - } - } - - doc -} - -/// AND all filter and sort bitmaps in an accumulator against the alive bitmap. -/// -/// Returns the number of bitmaps that were modified (had orphan bits stripped). -/// This enforces the clean bitmap invariant: filter bitmaps must be subsets of alive. -fn cleanup_orphan_bitmaps(accum: &mut BitmapAccum, alive: &RoaringBitmap) -> usize { - let mut cleaned = 0; - for value_map in accum.filter_maps.values_mut() { - for bitmap in value_map.values_mut() { - let before = bitmap.len(); - *bitmap &= alive; - if bitmap.len() < before { - cleaned += 1; - } - } - } - for bit_map in accum.sort_maps.values_mut() { - for bitmap in bit_map.values_mut() { - let before = bitmap.len(); - *bitmap &= alive; - if bitmap.len() < before { - cleaned += 1; - } - } - } - cleaned -} - -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -#[cfg(test)] -mod tests { - use super::*; - - fn make_scalars(slot: u32) -> ImageScalars { - ImageScalars { - url: Some(format!("https://example.com/{slot}.jpg").into_boxed_str()), - hash: Some(format!("hash{slot}").into_boxed_str()), - nsfw_level: 1, - user_id: slot as u64 * 7, - image_type: 0, // "image" - sort_at: 1700000000 + slot as u64, - poi: false, - minor: false, - has_meta: true, - on_site: false, - post_id: 100 + slot as u64, - posted_to_id: 200 + slot as u64, - availability: 0, // "Public" - blocked_for: 0, - published_at_ms: 1700000000000 + slot as u64 * 1000, - } - } - - #[test] - fn test_scalars_to_json_basic() { - let scalars = make_scalars(42); - let json = scalars_to_json(42, &scalars, None, &[], &[], &[], &[]); - - let obj = json.as_object().unwrap(); - assert_eq!(obj["id"], 42); - assert_eq!(obj["nsfwLevel"], 1); - assert_eq!(obj["userId"], 42 * 
7); - assert_eq!(obj["type"], "image"); - assert_eq!(obj["url"], "https://example.com/42.jpg"); - assert_eq!(obj["hash"], "hash42"); - assert_eq!(obj["hasMeta"], true); - assert_eq!(obj["tagIds"].as_array().unwrap().len(), 0); - assert_eq!(obj["modelVersionIds"].as_array().unwrap().len(), 0); - } - - #[test] - fn test_scalars_to_json_with_multi_value() { - let scalars = make_scalars(10); - let tags = vec![100u32, 200, 300]; - let tools = vec![50u32]; - let techniques = vec![5u32, 6]; - let mvs = vec![999u32, 888]; - - let json = scalars_to_json(10, &scalars, None, &tags, &tools, &techniques, &mvs); - let obj = json.as_object().unwrap(); - - let tag_ids: Vec = obj["tagIds"].as_array().unwrap() - .iter().map(|v| v.as_i64().unwrap()).collect(); - assert_eq!(tag_ids, vec![100, 200, 300]); - - let tool_ids: Vec = obj["toolIds"].as_array().unwrap() - .iter().map(|v| v.as_i64().unwrap()).collect(); - assert_eq!(tool_ids, vec![50]); - - let mv_ids: Vec = obj["modelVersionIds"].as_array().unwrap() - .iter().map(|v| v.as_i64().unwrap()).collect(); - assert_eq!(mv_ids, vec![999, 888]); - } - - #[test] - fn test_scalars_to_json_with_enrichment() { - let scalars = make_scalars(5); - let enrichment = ResourceEnrichment { - base_model: 3, // SDXL 1.0 - resource_poi: true, - }; - - let json = scalars_to_json(5, &scalars, Some(&enrichment), &[], &[], &[], &[]); - let obj = json.as_object().unwrap(); - - assert_eq!(obj["baseModel"], "SDXL 1.0"); - assert_eq!(obj["poi"], true); // resource_poi OR'd with image poi - } - - #[test] - fn test_scalars_to_json_poi_or() { - // Image poi=true, resource_poi=false → poi=true - let mut scalars = make_scalars(1); - scalars.poi = true; - let json = scalars_to_json(1, &scalars, None, &[], &[], &[], &[]); - assert_eq!(json["poi"], true); - - // Image poi=false, resource_poi=true → poi=true - let scalars2 = make_scalars(2); - let enrichment = ResourceEnrichment { base_model: 0, resource_poi: true }; - let json2 = scalars_to_json(2, &scalars2, 
Some(&enrichment), &[], &[], &[], &[]); - assert_eq!(json2["poi"], true); - - // Image poi=false, resource_poi=false → no poi field - let scalars3 = make_scalars(3); - let json3 = scalars_to_json(3, &scalars3, None, &[], &[], &[], &[]); - assert!(json3.get("poi").is_none()); - } - - #[test] - fn test_scalars_to_json_blocked_for() { - let mut scalars = make_scalars(1); - scalars.blocked_for = 1; // some blocked_for value - let json = scalars_to_json(1, &scalars, None, &[], &[], &[], &[]); - assert_eq!(json["blockedFor"], "blocked"); - } - - #[test] - fn test_bitmap_reconstruction_single_chunk() { - // Simulate the bitmap reconstruction logic for a single chunk - let mut tag_bitmaps: HashMap = HashMap::new(); - - // Tag 100 is on slots 5 and 10 - let mut bm100 = RoaringBitmap::new(); - bm100.insert(5); - bm100.insert(10); - tag_bitmaps.insert(100, bm100); - - // Tag 200 is on slot 5 only - let mut bm200 = RoaringBitmap::new(); - bm200.insert(5); - tag_bitmaps.insert(200, bm200); - - // Tag 300 is on slot 10 only - let mut bm300 = RoaringBitmap::new(); - bm300.insert(10); - tag_bitmaps.insert(300, bm300); - - // Reconstruct for chunk 0..65536 - let chunk_start: u32 = 0; - let chunk_end: u32 = FINALIZE_CHUNK_SIZE; - let mut chunk_tags: Vec> = vec![Vec::new(); FINALIZE_CHUNK_SIZE as usize]; - - for (&tag_id, bm) in &tag_bitmaps { - for slot in bm.range(chunk_start..chunk_end) { - chunk_tags[(slot - chunk_start) as usize].push(tag_id as u32); - } - } - - // Slot 5 should have tags [100, 200] (order may vary) - let mut tags_5 = chunk_tags[5].clone(); - tags_5.sort(); - assert_eq!(tags_5, vec![100, 200]); - - // Slot 10 should have tags [100, 300] (order may vary) - let mut tags_10 = chunk_tags[10].clone(); - tags_10.sort(); - assert_eq!(tags_10, vec![100, 300]); - - // Slot 0 should have no tags - assert!(chunk_tags[0].is_empty()); - } - - #[test] - fn test_bitmap_reconstruction_cross_chunk() { - // Test that slots in different chunks are correctly handled - let mut 
tag_bitmaps: HashMap = HashMap::new(); - - // Tag 100 spans two chunks - let mut bm = RoaringBitmap::new(); - bm.insert(100); // chunk 0 - bm.insert(FINALIZE_CHUNK_SIZE + 50); // chunk 1 - tag_bitmaps.insert(100, bm); - - // Check chunk 0 - let mut chunk0: Vec> = vec![Vec::new(); FINALIZE_CHUNK_SIZE as usize]; - for (&tag_id, bm) in &tag_bitmaps { - for slot in bm.range(0..FINALIZE_CHUNK_SIZE) { - chunk0[(slot) as usize].push(tag_id as u32); - } - } - assert_eq!(chunk0[100], vec![100u32]); - - // Check chunk 1 - let chunk1_start = FINALIZE_CHUNK_SIZE; - let chunk1_end = FINALIZE_CHUNK_SIZE * 2; - let mut chunk1: Vec> = vec![Vec::new(); FINALIZE_CHUNK_SIZE as usize]; - for (&tag_id, bm) in &tag_bitmaps { - for slot in bm.range(chunk1_start..chunk1_end) { - chunk1[(slot - chunk1_start) as usize].push(tag_id as u32); - } - } - assert_eq!(chunk1[50], vec![100u32]); - } - - #[test] - fn test_resource_enrichment_default() { - let enrichment = ResourceEnrichment::default(); - assert_eq!(enrichment.base_model, 0); - assert!(!enrichment.resource_poi); - } -} - // --------------------------------------------------------------------------- // ClickHouse metrics download // --------------------------------------------------------------------------- diff --git a/src/sync/copy_queries.rs b/src/sync/copy_queries.rs deleted file mode 100644 index 6c6d285..0000000 --- a/src/sync/copy_queries.rs +++ /dev/null @@ -1,364 +0,0 @@ -//! PostgreSQL COPY TO STDOUT queries and CSV chunk parser for bulk loading. -//! -//! Each table is streamed independently with no JOINs. -//! -//! This is significantly faster than JOIN-based loading because: -//! - No per-row deserialization through sqlx's type system -//! - No intermediate `Vec` allocation per batch -//! - Streaming backpressure: we process as fast as we can consume -//! 
- No JOINs: each table streams at sequential scan speed - -use bytes::Bytes; -use futures_core::stream::BoxStream; -use sqlx::postgres::PgPoolCopyExt; -use sqlx::PgPool; - -// --------------------------------------------------------------------------- -// COPY query functions — one per table, no JOINs -// --------------------------------------------------------------------------- - -/// Stream Image table via COPY CSV (no JOINs). -/// -/// Columns (13): id, url, nsfwLevel, hash, flags, type, userId, blockedFor, -/// scannedAtSecs, createdAtSecs, postId, width, height -pub async fn copy_images( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT id, url, "nsfwLevel", hash, flags, type::text, - "userId", "blockedFor", - extract(epoch from "scannedAt")::bigint, - extract(epoch from "createdAt")::bigint, - "postId", - width, height - FROM "Image" - ) TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -/// Stream Post table via COPY CSV for enrichment. -/// -/// Columns (4): id, publishedAtSecs, availability, modelVersionId -pub async fn copy_posts( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT id, - extract(epoch from "publishedAt")::bigint, - availability::text, - "modelVersionId" - FROM "Post" - ) TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -/// Stream tags via COPY CSV (unordered). -/// -/// Columns (2): tagId, imageId -pub async fn copy_tags( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT "tagId", "imageId" FROM "TagsOnImageDetails" WHERE disabled = false) TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -/// Stream tools via COPY CSV (unordered). -/// -/// Columns (2): toolId, imageId -pub async fn copy_tools( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT "toolId", "imageId" FROM "ImageTool") TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -/// Stream techniques via COPY CSV (unordered). 
-/// -/// Columns (2): techniqueId, imageId -pub async fn copy_techniques( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT "techniqueId", "imageId" FROM "ImageTechnique") TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -/// Stream ImageResourceNew via COPY CSV (no JOINs). -/// -/// Columns (3): imageId, modelVersionId, detected -pub async fn copy_resources( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT "imageId", "modelVersionId", detected FROM "ImageResourceNew") TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -/// Stream ModelVersion table via COPY CSV for enrichment. -/// -/// Columns (3): id, baseModel, modelId -pub async fn copy_model_versions( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT id, "baseModel", "modelId" FROM "ModelVersion") TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -/// Stream CollectionItem via COPY CSV (accepted image collections only). -/// -/// Columns (2): collectionId, imageId -pub async fn copy_collection_items( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT "collectionId", "imageId" FROM "CollectionItem" WHERE "imageId" IS NOT NULL AND status = 'ACCEPTED') TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -/// Stream Model table via COPY CSV for enrichment. -/// -/// Columns (3): id, poi, type -pub async fn copy_models( - pool: &PgPool, -) -> Result>, sqlx::Error> { - pool.copy_out_raw( - r#"COPY (SELECT id, poi, type::text FROM "Model") TO STDOUT WITH (FORMAT csv)"#, - ) - .await -} - -// --------------------------------------------------------------------------- -// CSV chunk parser -// --------------------------------------------------------------------------- - -/// Incremental CSV parser that buffers across `Bytes` chunk boundaries. -/// -/// PostgreSQL's `COPY ... TO STDOUT WITH (FORMAT csv)` sends data in arbitrary -/// chunk sizes that may split CSV rows mid-line. 
This parser accumulates bytes -/// and yields only complete lines. -pub struct CopyParser { - buffer: Vec, -} - -impl CopyParser { - pub fn new() -> Self { - Self { - buffer: Vec::with_capacity(64 * 1024), - } - } - - /// Feed a chunk of bytes. Returns complete lines that can be parsed. - /// Retains any incomplete trailing line in the internal buffer. - pub fn feed(&mut self, chunk: &[u8]) -> Vec> { - self.buffer.extend_from_slice(chunk); - - let mut lines = Vec::new(); - let mut start = 0; - let mut in_quote = false; - - let buf = &self.buffer; - let len = buf.len(); - let mut i = 0; - - while i < len { - let b = buf[i]; - if b == b'"' { - in_quote = !in_quote; - } else if b == b'\n' && !in_quote { - // Complete line found (excluding the newline). - lines.push(buf[start..i].to_vec()); - start = i + 1; - } - i += 1; - } - - // Keep the incomplete trailing data for the next feed. - if start == len { - self.buffer.clear(); - } else if start > 0 { - // Shift remaining bytes to the front. - let remaining = self.buffer[start..].to_vec(); - self.buffer = remaining; - } - // If start == 0, the entire buffer is an incomplete line — keep as-is. - - lines - } -} - -// --------------------------------------------------------------------------- -// CSV field splitting -// --------------------------------------------------------------------------- - -/// Split a CSV line into fields, handling quoted fields. -/// -/// Rules (PostgreSQL CSV format): -/// - Fields separated by `,` -/// - Quoted fields start and end with `"` -/// - A literal `"` inside a quoted field is represented as `""` -/// - NULL is an empty unquoted field -fn split_csv_fields(line: &[u8]) -> Vec> { - let mut fields = Vec::new(); - let mut i = 0; - let len = line.len(); - - while i <= len { - if i == len { - fields.push(Vec::new()); - break; - } - - if line[i] == b'"' { - // Quoted field. 
- let mut field = Vec::new(); - i += 1; // skip opening quote - while i < len { - if line[i] == b'"' { - if i + 1 < len && line[i + 1] == b'"' { - field.push(b'"'); - i += 2; - } else { - i += 1; - break; - } - } else { - field.push(line[i]); - i += 1; - } - } - fields.push(field); - if i < len && line[i] == b',' { - i += 1; - } - } else { - // Unquoted field — scan until comma or end. - let start = i; - while i < len && line[i] != b',' { - i += 1; - } - fields.push(line[start..i].to_vec()); - if i < len { - i += 1; // skip comma - } else { - break; - } - } - } - - fields -} - -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_parser_basic_lines() { - let mut parser = CopyParser::new(); - let lines = parser.feed(b"100,hello,42\n200,world,99\n"); - assert_eq!(lines.len(), 2); - assert_eq!(lines[0], b"100,hello,42"); - assert_eq!(lines[1], b"200,world,99"); - } - - #[test] - fn test_parser_chunk_boundary() { - let mut parser = CopyParser::new(); - let lines1 = parser.feed(b"100,hello\n200,wor"); - assert_eq!(lines1.len(), 1); - assert_eq!(lines1[0], b"100,hello"); - let lines2 = parser.feed(b"ld\n"); - assert_eq!(lines2.len(), 1); - assert_eq!(lines2[0], b"200,world"); - } - - #[test] - fn test_parser_no_trailing_newline() { - let mut parser = CopyParser::new(); - let lines = parser.feed(b"100,hello\n200,world"); - assert_eq!(lines.len(), 1); - assert_eq!(lines[0], b"100,hello"); - let lines2 = parser.feed(b"\n"); - assert_eq!(lines2.len(), 1); - assert_eq!(lines2[0], b"200,world"); - } - - #[test] - fn test_parser_empty_fields_null() { - let mut parser = CopyParser::new(); - let lines = parser.feed(b"100,,42,,\n"); - assert_eq!(lines.len(), 1); - let fields = split_csv_fields(&lines[0]); - assert_eq!(fields.len(), 5); - assert_eq!(fields[0], b"100"); - assert!(fields[1].is_empty()); 
- assert_eq!(fields[2], b"42"); - assert!(fields[3].is_empty()); - assert!(fields[4].is_empty()); - } - - #[test] - fn test_parser_quoted_field_with_comma() { - let mut parser = CopyParser::new(); - let lines = parser.feed(b"100,\"hello,world\",42\n"); - assert_eq!(lines.len(), 1); - let fields = split_csv_fields(&lines[0]); - assert_eq!(fields.len(), 3); - assert_eq!(fields[1], b"hello,world"); - } - - #[test] - fn test_parser_quoted_field_with_escaped_quote() { - let mut parser = CopyParser::new(); - let lines = parser.feed(b"100,\"say \"\"hi\"\"\",42\n"); - assert_eq!(lines.len(), 1); - let fields = split_csv_fields(&lines[0]); - assert_eq!(fields[1], b"say \"hi\""); - } - - #[test] - fn test_parser_quoted_field_with_newline() { - let mut parser = CopyParser::new(); - let lines = parser.feed(b"100,\"line1\nline2\",42\n"); - assert_eq!(lines.len(), 1); - let fields = split_csv_fields(&lines[0]); - assert_eq!(fields[1], b"line1\nline2"); - } - - #[test] - fn test_split_csv_simple() { - let fields = split_csv_fields(b"a,b,c"); - assert_eq!(fields.len(), 3); - } - - #[test] - fn test_split_csv_trailing_comma() { - let fields = split_csv_fields(b"a,b,"); - assert_eq!(fields.len(), 3); - assert_eq!(fields[2], b""); - } - - #[test] - fn test_multiple_chunks_interleaved() { - let mut parser = CopyParser::new(); - let lines1 = parser.feed(b"1,a\n2,"); - assert_eq!(lines1.len(), 1); - let lines2 = parser.feed(b"b\n3,c\n"); - assert_eq!(lines2.len(), 2); - } -} diff --git a/src/sync/dump_enrichment.rs b/src/sync/dump_enrichment.rs index 5203759..fe8684d 100644 --- a/src/sync/dump_enrichment.rs +++ b/src/sync/dump_enrichment.rs @@ -25,10 +25,9 @@ use ahash::AHashMap as HashMap; use std::io::{self, BufRead, BufReader}; -use std::path::{Path, PathBuf}; +use std::path::PathBuf; use std::sync::Arc; -use crate::dictionary::FieldDictionary; use super::dump_expression::{ ColumnIndex, ComputedFieldDef, CsvRow, ExprValue, FilterExpression, }; @@ -68,28 +67,8 @@ pub struct LookupRow 
{ } impl LookupRow { - /// Get a column value by name. - pub fn get(&self, column: &str) -> Option<&str> { - let idx = self.col_index.get(column)?; - self.values.get(*idx)?.as_deref() - } - - /// Convert to CsvRow for expression evaluation. - pub fn to_csv_row(&self) -> CsvRow<'_> { - let mut row = CsvRow::new(); - for (name, &idx) in self.col_index.as_ref() { - let val = self.values.get(idx).and_then(|v| v.as_deref()); - row.insert(name.as_str(), val); - } - row - } - - /// Iterate over (column_name, value) pairs (non-null only). - pub fn iter_columns(&self) -> impl Iterator { - self.col_index.iter().filter_map(move |(name, &idx)| { - self.values.get(idx)?.as_deref().map(|v| (name.as_str(), v)) - }) - } + // No public methods needed — LookupRow is internal to EnrichmentTable. + // Accessed via indexed path (enrich_indexed_into_with_buf) only. } /// Mmap-backed dense offset index for enrichment lookups. @@ -405,61 +384,6 @@ impl EnrichmentTable { }) } - /// Look up a row by key value (HashMap path only). - /// Look up a row by key (HashMap path only — panics for Mmap-backed tables). - /// For Mmap tables, use enrich_indexed_into or enrich_key_into instead. - pub fn get(&self, key: i64) -> Option<&LookupRow> { - match &self.storage { - EnrichmentStorage::HashMap(data) => data.get(&key), - EnrichmentStorage::Mmap(_) => panic!("get() not supported for Mmap-backed tables — use enrich_indexed_into() or enrich_key_into()"), - } - } - - /// Get the nested child table (if any). - pub fn child(&self) -> Option<&EnrichmentTable> { - self.child.as_deref() - } - - /// Enrich a parent row using this lookup table and its config. 
- /// - /// This is the full enrichment resolution that handles the filter-on-nested pattern: - /// Resources → MV (by modelVersionId) → Model (by modelId) → if type='Checkpoint', set baseModel - pub fn enrich<'a>( - &self, - parent_row: &CsvRow<'a>, - config: &EnrichmentConfig, - ) -> EnrichedFields { - let mut result = EnrichedFields::default(); - - // Get join key from parent row - let join_value = match parent_row.get(config.join_on.as_str()) { - Some(Some(v)) if !v.is_empty() => *v, - _ => return result, - }; - - let join_key: i64 = match join_value.parse() { - Ok(k) => k, - Err(_) => return result, - }; - - self.enrich_key_into(join_key, config, &mut result); - result - } - - /// Enrich using indexed parent row (zero-allocation hot path for 107M+ rows). - /// - /// The parent row is `&[Option<&str>]` + `ColumnIndex` — no HashMap per row. - pub fn enrich_indexed( - &self, - parent_fields: &[Option<&str>], - parent_col_idx: &ColumnIndex, - config: &EnrichmentConfig, - ) -> EnrichedFields { - let mut result = EnrichedFields::default(); - self.enrich_indexed_into(parent_fields, parent_col_idx, config, &mut result); - result - } - /// Enrich into a pre-allocated buffer (avoids Vec reallocation across rows). pub fn enrich_indexed_into( &self, @@ -688,26 +612,6 @@ impl EnrichmentManager { Ok(()) } - /// Enrich a row using all loaded tables. - /// Returns combined enriched fields from all enrichment sources. - pub fn enrich_row<'a>(&self, row: &CsvRow<'a>) -> EnrichedFields { - let mut combined = EnrichedFields::default(); - for (table, config) in self.tables.values() { - let enriched = table.enrich(row, config); - combined.fields.extend(enriched.fields); - combined.computed.extend(enriched.computed); - } - combined - } - - /// Enrich a row using indexed fields (zero-allocation hot path). 
- pub fn enrich_row_indexed(&self, fields: &[Option<&str>], col_idx: &super::dump_expression::ColumnIndex) -> EnrichedFields { - let mut combined = EnrichedFields::default(); - let mut lookup_buf = Vec::new(); - self.enrich_row_indexed_into(fields, col_idx, &mut combined, &mut lookup_buf); - combined - } - /// Enrich a row into a pre-allocated buffer (reuse across rows). /// Avoids Vec reallocation — clear + refill. String allocs still per-row. /// `lookup_buf` is a reusable buffer for mmap-backed table lookups (avoids Vec alloc per row). @@ -719,119 +623,12 @@ impl EnrichmentManager { } } - /// Drop all tables to free memory. Call after the phase completes. - pub fn clear(&mut self) { - self.tables.clear(); - } - - /// Drop a specific table by join_on key. - pub fn drop_table(&mut self, join_on: &str) { - self.tables.remove(join_on); - } - - /// Total estimated memory across all loaded tables. - pub fn total_memory(&self) -> usize { - self.tables.values().map(|(t, _)| t.estimated_memory()).sum() - } - /// Number of loaded tables. pub fn table_count(&self) -> usize { self.tables.len() } } -// ---- Dictionary helpers ---- - -/// Resolve a string value through a FieldDictionary, returning the integer key. -/// -/// This is the clean API for 1.10/1.15#7: pass individual `&FieldDictionary` refs, -/// not a full `HashMap`. -pub fn resolve_dictionary_value(dict: &FieldDictionary, value: &str) -> i64 { - dict.get_or_insert(value) -} - -/// Resolve an ExprValue through a dictionary if it's a string. -/// Returns the bitmap key (i64) for the value. 
-pub fn resolve_expr_to_bitmap_key( - value: &ExprValue, - dict: Option<&FieldDictionary>, -) -> Option { - match value { - ExprValue::Int(n) => Some(*n as u64), - ExprValue::Bool(b) => Some(if *b { 1 } else { 0 }), - ExprValue::Str(s) => { - if let Some(d) = dict { - Some(d.get_or_insert(s) as u64) - } else { - // Try parsing as integer - s.parse::().ok() - } - } - ExprValue::Null => None, - } -} - -/// Collection of field dictionaries for LCS fields, keyed by field name. -/// -/// Thread-safe: FieldDictionary uses DashMap internally. -/// Share via `Arc` across threads. -pub struct DictionarySet { - dicts: HashMap>, -} - -impl DictionarySet { - /// Create a new set with dictionaries for the given field names. - pub fn new(field_names: &[&str]) -> Self { - let mut dicts = HashMap::new(); - for name in field_names { - dicts.insert(name.to_string(), Arc::new(FieldDictionary::new())); - } - Self { dicts } - } - - /// Create from existing dictionaries (e.g., loaded from disk). - pub fn from_existing(dicts: HashMap>) -> Self { - Self { dicts } - } - - /// Get a dictionary by field name. - pub fn get(&self, field: &str) -> Option<&Arc> { - self.dicts.get(field) - } - - /// Resolve a string value for a field, returning the bitmap key. - /// Returns None if the field has no dictionary (not an LCS field). - pub fn resolve(&self, field: &str, value: &str) -> Option { - self.dicts.get(field).map(|d| d.get_or_insert(value)) - } - - /// Persist all dirty dictionaries to disk. 
- pub fn persist_all(&self, dict_dir: &Path) -> io::Result<()> { - std::fs::create_dir_all(dict_dir)?; - for (name, dict) in &self.dicts { - let snapshot = dict.snapshot(); - let path = dict_dir.join(format!("{}.dict", name)); - let json = serde_json::to_string_pretty(&snapshot) - .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; - // Atomic write - let tmp = path.with_extension("dict.tmp"); - std::fs::write(&tmp, &json)?; - std::fs::rename(&tmp, &path)?; - } - Ok(()) - } - - /// Get all dictionary names. - pub fn names(&self) -> Vec<&str> { - self.dicts.keys().map(|s| s.as_str()).collect() - } - - /// Iterate over all dictionaries. - pub fn iter(&self) -> impl Iterator)> { - self.dicts.iter().map(|(k, v)| (k.as_str(), v)) - } -} - // ---- CSV parsing helpers ---- /// Fast extract of a specific column as i64 from a comma-delimited byte line. @@ -937,15 +734,6 @@ pub fn parse_tsv_fields(line: &str) -> Vec<&str> { #[cfg(test)] mod tests { use super::*; - use std::io::Write; - use tempfile::TempDir; - - fn write_csv(dir: &Path, name: &str, content: &str) -> PathBuf { - let path = dir.join(name); - let mut f = std::fs::File::create(&path).unwrap(); - f.write_all(content.as_bytes()).unwrap(); - path - } // ---- CSV parser tests ---- @@ -973,331 +761,4 @@ mod tests { assert_eq!(fields, vec!["1", "hello", "42"]); } - // ---- EnrichmentTable tests ---- - - #[test] - fn test_load_simple_table() { - let dir = TempDir::new().unwrap(); - let csv = write_csv( - dir.path(), - "posts.csv", - "id,publishedAtSecs,availability\n100,1700000000,Public\n200,,Private\n300,1700001000,Public\n", - ); - - let config = EnrichmentConfig { - csv_path: csv, - key: "id".into(), - join_on: "postId".into(), - fields: vec![ - ("publishedAtSecs".into(), "publishedAt".into()), - ("availability".into(), "availability".into()), - ], - computed_fields: vec![], - filter: None, - child: None, - columns: vec![], - }; - - let table = EnrichmentTable::load(&config).unwrap(); - 
assert_eq!(table.row_count, 3); - - // Check row 100 - let row = table.get(100).unwrap(); - assert_eq!(row.get("publishedAtSecs").unwrap(), "1700000000"); - assert_eq!(row.get("availability").unwrap(), "Public"); - - // Check row 200 (null publishedAtSecs) - let row200 = table.get(200).unwrap(); - assert!(row200.get("publishedAtSecs").is_none()); // empty → absent - assert_eq!(row200.get("availability").unwrap(), "Private"); - } - - #[test] - fn test_single_level_enrichment() { - let dir = TempDir::new().unwrap(); - let csv = write_csv( - dir.path(), - "posts.csv", - "id,publishedAtSecs,availability\n100,1700000000,Public\n", - ); - - let config = EnrichmentConfig { - csv_path: csv, - key: "id".into(), - join_on: "postId".into(), - fields: vec![ - ("publishedAtSecs".into(), "publishedAt".into()), - ("availability".into(), "availability".into()), - ], - computed_fields: vec![ - ComputedFieldDef::parse("isPublished", "publishedAtSecs != null", None).unwrap(), - ComputedFieldDef::parse("postedToId", "lookup_key", None).unwrap(), - ], - filter: None, - child: None, - columns: vec![], - }; - - let table = EnrichmentTable::load(&config).unwrap(); - - // Simulate a parent row with postId=100 - let parent: CsvRow = vec![("postId", Some("100"))].into_iter().collect(); - - let enriched = table.enrich(&parent, &config); - - // Direct fields - assert_eq!(enriched.fields.len(), 2); - assert!(enriched.fields.contains(&("publishedAt".into(), "1700000000".into()))); - assert!(enriched.fields.contains(&("availability".into(), "Public".into()))); - - // Computed fields - assert_eq!(enriched.computed.len(), 2); - assert!(enriched - .computed - .contains(&("isPublished".into(), ExprValue::Bool(true)))); - assert!(enriched - .computed - .contains(&("postedToId".into(), ExprValue::Int(100)))); - } - - #[test] - fn test_enrichment_no_match() { - let dir = TempDir::new().unwrap(); - let csv = write_csv(dir.path(), "posts.csv", "id,name\n100,hello\n"); - - let config = EnrichmentConfig { - 
csv_path: csv, - key: "id".into(), - join_on: "postId".into(), - fields: vec![("name".into(), "name".into())], - computed_fields: vec![], - filter: None, - child: None, - columns: vec![], - }; - - let table = EnrichmentTable::load(&config).unwrap(); - let parent: CsvRow = vec![("postId", Some("999"))].into_iter().collect(); - let enriched = table.enrich(&parent, &config); - assert!(enriched.is_empty()); - } - - #[test] - fn test_enrichment_null_join_key() { - let dir = TempDir::new().unwrap(); - let csv = write_csv(dir.path(), "posts.csv", "id,name\n100,hello\n"); - - let config = EnrichmentConfig { - csv_path: csv, - key: "id".into(), - join_on: "postId".into(), - fields: vec![("name".into(), "name".into())], - computed_fields: vec![], - filter: None, - child: None, - columns: vec![], - }; - - let table = EnrichmentTable::load(&config).unwrap(); - - // Missing join key - let parent: CsvRow = HashMap::new(); - let enriched = table.enrich(&parent, &config); - assert!(enriched.is_empty()); - - // Null join key - let parent2: CsvRow = vec![("postId", None)].into_iter().collect(); - let enriched2 = table.enrich(&parent2, &config); - assert!(enriched2.is_empty()); - } - - #[test] - fn test_nested_enrichment_with_filter() { - let dir = TempDir::new().unwrap(); - - // Model versions CSV - let mv_csv = write_csv( - dir.path(), - "model_versions.csv", - "id,baseModel,modelId\n10,SDXL,1000\n20,SD 1.5,2000\n", - ); - - // Models CSV - let models_csv = write_csv( - dir.path(), - "models.csv", - "id,poi,type\n1000,false,Checkpoint\n2000,true,LORA\n", - ); - - // Config: Resources → MV (by modelVersionId) → Model (by modelId, filter: Checkpoint) - let config = EnrichmentConfig { - csv_path: mv_csv, - key: "id".into(), - join_on: "modelVersionId".into(), - fields: vec![("baseModel".into(), "baseModel".into())], - computed_fields: vec![], - filter: None, - columns: vec![], - child: Some(Box::new(EnrichmentConfig { - csv_path: models_csv, - key: "id".into(), - join_on: 
"modelId".into(), - fields: vec![("poi".into(), "poi".into())], - computed_fields: vec![], - filter: Some(FilterExpression::parse("type = 'Checkpoint'").unwrap()), - child: None, - columns: vec![], - })), - }; - - let table = EnrichmentTable::load(&config).unwrap(); - assert_eq!(table.row_count, 2); - assert!(table.child().is_some()); - assert_eq!(table.child().unwrap().row_count, 2); - - // Resource row with MV id=10 (Checkpoint model → filter passes) - let row1: CsvRow = vec![("modelVersionId", Some("10"))].into_iter().collect(); - let enriched1 = table.enrich(&row1, &config); - // baseModel from MV level - assert!(enriched1.fields.contains(&("baseModel".into(), "SDXL".into()))); - // poi from Model level (Checkpoint, filter passed) - assert!(enriched1.fields.contains(&("poi".into(), "false".into()))); - - // Resource row with MV id=20 (LORA model → filter fails) - let row2: CsvRow = vec![("modelVersionId", Some("20"))].into_iter().collect(); - let enriched2 = table.enrich(&row2, &config); - // baseModel from MV level (no filter on MV) - assert!(enriched2.fields.contains(&("baseModel".into(), "SD 1.5".into()))); - // poi NOT present — Model filter (type=Checkpoint) failed for LORA - assert!(!enriched2.fields.iter().any(|(k, _)| k == "poi")); - } - - // ---- EnrichmentManager tests ---- - - #[test] - fn test_manager_load_and_clear() { - let dir = TempDir::new().unwrap(); - let csv = write_csv(dir.path(), "posts.csv", "id,name\n100,hello\n"); - - let mut mgr = EnrichmentManager::new(); - assert_eq!(mgr.table_count(), 0); - - mgr.load(EnrichmentConfig { - csv_path: csv, - key: "id".into(), - join_on: "postId".into(), - fields: vec![("name".into(), "name".into())], - computed_fields: vec![], - filter: None, - child: None, - columns: vec![], - }) - .unwrap(); - - assert_eq!(mgr.table_count(), 1); - assert!(mgr.total_memory() > 0); - - mgr.clear(); - assert_eq!(mgr.table_count(), 0); - } - - #[test] - fn test_manager_enrich_row() { - let dir = TempDir::new().unwrap(); 
- let csv = write_csv( - dir.path(), - "posts.csv", - "id,availability\n100,Public\n200,Private\n", - ); - - let mut mgr = EnrichmentManager::new(); - mgr.load(EnrichmentConfig { - csv_path: csv, - key: "id".into(), - join_on: "postId".into(), - fields: vec![("availability".into(), "availability".into())], - computed_fields: vec![], - filter: None, - child: None, - columns: vec![], - }) - .unwrap(); - - let row: CsvRow = vec![("postId", Some("100"))].into_iter().collect(); - let enriched = mgr.enrich_row(&row); - assert_eq!(enriched.fields.len(), 1); - assert!(enriched.fields.contains(&("availability".into(), "Public".into()))); - } - - // ---- Dictionary tests ---- - - #[test] - fn test_resolve_dictionary_value() { - let dict = FieldDictionary::new(); - let key1 = resolve_dictionary_value(&dict, "Checkpoint"); - let key2 = resolve_dictionary_value(&dict, "LORA"); - let key3 = resolve_dictionary_value(&dict, "Checkpoint"); // same as key1 - assert_ne!(key1, key2); - assert_eq!(key1, key3); - } - - #[test] - fn test_resolve_expr_to_bitmap_key() { - let dict = FieldDictionary::new(); - - // Integer → direct - assert_eq!( - resolve_expr_to_bitmap_key(&ExprValue::Int(42), None), - Some(42) - ); - - // Bool → 0/1 - assert_eq!( - resolve_expr_to_bitmap_key(&ExprValue::Bool(true), None), - Some(1) - ); - - // String with dict → dictionary key - let key = resolve_expr_to_bitmap_key(&ExprValue::Str("Public".into()), Some(&dict)); - assert!(key.is_some()); - - // String without dict → try parse - assert_eq!( - resolve_expr_to_bitmap_key(&ExprValue::Str("42".into()), None), - Some(42) - ); - - // Null → None - assert_eq!(resolve_expr_to_bitmap_key(&ExprValue::Null, None), None); - } - - #[test] - fn test_dictionary_set() { - let set = DictionarySet::new(&["type", "availability", "baseModel"]); - assert_eq!(set.names().len(), 3); - - let key1 = set.resolve("type", "Checkpoint").unwrap(); - let key2 = set.resolve("type", "LORA").unwrap(); - assert_ne!(key1, key2); - - // 
Unknown field → None - assert!(set.resolve("unknown", "value").is_none()); - } - - #[test] - fn test_dictionary_set_persist() { - let dir = TempDir::new().unwrap(); - let dict_dir = dir.path().join("dictionaries"); - - let set = DictionarySet::new(&["type", "availability"]); - set.resolve("type", "Checkpoint"); - set.resolve("type", "LORA"); - set.resolve("availability", "Public"); - - set.persist_all(&dict_dir).unwrap(); - - // Check files exist - assert!(dict_dir.join("type.dict").exists()); - assert!(dict_dir.join("availability.dict").exists()); - } } diff --git a/src/sync/dump_expression.rs b/src/sync/dump_expression.rs index 2e5efc8..3d57589 100644 --- a/src/sync/dump_expression.rs +++ b/src/sync/dump_expression.rs @@ -49,14 +49,6 @@ impl ExprValue { } } - /// Coerce to string. - pub fn as_str_value(&self) -> Option<&str> { - match self { - ExprValue::Str(s) => Some(s.as_str()), - _ => None, - } - } - pub fn is_null(&self) -> bool { matches!(self, ExprValue::Null) } @@ -99,23 +91,10 @@ pub enum Expr { Max(Vec), } -/// Context for expression evaluation. -pub struct EvalContext<'a> { - /// The current CSV row being processed. - pub row: &'a CsvRow<'a>, - /// The enrichment join key value (for `lookup_key` expressions). - pub lookup_key: Option, -} - /// Column name → index mapping for zero-allocation row access. /// Build once from CSV headers, reuse for every row in the phase. pub type ColumnIndex = HashMap; -/// Build a ColumnIndex from CSV header names. -pub fn build_column_index(headers: &[&str]) -> ColumnIndex { - headers.iter().enumerate().map(|(i, &name)| (name.to_string(), i)).collect() -} - /// Zero-allocation evaluation context using column indices. /// The row is a slice of parsed fields — no HashMap per row. pub struct IndexedEvalContext<'a> { @@ -138,122 +117,6 @@ impl<'a> IndexedEvalContext<'a> { } impl Expr { - /// Evaluate the expression against a row context. 
- pub fn eval(&self, ctx: &EvalContext) -> ExprValue { - match self { - Expr::Column(name) => { - match ctx.row.get(name.as_str()) { - Some(Some(val)) if !val.is_empty() => { - // Try to parse as integer first, then keep as string - if let Ok(n) = val.parse::() { - ExprValue::Int(n) - } else if *val == "true" || *val == "t" { - ExprValue::Bool(true) - } else if *val == "false" || *val == "f" { - ExprValue::Bool(false) - } else { - ExprValue::Str(val.to_string()) - } - } - _ => ExprValue::Null, - } - } - Expr::IntLit(n) => ExprValue::Int(*n), - Expr::StrLit(s) => ExprValue::Str(s.clone()), - Expr::BoolLit(b) => ExprValue::Bool(*b), - Expr::NullLit => ExprValue::Null, - Expr::LookupKey => match ctx.lookup_key { - Some(k) => ExprValue::Int(k), - None => ExprValue::Null, - }, - - Expr::BitfieldExtract { expr, shift, mask } => { - let val = expr.eval(ctx); - match val.as_i64() { - Some(n) => ExprValue::Int((n >> shift) & (*mask as i64)), - None => ExprValue::Null, - } - } - - Expr::Eq(left, right) => { - let l = left.eval(ctx); - let r = right.eval(ctx); - // null != null (SQL semantics for filter context) - if l.is_null() && r.is_null() { - // Special case: `col != null` is handled by NotEq - // For `col = null`, we check if left is null - return ExprValue::Bool(true); - } - if l.is_null() || r.is_null() { - return ExprValue::Bool(false); - } - let result = match (&l, &r) { - (ExprValue::Int(a), ExprValue::Int(b)) => a == b, - (ExprValue::Str(a), ExprValue::Str(b)) => a == b, - (ExprValue::Bool(a), ExprValue::Bool(b)) => a == b, - // Cross-type: try i64 comparison - _ => l.as_i64() == r.as_i64(), - }; - ExprValue::Bool(result) - } - - Expr::NotEq(left, right) => { - let l = left.eval(ctx); - let r = right.eval(ctx); - // `col != null` means "col is not null" - if r.is_null() { - return ExprValue::Bool(!l.is_null()); - } - if l.is_null() { - return ExprValue::Bool(true); - } - let result = match (&l, &r) { - (ExprValue::Int(a), ExprValue::Int(b)) => a != b, - 
(ExprValue::Str(a), ExprValue::Str(b)) => a != b, - (ExprValue::Bool(a), ExprValue::Bool(b)) => a != b, - _ => l.as_i64() != r.as_i64(), - }; - ExprValue::Bool(result) - } - - Expr::And(left, right) => { - let l = left.eval(ctx); - if !l.as_bool() { - return ExprValue::Bool(false); - } - let r = right.eval(ctx); - ExprValue::Bool(r.as_bool()) - } - - Expr::Or(left, right) => { - let l = left.eval(ctx); - if l.as_bool() { - return ExprValue::Bool(true); - } - let r = right.eval(ctx); - ExprValue::Bool(r.as_bool()) - } - - Expr::Max(columns) => { - let mut max_val: Option = None; - for col in columns { - if let Some(Some(val)) = ctx.row.get(col.as_str()) { - if let Ok(n) = val.parse::() { - max_val = Some(match max_val { - Some(cur) => cur.max(n), - None => n, - }); - } - } - } - match max_val { - Some(n) => ExprValue::Int(n), - None => ExprValue::Null, - } - } - } - } - /// Evaluate against an indexed row context (zero-allocation per row). /// This is the hot-path method for 107M+ row processing. pub fn eval_indexed(&self, ctx: &IndexedEvalContext) -> ExprValue { @@ -643,12 +506,6 @@ impl FilterExpression { Ok(Self { expr, source: source.to_string() }) } - /// Evaluate the filter against a row. Returns true if the row passes. - pub fn eval(&self, row: &CsvRow, lookup_key: Option) -> bool { - let ctx = EvalContext { row, lookup_key }; - self.expr.eval(&ctx).as_bool() - } - /// Evaluate against an indexed row (zero-allocation hot path). #[inline] pub fn eval_indexed(&self, fields: &[Option<&str>], col_idx: &ColumnIndex, lookup_key: Option) -> bool { @@ -689,36 +546,6 @@ impl ComputedFieldDef { /// Returns `Some(value)` if the field should be set, `None` if it should be skipped. /// For conditional fields (value_column set), returns the value from that column /// only when the expression evaluates to true. 
- pub fn eval(&self, row: &CsvRow, lookup_key: Option) -> Option { - let ctx = EvalContext { row, lookup_key }; - - if let Some(ref value_col) = self.value_column { - // Conditional: expression is a filter, value comes from column - if self.expr.eval(&ctx).as_bool() { - match row.get(value_col.as_str()) { - Some(Some(val)) if !val.is_empty() => { - if let Ok(n) = val.parse::() { - Some(ExprValue::Int(n)) - } else { - Some(ExprValue::Str(val.to_string())) - } - } - _ => None, - } - } else { - None - } - } else { - // Standard: expression IS the value - let val = self.expr.eval(&ctx); - if val.is_null() { - None - } else { - Some(val) - } - } - } - /// Evaluate against an indexed row (zero-allocation hot path). pub fn eval_indexed(&self, fields: &[Option<&str>], col_idx: &ColumnIndex, lookup_key: Option) -> Option { let ctx = IndexedEvalContext { fields, col_idx, lookup_key }; @@ -841,203 +668,6 @@ mod tests { } } - // --- Evaluator tests --- - - #[test] - fn test_eval_identity() { - let expr = parse_expression("id").unwrap(); - let row = make_row(&[("id", "12345")]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Int(12345)); - } - - #[test] - fn test_eval_lookup_key() { - let expr = parse_expression("lookup_key").unwrap(); - let row = CsvRow::new(); - let ctx = EvalContext { row: &row, lookup_key: Some(42) }; - assert_eq!(expr.eval(&ctx), ExprValue::Int(42)); - } - - #[test] - fn test_eval_null_check_present() { - let expr = parse_expression("publishedAtSecs != null").unwrap(); - let row = make_row(&[("publishedAtSecs", "1700000000")]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Bool(true)); - } - - #[test] - fn test_eval_null_check_absent() { - let expr = parse_expression("publishedAtSecs != null").unwrap(); - let row = make_row_with_nulls(&[("publishedAtSecs", None)]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), 
ExprValue::Bool(false)); - } - - #[test] - fn test_eval_equality_string() { - let expr = parse_expression("type = 'Checkpoint'").unwrap(); - let row = make_row(&[("type", "Checkpoint")]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Bool(true)); - } - - #[test] - fn test_eval_equality_string_mismatch() { - let expr = parse_expression("type = 'Checkpoint'").unwrap(); - let row = make_row(&[("type", "LORA")]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Bool(false)); - } - - #[test] - fn test_eval_boolean_false() { - let expr = parse_expression("detected == false").unwrap(); - let row = make_row(&[("detected", "false")]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Bool(true)); - } - - #[test] - fn test_eval_boolean_true() { - let expr = parse_expression("detected == false").unwrap(); - let row = make_row(&[("detected", "true")]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Bool(false)); - } - - #[test] - fn test_eval_bitfield_set() { - // (flags >> 13) & 1 == 1 - let expr = parse_expression("(flags >> 13) & 1 == 1").unwrap(); - let flags = (1i64 << 13).to_string(); - let row = make_row(&[("flags", &flags)]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Bool(true)); - } - - #[test] - fn test_eval_bitfield_unset() { - let expr = parse_expression("(flags >> 13) & 1 == 1").unwrap(); - let row = make_row(&[("flags", "0")]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Bool(false)); - } - - #[test] - fn test_eval_compound_bitfield() { - // hasMeta: (flags >> 13) & 1 == 1 && (flags >> 2) & 1 == 0 - let expr = parse_expression("(flags >> 13) & 1 == 1 && (flags >> 2) & 1 == 0").unwrap(); - // bit 13 set, bit 2 NOT set → true - let flags = (1i64 << 
13).to_string(); - let row = make_row(&[("flags", &flags)]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Bool(true)); - - // bit 13 set, bit 2 ALSO set → false - let flags2 = ((1i64 << 13) | (1i64 << 2)).to_string(); - let row2 = make_row(&[("flags", &flags2)]); - let ctx2 = EvalContext { row: &row2, lookup_key: None }; - assert_eq!(expr.eval(&ctx2), ExprValue::Bool(false)); - } - - #[test] - fn test_eval_max() { - let expr = parse_expression("max(scannedAtSecs, createdAtSecs)").unwrap(); - let row = make_row(&[("scannedAtSecs", "1000"), ("createdAtSecs", "2000")]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Int(2000)); - } - - #[test] - fn test_eval_max_with_null() { - let expr = parse_expression("max(scannedAtSecs, createdAtSecs)").unwrap(); - let row = make_row_with_nulls(&[ - ("scannedAtSecs", None), - ("createdAtSecs", Some("2000")), - ]); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Int(2000)); - } - - // --- Filter expression tests --- - - #[test] - fn test_filter_disabled_tags() { - // (attributes >> 10) & 1 = 0 — skip disabled tags (filter returns true to include) - let filter = FilterExpression::parse("(attributes >> 10) & 1 = 0").unwrap(); - - // Not disabled (bit 10 not set) → include - let row = make_row(&[("attributes", "0")]); - assert!(filter.eval(&row, None)); - - // Disabled (bit 10 set) → exclude - let disabled = (1i64 << 10).to_string(); - let row2 = make_row(&[("attributes", &disabled)]); - assert!(!filter.eval(&row2, None)); - } - - // --- Computed field tests --- - - #[test] - fn test_computed_has_meta() { - let cf = ComputedFieldDef::parse("hasMeta", "(flags >> 13) & 1 == 1 && (flags >> 2) & 1 == 0", None).unwrap(); - let flags = (1i64 << 13).to_string(); - let row = make_row(&[("flags", &flags)]); - assert_eq!(cf.eval(&row, None), Some(ExprValue::Bool(true))); - } - - #[test] - fn 
test_computed_is_published() { - let cf = ComputedFieldDef::parse("isPublished", "publishedAtSecs != null", None).unwrap(); - let row = make_row(&[("publishedAtSecs", "1700000000")]); - assert_eq!(cf.eval(&row, None), Some(ExprValue::Bool(true))); - - let row2 = make_row_with_nulls(&[("publishedAtSecs", None)]); - // false is not null, so it should return Some(Bool(false)) - assert_eq!(cf.eval(&row2, None), Some(ExprValue::Bool(false))); - } - - #[test] - fn test_computed_posted_to_id() { - let cf = ComputedFieldDef::parse("postedToId", "lookup_key", None).unwrap(); - let row = CsvRow::new(); - assert_eq!(cf.eval(&row, Some(999)), Some(ExprValue::Int(999))); - } - - #[test] - fn test_computed_conditional_multi_value() { - // modelVersionIdsManual: detected == false, value = modelVersionId - let cf = ComputedFieldDef::parse( - "modelVersionIdsManual", - "detected == false", - Some("modelVersionId"), - ).unwrap(); - - // detected=false → include with modelVersionId value - let row = make_row(&[("detected", "false"), ("modelVersionId", "42")]); - assert_eq!(cf.eval(&row, None), Some(ExprValue::Int(42))); - - // detected=true → skip - let row2 = make_row(&[("detected", "true"), ("modelVersionId", "42")]); - assert_eq!(cf.eval(&row2, None), None); - } - - #[test] - fn test_computed_max_sort() { - let cf = ComputedFieldDef::parse("existedAt", "max(scannedAtSecs, createdAtSecs)", None).unwrap(); - let row = make_row(&[("scannedAtSecs", "100"), ("createdAtSecs", "200")]); - assert_eq!(cf.eval(&row, None), Some(ExprValue::Int(200))); - } - - #[test] - fn test_computed_identity() { - let cf = ComputedFieldDef::parse("id", "id", None).unwrap(); - let row = make_row(&[("id", "12345")]); - assert_eq!(cf.eval(&row, None), Some(ExprValue::Int(12345))); - } - // --- Error handling tests --- #[test] @@ -1054,12 +684,4 @@ mod tests { fn test_parse_unmatched_paren() { assert!(parse_expression("(flags >> 13").is_err()); } - - #[test] - fn test_eval_missing_column() { - let expr = 
parse_expression("missing_col").unwrap(); - let row = CsvRow::new(); - let ctx = EvalContext { row: &row, lookup_key: None }; - assert_eq!(expr.eval(&ctx), ExprValue::Null); - } } diff --git a/src/sync/dump_processor.rs b/src/sync/dump_processor.rs index 5132448..4fb0daa 100644 --- a/src/sync/dump_processor.rs +++ b/src/sync/dump_processor.rs @@ -545,29 +545,8 @@ impl<'a> ParsedRow<'a> { self.get_i64(slot_field).map(|v| v as u32) } - /// Convert to Nate's CsvRow format for expression/enrichment evaluation. - pub fn to_csv_row<'b>(&'b self) -> CsvRow<'b> { - let mut row = CsvRow::new(); - for (name, &idx) in self.col_index { - if let Some(bytes) = self.fields.get(idx) { - if bytes.is_empty() { - row.insert(name.as_str(), None); - } else { - let s = if bytes.len() >= 2 && bytes[0] == b'"' && bytes[bytes.len() - 1] == b'"' { - std::str::from_utf8(&bytes[1..bytes.len() - 1]).ok() - } else { - std::str::from_utf8(bytes).ok() - }; - row.insert(name.as_str(), s); - } - } - } - row - } - /// Build indexed fields for zero-allocation expression evaluation. /// Returns a Vec> aligned to the column index positions. - /// Much cheaper than to_csv_row() — no HashMap allocation. pub fn to_indexed_fields<'b>(&'b self) -> Vec> { self.fields .iter() diff --git a/src/sync/ingester.rs b/src/sync/ingester.rs index 8812797..be33083 100644 --- a/src/sync/ingester.rs +++ b/src/sync/ingester.rs @@ -1,15 +1,11 @@ //! Bitmap sink traits and implementations for document ingestion. //! -//! Two bitmap sinks: -//! - `CoalescerSink`: sends MutationOps to the write coalescer channel (online upserts) -//! - `AccumSink`: inserts directly into a BitmapAccum (bulk loading) +//! Provides `CoalescerSink`: sends MutationOps to the write coalescer channel (online upserts). +//! The AccumSink (bulk loading) has been removed along with the V1 bulk loader. 
use std::sync::Arc; -use roaring::RoaringBitmap; - use crate::error::Result; -use super::loader::BitmapAccum; use crate::mutation::{MutationOp, MutationSender}; /// Trait for sinking bitmap mutations during document ingestion. @@ -127,100 +123,3 @@ impl BitmapSink for CoalescerSink { } } -/// BitmapSink that inserts directly into a BitmapAccum. -/// Used by the bulk loading path where bitmaps are accumulated in-memory -/// and applied to staging in one shot. -pub struct AccumSink<'a> { - accum: &'a mut BitmapAccum, -} - -impl<'a> AccumSink<'a> { - #[allow(dead_code)] - pub(crate) fn new(accum: &'a mut BitmapAccum) -> Self { - Self { accum } - } -} - -impl<'a> BitmapSink for AccumSink<'a> { - fn filter_insert(&mut self, field: Arc, value: u64, slot: u32) { - let field_name: &str = &field; - if let Some(value_map) = self.accum.filter_maps.get_mut(field_name) { - value_map - .entry(value) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - } - - fn filter_remove(&mut self, _field: Arc, _value: u64, _slot: u32) { - // Bulk loading never removes — this is a fresh insert path. - } - - fn sort_set(&mut self, field: Arc, bit_layer: usize, slot: u32) { - let field_name: &str = &field; - if let Some(layer_map) = self.accum.sort_maps.get_mut(field_name) { - layer_map - .entry(bit_layer) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - } - - fn sort_clear(&mut self, _field: Arc, _bit_layer: usize, _slot: u32) { - // Bulk loading never clears sort layers. - } - - fn alive_insert(&mut self, slot: u32) { - self.accum.alive.insert(slot); - } - - fn alive_remove(&mut self, _slot: u32) { - // Bulk loading never removes alive bits. - } - - fn deferred_alive(&mut self, _slot: u32, _activate_at: u64) { - // In dump mode, deferred alive is a no-op for AccumSink. - // The slot is NOT added to the alive bitmap (skipped in the caller). - // The deferred alive map is built separately by the dump pipeline - // and applied to the engine after the dump completes. 
- } - - fn flush(&mut self) -> Result<()> { - Ok(()) // Accum is in-memory, nothing to flush. - } -} - - - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_accum_sink() { - let mut accum = BitmapAccum::new( - &["nsfwLevel".to_string()], - &[("reactionCount".to_string(), 32)], - ); - - { - let mut sink = AccumSink::new(&mut accum); - sink.filter_insert(Arc::from("nsfwLevel"), 1, 10); - sink.filter_insert(Arc::from("nsfwLevel"), 1, 20); - sink.filter_insert(Arc::from("nsfwLevel"), 2, 30); - sink.sort_set(Arc::from("reactionCount"), 0, 10); - sink.sort_set(Arc::from("reactionCount"), 1, 10); - sink.alive_insert(10); - sink.alive_insert(20); - sink.alive_insert(30); - } - - assert_eq!(accum.alive.len(), 3); - let nsfw_map = &accum.filter_maps["nsfwLevel"]; - assert_eq!(nsfw_map[&1].len(), 2); - assert_eq!(nsfw_map[&2].len(), 1); - let sort_map = &accum.sort_maps["reactionCount"]; - assert_eq!(sort_map[&0].len(), 1); - assert_eq!(sort_map[&1].len(), 1); - } -} diff --git a/src/sync/loader.rs b/src/sync/loader.rs deleted file mode 100644 index e057bf1..0000000 --- a/src/sync/loader.rs +++ /dev/null @@ -1,1855 +0,0 @@ -//! Generic NDJSON loader — converts arbitrary NDJSON files to engine Documents -//! using a DataSchema definition. -//! -//! Three-stage pipeline: -//! Stage 1 (reader thread): reads raw bytes from disk into blocks -//! Stage 2 (parse thread): rayon fold+reduce → bitmap maps + full docs (fused) -//! Stage 3 (main thread): apply bitmaps to staging + async docstore writes -//! -//! Key optimization: bitmaps are built directly from JSON during parse — no -//! intermediate Document allocation for the bitmap path. The old decompose/merge -//! pipeline in put_bulk_into is bypassed entirely. 
- -use std::collections::{HashMap, HashSet}; -use std::fs::File; -use std::io::Read as _; -use std::path::Path; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Arc; -use std::thread; -use std::time::{Duration, Instant}; - -use rayon::prelude::*; -use roaring::RoaringBitmap; - -use crate::engine::ConcurrentEngine; -use crate::config::{DataSchema, FieldMapping, FieldValueType}; -use crate::dictionary::FieldDictionary; -use crate::mutation::{Document, FieldValue}; -use crate::query::Value; -#[cfg(test)] -use crate::silos::doc_format::StoredDoc; - -/// Statistics from a completed load operation. -#[derive(Debug, Clone)] -pub struct LoadStats { - pub records_loaded: u64, - pub elapsed: Duration, - pub errors_skipped: u64, -} - -/// Bitmap accumulator for rayon fold+reduce. -/// Each rayon task builds its own instance; reduce merges them with bitmap OR. -pub(crate) struct BitmapAccum { - pub(crate) filter_maps: HashMap>, - pub(crate) sort_maps: HashMap>, - pub(crate) alive: RoaringBitmap, - /// Pre-encoded msgpack bytes — encoding happens in the rayon fold so - /// BulkWriter does pure I/O with no rayon contention. - pub(crate) encoded_docs: Vec<(u32, Vec)>, - /// Deferred alive slots: (slot, activate_at_secs). These slots have - /// filter/sort bitmaps set but alive is NOT set — deferred until timestamp. 
- pub(crate) deferred_alive: Vec<(u32, u64)>, - pub(crate) count: usize, - pub(crate) errors: u64, -} - -impl BitmapAccum { - pub(crate) fn new(filter_names: &[String], sort_configs: &[(String, u8)]) -> Self { - let mut filter_maps = HashMap::with_capacity(filter_names.len()); - for name in filter_names { - filter_maps.insert(name.clone(), HashMap::new()); - } - let mut sort_maps = HashMap::with_capacity(sort_configs.len()); - for (name, bits) in sort_configs { - sort_maps.insert(name.clone(), HashMap::with_capacity(*bits as usize)); - } - BitmapAccum { - filter_maps, - sort_maps, - alive: RoaringBitmap::new(), - encoded_docs: Vec::new(), - deferred_alive: Vec::new(), - count: 0, - errors: 0, - } - } - - /// Save this accumulator to a checkpoint file for crash recovery. - /// - /// Format: [alive_len:u64][alive_bytes][filter_count:u64] - /// for each filter: [name_len:u64][name_bytes][value_count:u64] - /// for each value: [value:u64][bitmap_len:u64][bitmap_bytes] - /// [sort_count:u64] - /// for each sort: [name_len:u64][name_bytes][bit_count:u64] - /// for each bit: [bit:u64][bitmap_len:u64][bitmap_bytes] - #[allow(dead_code)] - pub(crate) fn save_checkpoint(&self, path: &std::path::Path) -> std::io::Result<()> { - let mut buf = Vec::with_capacity(64 * 1024 * 1024); - - // Alive bitmap - let alive_bytes = self.alive.serialized_size(); - buf.extend_from_slice(&(alive_bytes as u64).to_le_bytes()); - self.alive.serialize_into(&mut buf)?; - - // Filter maps - buf.extend_from_slice(&(self.filter_maps.len() as u64).to_le_bytes()); - for (name, value_map) in &self.filter_maps { - let name_bytes = name.as_bytes(); - buf.extend_from_slice(&(name_bytes.len() as u64).to_le_bytes()); - buf.extend_from_slice(name_bytes); - buf.extend_from_slice(&(value_map.len() as u64).to_le_bytes()); - for (&value, bitmap) in value_map { - buf.extend_from_slice(&value.to_le_bytes()); - let bm_size = bitmap.serialized_size(); - buf.extend_from_slice(&(bm_size as u64).to_le_bytes()); - 
bitmap.serialize_into(&mut buf)?; - } - } - - // Sort maps - buf.extend_from_slice(&(self.sort_maps.len() as u64).to_le_bytes()); - for (name, bit_map) in &self.sort_maps { - let name_bytes = name.as_bytes(); - buf.extend_from_slice(&(name_bytes.len() as u64).to_le_bytes()); - buf.extend_from_slice(name_bytes); - buf.extend_from_slice(&(bit_map.len() as u64).to_le_bytes()); - for (&bit, bitmap) in bit_map { - buf.extend_from_slice(&(bit as u64).to_le_bytes()); - let bm_size = bitmap.serialized_size(); - buf.extend_from_slice(&(bm_size as u64).to_le_bytes()); - bitmap.serialize_into(&mut buf)?; - } - } - - // Atomic write: write to temp file, then rename - let tmp = path.with_extension("tmp"); - std::fs::write(&tmp, &buf)?; - std::fs::rename(&tmp, path)?; - eprintln!( - "Checkpoint saved: {} ({:.1} MB)", - path.display(), - buf.len() as f64 / (1024.0 * 1024.0) - ); - Ok(()) - } - - /// Load an accumulator from a checkpoint file. - #[allow(dead_code)] - pub(crate) fn load_checkpoint(path: &std::path::Path) -> std::io::Result { - let data = std::fs::read(path)?; - let mut pos = 0; - - let read_u64 = |pos: &mut usize| -> u64 { - let val = u64::from_le_bytes(data[*pos..*pos + 8].try_into().unwrap()); - *pos += 8; - val - }; - - // Alive bitmap - let alive_len = read_u64(&mut pos) as usize; - let alive = RoaringBitmap::deserialize_from(&data[pos..pos + alive_len]) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; - pos += alive_len; - - // Filter maps - let filter_count = read_u64(&mut pos) as usize; - let mut filter_maps = HashMap::with_capacity(filter_count); - for _ in 0..filter_count { - let name_len = read_u64(&mut pos) as usize; - let name = String::from_utf8_lossy(&data[pos..pos + name_len]).into_owned(); - pos += name_len; - let value_count = read_u64(&mut pos) as usize; - let mut value_map = HashMap::with_capacity(value_count); - for _ in 0..value_count { - let value = read_u64(&mut pos); - let bm_size = read_u64(&mut pos) as usize; - let 
bitmap = RoaringBitmap::deserialize_from(&data[pos..pos + bm_size]) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; - pos += bm_size; - value_map.insert(value, bitmap); - } - filter_maps.insert(name, value_map); - } - - // Sort maps - let sort_count = read_u64(&mut pos) as usize; - let mut sort_maps = HashMap::with_capacity(sort_count); - for _ in 0..sort_count { - let name_len = read_u64(&mut pos) as usize; - let name = String::from_utf8_lossy(&data[pos..pos + name_len]).into_owned(); - pos += name_len; - let bit_count = read_u64(&mut pos) as usize; - let mut bit_map = HashMap::with_capacity(bit_count); - for _ in 0..bit_count { - let bit = read_u64(&mut pos) as usize; - let bm_size = read_u64(&mut pos) as usize; - let bitmap = RoaringBitmap::deserialize_from(&data[pos..pos + bm_size]) - .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e))?; - pos += bm_size; - bit_map.insert(bit, bitmap); - } - sort_maps.insert(name, bit_map); - } - - eprintln!( - "Checkpoint loaded: {} ({:.1} MB, {} alive)", - path.display(), - data.len() as f64 / (1024.0 * 1024.0), - alive.len() - ); - - Ok(BitmapAccum { - filter_maps, - sort_maps, - alive, - encoded_docs: Vec::new(), - deferred_alive: Vec::new(), - count: 0, - errors: 0, - }) - } - - #[cfg(test)] - pub(crate) fn alive_len(&self) -> u64 { - self.alive.len() - } - - pub(crate) fn merge(mut self, other: Self) -> Self { - self.alive |= &other.alive; - for (field, value_map) in other.filter_maps { - let target = self.filter_maps.entry(field).or_default(); - for (value, bm) in value_map { - target - .entry(value) - .and_modify(|e| *e |= &bm) - .or_insert(bm); - } - } - for (field, bit_map) in other.sort_maps { - let target = self.sort_maps.entry(field).or_default(); - for (bit, bm) in bit_map { - target - .entry(bit) - .and_modify(|e| *e |= &bm) - .or_insert(bm); - } - } - self.encoded_docs.extend(other.encoded_docs); - self.deferred_alive.extend(other.deferred_alive); - self.count += 
other.count;
        self.errors += other.errors;
        self
    }
}

/// Load an NDJSON file into an engine using the given data schema.
///
/// Three-stage pipeline: a reader thread streams newline-aligned byte blocks,
/// a parse thread fans each block out over rayon (JSON → bitmap maps), and the
/// main thread applies the merged bitmaps to a staging snapshot.
///
/// - `engine`: target ConcurrentEngine (must already be constructed with the right config)
/// - `schema`: field mapping rules for converting raw JSON → Documents
/// - `path`: path to the NDJSON file
/// - `limit`: optional max records to load
/// - `threads`: number of threads (unused — rayon manages parallelism)
/// - `chunk_size`: number of full docs to accumulate before flushing docstore
/// - `docstore_batch_size`: unused
/// - `max_writer_threads`: max concurrent docstore writer threads (0 = unbounded)
/// - `progress`: atomic counter updated as records are loaded (for progress polling)
// NOTE(review): generic parameters were garbled in the reviewed copy; `Option<usize>`,
// `Arc<AtomicU64>` and the `Result` error type are reconstructed from usage — confirm
// against the original signature.
pub fn load_ndjson(
    engine: &ConcurrentEngine,
    schema: &DataSchema,
    path: &Path,
    limit: Option<usize>,
    _threads: usize,
    chunk_size: usize,
    _docstore_batch_size: usize,
    max_writer_threads: usize,
    progress: Arc<AtomicU64>,
) -> Result<LoadStats, String> {
    let record_limit = limit.unwrap_or(usize::MAX);
    let _chunk_size = chunk_size; // kept for API compat; docstore flushes per block now
    let read_batch_size: usize = 500_000;
    let target_batch_bytes = read_batch_size * 600;

    // Pre-build field lookup tables for direct bitmap extraction
    let config = engine.config();
    let filter_names: Vec<String> = config.filter_fields.iter().map(|f| f.name.clone()).collect();
    let sort_configs: Vec<(String, u8)> = config
        .sort_fields
        .iter()
        .map(|f| (f.name.clone(), f.bits))
        .collect();
    let filter_set: HashSet<String> = filter_names.iter().cloned().collect();
    let sort_bits: HashMap<String, u8> = sort_configs.iter().cloned().collect();

    // ---- Stage 1: Reader thread ----
    // Reads raw bytes from disk in large blocks, split on newline boundaries.
    let data_path_owned = path.to_owned();
    let (block_tx, block_rx) = std::sync::mpsc::sync_channel::<Vec<u8>>(2);

    let reader_handle = thread::spawn(move || {
        let file = File::open(&data_path_owned).expect("Failed to open data file");
        let mut reader = std::io::BufReader::with_capacity(16 * 1024 * 1024, file);
        let mut buf = vec![0u8; 4 * 1024 * 1024];
        let mut accum = Vec::<u8>::with_capacity(target_batch_bytes + 4 * 1024 * 1024);

        loop {
            // NOTE(review): read errors are treated as EOF here (best-effort load).
            let bytes_read = reader.read(&mut buf).unwrap_or(0);
            if bytes_read == 0 {
                if !accum.is_empty() {
                    let _ = block_tx.send(accum);
                }
                break;
            }
            accum.extend_from_slice(&buf[..bytes_read]);

            if accum.len() >= target_batch_bytes {
                if let Some(last_nl) = memrchr_newline(&accum) {
                    // Split on the last newline so each sent block holds whole lines.
                    // FIX: the previous code allocated a fresh Vec via `mem::replace`
                    // and then immediately overwrote `accum` with `remainder`, dropping
                    // the new allocation unused on every batch. Swap the remainder in
                    // directly instead — identical batch contents, one less allocation.
                    let remainder = accum[last_nl + 1..].to_vec();
                    accum.truncate(last_nl + 1);
                    let batch = std::mem::replace(&mut accum, remainder);
                    if block_tx.send(batch).is_err() {
                        break;
                    }
                }
            }
        }
    });

    // Register field names with the docstore field dictionary.
    // TODO: BitmapSilo (Phase 3) — replace with DataSilo BulkWriter when wired.
    let all_field_names: Vec<String> = schema
        .fields
        .iter()
        .map(|f| f.target.clone())
        .chain(std::iter::once("id".to_string()))
        .collect();
    // Set up field defaults for write-side elision before creating the BulkWriter
    engine.set_docstore_defaults(schema);
    engine.prepare_field_names(&all_field_names).expect("prepare_field_names");
    let bulk_writer = Arc::new(()); // TODO: BitmapSilo Phase 3 — stub, replace with DataSilo BulkWriter

    // ---- Stage 2: Fused parse + bitmap build + doc encode thread ----
    // Rayon fold+reduce: JSON → bitmap maps + pre-encoded msgpack bytes in one pass.
    // No intermediate Document for the bitmap path; encoding in-fold avoids rayon contention.
    let schema_ref = schema.clone();
    let filter_names_clone = filter_names.clone();
    let sort_configs_clone = sort_configs.clone();
    let filter_set_clone = filter_set;
    let sort_bits_clone = sort_bits;
    let parse_writer = Arc::clone(&bulk_writer);
    let (chunk_tx, chunk_rx) = std::sync::mpsc::sync_channel::<BitmapAccum>(2);

    // Check if there are LowCardinalityString fields; if so, get dictionaries from engine
    let has_lcs = schema.fields.iter().any(|f| f.value_type == FieldValueType::LowCardinalityString);
    let dicts_arc: Option<Arc<HashMap<String, FieldDictionary>>> = if has_lcs {
        Some(engine.dictionaries_arc())
    } else {
        None
    };

    let id_field = schema_ref.id_field.clone();
    let dicts_clone = dicts_arc;
    let parse_handle = thread::spawn(move || {
        let mut total_parsed: usize = 0;

        while let Ok(raw_block) = block_rx.recv() {
            if total_parsed >= record_limit {
                break;
            }

            // Invalid UTF-8 blocks are dropped wholesale (best-effort).
            let block_str = match std::str::from_utf8(&raw_block) {
                Ok(s) => s,
                Err(_) => continue,
            };

            let mut lines: Vec<&str> = block_str
                .split('\n')
                .map(|l| l.trim_end_matches('\r'))
                .filter(|l| !l.is_empty())
                .collect();

            // Respect limit
            let remaining = record_limit.saturating_sub(total_parsed);
            if lines.len() > remaining {
                lines.truncate(remaining);
            }

            let schema = &schema_ref;
            let f_names = &filter_names_clone;
            let s_configs = &sort_configs_clone;
            let f_set = &filter_set_clone;
            let s_bits = &sort_bits_clone;
            let writer = &parse_writer;
            let id_field_ref = &id_field;
            let dicts = dicts_clone.as_deref();

            // Rayon fold+reduce: each worker builds thread-local bitmap maps
            // AND encodes docs to msgpack bytes — all CPU work in one pass.
            // Slot = document ID (Postgres ID), not a sequential counter.
            let accum = lines
                .into_par_iter()
                .fold(
                    || BitmapAccum::new(f_names, s_configs),
                    |mut acc, line| {
                        match serde_json::from_str::<serde_json::Value>(line) {
                            Ok(json) => {
                                // Extract the document ID to use as the slot
                                let slot = match json
                                    .get(id_field_ref)
                                    .and_then(|v| v.as_u64().or_else(|| v.as_i64().map(|n| n as u64)))
                                {
                                    Some(id) => id as u32,
                                    None => {
                                        acc.errors += 1;
                                        return acc;
                                    }
                                };

                                // TODO: BitmapSilo (Phase 3) — encode doc via DataSilo BulkWriter.
                                // For now, skip doc encoding (bitmaps still built correctly).
                                let _ = writer; // suppress unused warning

                                // Build bitmaps directly from JSON
                                acc.alive.insert(slot);
                                extract_bitmaps_with_dicts(
                                    &json,
                                    schema,
                                    f_set,
                                    s_bits,
                                    slot,
                                    &mut acc.filter_maps,
                                    &mut acc.sort_maps,
                                    dicts,
                                );
                                acc.count += 1;
                            }
                            Err(_) => acc.errors += 1,
                        }
                        acc
                    },
                )
                .reduce(
                    || BitmapAccum::new(f_names, s_configs),
                    |a, b| a.merge(b),
                );

            total_parsed += accum.count;

            if chunk_tx.send(accum).is_err() {
                break;
            }
        }
    });

    // ---- Stage 3: Apply bitmaps + docstore (main thread) ----
    let mut staging = engine.clone_staging();
    let mut total_inserted: usize = 0;
    let mut total_errors: u64 = 0;
    let mut chunks_processed: usize = 0;
    let wall_start = Instant::now();

    let mut ds_handles: Vec<thread::JoinHandle<()>> = Vec::new();
    let writer_cap = if max_writer_threads == 0 { usize::MAX } else { max_writer_threads };

    while let Ok(chunk) = chunk_rx.recv() {
        total_errors += chunk.errors;
        let chunk_count = chunk.count;

        // Apply pre-built bitmaps directly to staging — no decompose/merge needed
        let t0 = Instant::now();
        ConcurrentEngine::apply_bitmap_maps(
            &mut staging,
            chunk.filter_maps,
            chunk.sort_maps,
            chunk.alive,
        );
        let apply_ms = t0.elapsed().as_secs_f64() * 1000.0;

        total_inserted += chunk_count;
        progress.store(total_inserted as u64, Ordering::Release);
        chunks_processed += 1;

        let elapsed = wall_start.elapsed();
        let rate = total_inserted as f64 / elapsed.as_secs_f64();
        eprintln!(
            " chunk {}: {} total ({:.0}/s) apply={:.1}ms",
            chunks_processed, total_inserted, rate, apply_ms
        );

        // Backpressure: wait for a writer to finish before spawning another.
        // Guarded by the len() check, so remove(0) cannot panic on an empty vec.
        if ds_handles.len() >= writer_cap {
            ds_handles.remove(0).join().unwrap();
        }

        // TODO: BitmapSilo (Phase 3) — write encoded docs via DataSilo BulkWriter.
        // For now, skip docstore writes (bitmaps applied correctly above).
        let _ = &bulk_writer; // suppress unused warning
    }

    // Wait for remaining threads
    parse_handle.join().unwrap();
    reader_handle.join().unwrap();
    for h in ds_handles {
        h.join().unwrap();
    }

    // Publish staging snapshot
    engine.publish_staging(staging);

    let elapsed = wall_start.elapsed();
    let rate = total_inserted as f64 / elapsed.as_secs_f64();
    eprintln!(
        "Loaded {} records in {:.1}s ({:.0}/s), errors skipped: {}",
        total_inserted,
        elapsed.as_secs_f64(),
        rate,
        total_errors
    );

    Ok(LoadStats {
        records_loaded: total_inserted as u64,
        elapsed,
        errors_skipped: total_errors,
    })
}

/// Extract bitmap entries directly from JSON into accumulator maps.
/// Skips intermediate Document creation for indexed fields.
#[allow(dead_code)] // Used by sync pipeline (feature-gated)
pub(crate) fn extract_bitmaps(
    json: &serde_json::Value,
    schema: &DataSchema,
    filter_set: &HashSet<String>,
    sort_bits: &HashMap<String, u8>,
    slot: u32,
    filter_maps: &mut HashMap<String, HashMap<u64, RoaringBitmap>>,
    sort_maps: &mut HashMap<String, HashMap<usize, RoaringBitmap>>,
) {
    extract_bitmaps_with_dicts(json, schema, filter_set, sort_bits, slot, filter_maps, sort_maps, None);
}

/// Extract bitmap entries directly from JSON into accumulator maps, with optional dictionaries.
-pub(crate) fn extract_bitmaps_with_dicts( - json: &serde_json::Value, - schema: &DataSchema, - filter_set: &HashSet, - sort_bits: &HashMap, - slot: u32, - filter_maps: &mut HashMap>, - sort_maps: &mut HashMap>, - dictionaries: Option<&HashMap>, -) { - for mapping in &schema.fields { - if mapping.doc_only { - continue; - } - - let is_filter = filter_set.contains(&mapping.target); - let s_bits = sort_bits.get(&mapping.target).copied(); - - if !is_filter && s_bits.is_none() { - continue; - } - - let (raw, apply_ms) = match mapping.resolve_raw(json) { - Some(pair) => pair, - None => { - // ExistsBoolean: field absent → false - if is_filter && matches!(mapping.value_type, FieldValueType::ExistsBoolean) { - if let Some(fm) = filter_maps.get_mut(&mapping.target) { - fm.entry(0) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - } - continue; - } - }; - - if is_filter { - if let Some(fm) = filter_maps.get_mut(&mapping.target) { - let dict = dictionaries.and_then(|d| d.get(&mapping.target)); - extract_filter_value_with_dict(raw, mapping, slot, fm, apply_ms, dict); - } - } - - if let Some(bits) = s_bits { - if let Some(sm) = sort_maps.get_mut(&mapping.target) { - extract_sort_value(raw, mapping, slot, bits, sm, apply_ms); - } - } - } -} - -/// Extract a single filter value, with optional dictionary for LowCardinalityString. 
-pub(crate) fn extract_filter_value_with_dict( - raw: &serde_json::Value, - mapping: &FieldMapping, - slot: u32, - field_map: &mut HashMap, - ms_to_seconds: bool, - dictionary: Option<&FieldDictionary>, -) { - match mapping.value_type { - FieldValueType::Integer => { - if let Some(n) = extract_integer(raw, ms_to_seconds) { - field_map - .entry(n as u64) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - } - FieldValueType::Boolean => { - if let Some(b) = raw.as_bool() { - field_map - .entry(if b { 1 } else { 0 }) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - } - FieldValueType::MappedString => { - if let Some(s) = raw.as_str() { - let lookup = if mapping.case_sensitive { - std::borrow::Cow::Borrowed(s) - } else { - std::borrow::Cow::Owned(s.to_lowercase()) - }; - let n = mapping - .string_map - .as_ref() - .and_then(|m| m.get(lookup.as_ref()).copied()) - .unwrap_or(0); - field_map - .entry(n as u64) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - } - FieldValueType::LowCardinalityString => { - if let Some(s) = raw.as_str() { - if let Some(dict) = dictionary { - let n = dict.get_or_insert(s); - field_map - .entry(n as u64) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - // If no dictionary provided, skip silently (shouldn't happen in practice) - } - } - FieldValueType::IntegerArray => { - if let Some(arr) = raw.as_array() { - for v in arr { - if let Some(n) = v.as_i64().or_else(|| v.as_u64().map(|n| n as i64)) { - field_map - .entry(n as u64) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - } - } - } - FieldValueType::ExistsBoolean => { - field_map - .entry(1) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - FieldValueType::String => {} // String filter fields not supported in bitmap index - } -} - -/// Extract sort value from JSON and insert into bit-layer bitmap maps. 
-pub(crate) fn extract_sort_value( - raw: &serde_json::Value, - mapping: &FieldMapping, - slot: u32, - bits: u8, - bit_map: &mut HashMap, - ms_to_seconds: bool, -) { - let value = match mapping.value_type { - // Sort fields are stored as u32 — clamp negative values to 0 so they don't - // wrap around to u32::MAX and sort incorrectly. - FieldValueType::Integer => { - extract_integer(raw, ms_to_seconds).map(|n| n.max(0) as u32) - } - _ => None, - }; - if let Some(v) = value { - for bit in 0..(bits as usize) { - if (v >> bit) & 1 == 1 { - bit_map - .entry(bit) - .or_insert_with(RoaringBitmap::new) - .insert(slot); - } - } - } -} - -/// Extract an integer from a JSON value, optionally converting ms→seconds. -pub(crate) fn extract_integer(raw: &serde_json::Value, ms_to_seconds: bool) -> Option { - let n = raw - .as_i64() - .or_else(|| raw.as_u64().map(|n| n as i64)) - .or_else(|| raw.as_f64().map(|n| n as i64))?; - Some(if ms_to_seconds { - ((n / 1000) as u32) as i64 - } else { - n - }) -} - -/// Convert a raw JSON value to a StoredDoc using the DataSchema field mappings. -/// Used by tests to verify field mapping correctness. 
-#[cfg(test)] -fn json_to_stored_doc(json: &serde_json::Value, schema: &DataSchema) -> StoredDoc { - let mut fields = HashMap::new(); - - if let Some(id_val) = json.get(&schema.id_field) { - if let Some(n) = id_val.as_i64() { - fields.insert("id".to_string(), FieldValue::Single(Value::Integer(n))); - } else if let Some(n) = id_val.as_u64() { - fields.insert( - "id".to_string(), - FieldValue::Single(Value::Integer(n as i64)), - ); - } - } - - for mapping in &schema.fields { - if mapping.filter_only { - continue; - } - - let (raw, apply_ms) = match mapping.resolve_raw(json) { - Some(pair) => pair, - None => { - match mapping.value_type { - FieldValueType::ExistsBoolean => { - fields.insert( - mapping.target.clone(), - FieldValue::Single(Value::Bool(false)), - ); - } - _ => {} - } - continue; - } - }; - - if let Some(fv) = convert_field(raw, mapping, apply_ms) { - fields.insert(mapping.target.clone(), fv); - } - } - - StoredDoc { fields, schema_version: 0 } -} - -/// Convert a raw JSON object to a `Document` using the given `DataSchema`. -/// -/// Extracts the ID from `schema.id_field` and builds the Document's field map -/// using the schema's field mappings. Returns `(slot_id, Document)` or an error -/// if the ID field is missing or not an integer. -pub fn json_to_document( - json: &serde_json::Value, - schema: &DataSchema, -) -> Result<(u32, Document), String> { - json_to_document_with_dicts(json, schema, None) -} - -/// Convert a raw JSON object to a `Document`, with optional dictionaries for LowCardinalityString fields. 
-pub fn json_to_document_with_dicts( - json: &serde_json::Value, - schema: &DataSchema, - dictionaries: Option<&HashMap>, -) -> Result<(u32, Document), String> { - // Extract ID - let id_val = json - .get(&schema.id_field) - .ok_or_else(|| format!("Missing id field '{}'", schema.id_field))?; - let id = id_val - .as_u64() - .or_else(|| id_val.as_i64().map(|n| n as u64)) - .ok_or_else(|| format!("id field '{}' is not an integer", schema.id_field))?; - let slot = id as u32; - - let mut fields = HashMap::new(); - - // Store the ID in the document fields - fields.insert( - "id".to_string(), - FieldValue::Single(Value::Integer(id as i64)), - ); - - for mapping in &schema.fields { - // filter_only fields are bitmap-indexed only — skip docstore storage - if mapping.filter_only { - continue; - } - - let (raw, apply_ms) = match mapping.resolve_raw(json) { - Some(pair) => pair, - None => { - if matches!(mapping.value_type, FieldValueType::ExistsBoolean) { - fields.insert( - mapping.target.clone(), - FieldValue::Single(Value::Bool(false)), - ); - } - continue; - } - }; - - // Null source values: write explicit defaults so the V2 docstore - // LIFO scan doesn't find stale old values. For fields without a - // default, null is a schema violation → return error. 
- if raw.is_null() { - match mapping.value_type { - FieldValueType::ExistsBoolean => { - fields.insert(mapping.target.clone(), FieldValue::Single(Value::Bool(false))); - } - _ => { - if let Some(ref dv) = mapping.default_value { - let dict = dictionaries.and_then(|d| d.get(&mapping.target)); - if let Some(fv) = convert_field_with_dict(dv, mapping, false, dict) { - fields.insert(mapping.target.clone(), fv); - } - } else if !mapping.doc_only { - return Err(format!( - "field '{}' (source '{}') is null but has no default", - mapping.target, mapping.source - )); - } - } - } - continue; - } - - let dict = dictionaries.and_then(|d| d.get(&mapping.target)); - if let Some(fv) = convert_field_with_dict(raw, mapping, apply_ms, dict) { - fields.insert(mapping.target.clone(), fv); - } - } - - Ok((slot, Document { fields })) -} - -/// Apply computed sort field values to a document. -/// Call this after `json_to_document` when the engine config is available. -/// For each computed sort field, reads source field values from the document, -/// applies the computation (e.g., GREATEST), and inserts the result. -pub fn apply_computed_sort_fields(doc: &mut Document, sort_fields: &[crate::config::SortFieldConfig]) { - use crate::mutation::apply_computed_op; - - for sort_field in sort_fields { - if let Some(ref computed) = sort_field.computed { - let values: Vec = computed.source_fields.iter() - .filter_map(|f| { - doc.fields.get(f).and_then(|fv| match fv { - FieldValue::Single(Value::Integer(v)) => Some((*v).max(0) as u32), - _ => None, - }) - }) - .collect(); - if !values.is_empty() { - let result = apply_computed_op(&computed.op, &values); - doc.fields.insert( - sort_field.name.clone(), - FieldValue::Single(Value::Integer(result as i64)), - ); - } - } - } -} - -/// Convert a raw serde_json Value field to a FieldValue. 
#[allow(dead_code)] // Used by test helpers
fn convert_field(raw: &serde_json::Value, mapping: &FieldMapping, ms_to_seconds: bool) -> Option<FieldValue> {
    convert_field_with_dict(raw, mapping, ms_to_seconds, None)
}

/// Convert a raw serde_json Value field to a FieldValue, with optional dictionary.
///
/// Returns `None` when the raw value cannot be interpreted as the mapping's
/// declared type (wrong JSON type, empty array, etc.).
pub fn convert_field_with_dict(
    raw: &serde_json::Value,
    mapping: &FieldMapping,
    ms_to_seconds: bool,
    dictionary: Option<&FieldDictionary>,
) -> Option<FieldValue> {
    match mapping.value_type {
        FieldValueType::Integer => {
            // Accept i64, u64 or f64 JSON numbers; anything else is rejected.
            let n = if let Some(n) = raw.as_i64() {
                n
            } else if let Some(n) = raw.as_u64() {
                n as i64
            } else if let Some(n) = raw.as_f64() {
                n as i64
            } else {
                return None;
            };
            // ms→s conversion squeezes through u32 — same path as extract_integer.
            let n = if ms_to_seconds {
                ((n / 1000) as u32) as i64
            } else {
                n
            };
            Some(FieldValue::Single(Value::Integer(n)))
        }
        FieldValueType::Boolean => {
            let b = raw.as_bool()?;
            Some(FieldValue::Single(Value::Bool(b)))
        }
        FieldValueType::String => {
            let s = raw.as_str()?;
            Some(FieldValue::Single(Value::String(s.to_string())))
        }
        FieldValueType::MappedString => {
            let s = raw.as_str()?;
            let map = mapping.string_map.as_ref()?;
            let lookup = if mapping.case_sensitive {
                std::borrow::Cow::Borrowed(s)
            } else {
                std::borrow::Cow::Owned(s.to_lowercase())
            };
            // Strings missing from the map fall back to 0.
            let n = map.get(lookup.as_ref()).copied().unwrap_or(0);
            Some(FieldValue::Single(Value::Integer(n)))
        }
        FieldValueType::LowCardinalityString => {
            let s = raw.as_str()?;
            if let Some(dict) = dictionary {
                let n = dict.get_or_insert(s);
                Some(FieldValue::Single(Value::Integer(n)))
            } else {
                // Without a dictionary, store as 0 (unknown)
                Some(FieldValue::Single(Value::Integer(0)))
            }
        }
        FieldValueType::IntegerArray => {
            let arr = raw.as_array()?;
            if arr.is_empty() {
                return None; // empty array → treat field as absent
            }
            let values: Vec<Value> = arr
                .iter()
                .filter_map(|v| {
                    v.as_i64()
                        .or_else(|| v.as_u64().map(|n| n as i64))
                        .map(Value::Integer)
                })
                .collect();
            if values.is_empty() {
                None
            } else {
                Some(FieldValue::Multi(values))
            }
        }
        // Value resolved at all → "exists" is true.
        FieldValueType::ExistsBoolean => Some(FieldValue::Single(Value::Bool(true))),
    }
}

/// Byte offset of the last `\n` in `data`, if any.
fn memrchr_newline(data: &[u8]) -> Option<usize> {
    data.iter().rposition(|&b| b == b'\n')
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_json_to_stored_doc_integer() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "count".into(),
                target: "count".into(),
                value_type: FieldValueType::Integer,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        let json: serde_json::Value = serde_json::json!({"id": 42, "count": 100});
        let doc = json_to_stored_doc(&json, &schema);
        assert_eq!(
            doc.fields.get("id"),
            Some(&FieldValue::Single(Value::Integer(42)))
        );
        assert_eq!(
            doc.fields.get("count"),
            Some(&FieldValue::Single(Value::Integer(100)))
        );
    }

    #[test]
    fn test_json_to_stored_doc_fallback() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "primary".into(),
                target: "val".into(),
                value_type: FieldValueType::Integer,
                fallback: Some("secondary".into()),
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        // "primary" missing → value resolved from the fallback source.
        let json: serde_json::Value = serde_json::json!({"id": 1, "secondary": 99});
        let doc = json_to_stored_doc(&json, &schema);
        assert_eq!(
            doc.fields.get("val"),
            Some(&FieldValue::Single(Value::Integer(99)))
        );
    }

    #[test]
    fn test_json_to_stored_doc_mapped_string() {
        let mut map = HashMap::new();
        map.insert("image".into(), 1);
        map.insert("video".into(), 2);

        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "type".into(),
                target: "type".into(),
                value_type: FieldValueType::MappedString,
                fallback: None,
                string_map: Some(map),
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        let json: serde_json::Value = serde_json::json!({"id": 1, "type": "image"});
        let doc = json_to_stored_doc(&json, &schema);
        assert_eq!(
            doc.fields.get("type"),
            Some(&FieldValue::Single(Value::Integer(1)))
        );
    }

    #[test]
    fn test_json_to_stored_doc_mapped_string_case_insensitive() {
        let mut map = HashMap::new();
        map.insert("Image".into(), 1);
        map.insert("Video".into(), 2);

        let mut schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "type".into(),
                target: "type".into(),
                value_type: FieldValueType::MappedString,
                fallback: None,
                string_map: Some(map),
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false, // default
                default_value: None,
                nullable: false,
            }],
        };
        schema.normalize_string_maps();

        // Uppercase input matches lowercase-normalized map key
        let json: serde_json::Value = serde_json::json!({"id": 1, "type": "IMAGE"});
        let doc = json_to_stored_doc(&json, &schema);
        assert_eq!(
            doc.fields.get("type"),
            Some(&FieldValue::Single(Value::Integer(1)))
        );

        // Mixed case input also matches
        let json2: serde_json::Value = serde_json::json!({"id": 2, "type": "Video"});
        let doc2 = json_to_stored_doc(&json2, &schema);
        assert_eq!(
            doc2.fields.get("type"),
            Some(&FieldValue::Single(Value::Integer(2)))
        );
    }

    #[test]
    fn test_json_to_stored_doc_mapped_string_case_sensitive() {
        let mut map = HashMap::new();
        map.insert("Image".into(), 1);
        map.insert("Video".into(), 2);

        let mut schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "type".into(),
                target: "type".into(),
                value_type: FieldValueType::MappedString,
                fallback: None,
                string_map: Some(map),
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: true,
                default_value: None,
                nullable: false,
            }],
        };
        schema.normalize_string_maps();

        // Exact case matches
        let json: serde_json::Value = serde_json::json!({"id": 1, "type": "Image"});
        let doc = json_to_stored_doc(&json, &schema);
        assert_eq!(
            doc.fields.get("type"),
            Some(&FieldValue::Single(Value::Integer(1)))
        );

        // Wrong case falls back to 0
        let json2: serde_json::Value = serde_json::json!({"id": 2, "type": "image"});
        let doc2 = json_to_stored_doc(&json2, &schema);
        assert_eq!(
            doc2.fields.get("type"),
            Some(&FieldValue::Single(Value::Integer(0)))
        );
    }

    #[test]
    fn test_json_to_stored_doc_boolean() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "hasMeta".into(),
                target: "hasMeta".into(),
                value_type: FieldValueType::Boolean,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        let json: serde_json::Value = serde_json::json!({"id": 1, "hasMeta": true});
        let doc = json_to_stored_doc(&json, &schema);
        assert_eq!(
            doc.fields.get("hasMeta"),
            Some(&FieldValue::Single(Value::Bool(true)))
        );
    }

    #[test]
    fn test_json_to_stored_doc_integer_array() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "tagIds".into(),
                target: "tagIds".into(),
                value_type: FieldValueType::IntegerArray,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        let json: serde_json::Value = serde_json::json!({"id": 1, "tagIds": [10, 20, 30]});
        let doc = json_to_stored_doc(&json, &schema);
        assert_eq!(
            doc.fields.get("tagIds"),
            Some(&FieldValue::Multi(vec![
                Value::Integer(10),
                Value::Integer(20),
                Value::Integer(30),
            ]))
        );
    }

    #[test]
    fn test_json_to_stored_doc_truncate_u32() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "ts".into(),
                target: "ts".into(),
                value_type: FieldValueType::Integer,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: true,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        // Millisecond timestamp → divide by 1000, then cast to u32
        let ms_val: i64 = 1_710_000_000_000; // March 2024 in ms
        let json: serde_json::Value = serde_json::json!({"id": 1, "ts": ms_val});
        let doc = json_to_stored_doc(&json, &schema);
        let expected = (ms_val / 1000) as i64; // 1_710_000_000 — valid seconds
        assert_eq!(
            doc.fields.get("ts"),
            Some(&FieldValue::Single(Value::Integer(expected)))
        );

    }

    #[test]
    fn test_ms_to_seconds_with_fallback() {
        // Mirrors the real civitai config: source=sortAtUnix (ms), fallback=sortAt (seconds)
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "sortAtUnix".into(),
                target: "sortAt".into(),
                value_type: FieldValueType::Integer,
                fallback: Some("sortAt".into()),
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: true,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };

        // Case 1: sortAtUnix present (milliseconds) → divide by 1000
        let json1: serde_json::Value =
            serde_json::json!({"id": 1, "sortAtUnix": 1_684_867_905_000_i64});
        let doc1 = json_to_stored_doc(&json1, &schema);
        assert_eq!(
            doc1.fields.get("sortAt"),
            Some(&FieldValue::Single(Value::Integer(1_684_867_905))),
            "ms timestamp should be divided by 1000"
        );

        // Case 2: sortAtUnix missing, falls back to sortAt (seconds) → NO division
        let json2: serde_json::Value =
            serde_json::json!({"id": 2, "sortAt": 1_684_867_905_i64});
        let doc2 = json_to_stored_doc(&json2, &schema);
        assert_eq!(
            doc2.fields.get("sortAt"),
            Some(&FieldValue::Single(Value::Integer(1_684_867_905))),
            "fallback (seconds) should NOT be divided by 1000"
        );

        // Case 3: sortAtUnix present but null, falls back to sortAt (seconds)
        let json3: serde_json::Value =
            serde_json::json!({"id": 3, "sortAtUnix": null, "sortAt": 1_684_867_905_i64});
        let doc3 = json_to_stored_doc(&json3, &schema);
        assert_eq!(
            doc3.fields.get("sortAt"),
            Some(&FieldValue::Single(Value::Integer(1_684_867_905))),
            "null primary should fall back to seconds without division"
        );

        // Case 4: Both missing → field absent
        let json4: serde_json::Value = serde_json::json!({"id": 4});
        let doc4 = json_to_stored_doc(&json4, &schema);
        assert_eq!(
            doc4.fields.get("sortAt"),
            None,
            "both missing → field should be absent"
        );
    }

    #[test]
    fn test_ms_to_seconds_json_to_document() {
        // Same test through json_to_document (the production path for upserts)
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "sortAtUnix".into(),
                target: "sortAt".into(),
                value_type: FieldValueType::Integer,
                fallback: Some("sortAt".into()),
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: true,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };

        // Primary (ms) → divided
        let json1 = serde_json::json!({"id": 100, "sortAtUnix": 1_684_867_905_000_i64});
        let (slot, doc1) = json_to_document(&json1, &schema).unwrap();
        assert_eq!(slot, 100);
        assert_eq!(
            doc1.fields.get("sortAt"),
            Some(&FieldValue::Single(Value::Integer(1_684_867_905)))
        );

        // Fallback (seconds) → not divided
        let json2 = serde_json::json!({"id": 200, "sortAt": 1_684_867_905_i64});
        let (slot2, doc2) = json_to_document(&json2, &schema).unwrap();
        assert_eq!(slot2, 200);
        assert_eq!(
            doc2.fields.get("sortAt"),
            Some(&FieldValue::Single(Value::Integer(1_684_867_905)))
        );
    }

    #[test]
    fn test_ms_to_seconds_extract_integer() {
        // Direct test of the extraction function
        let ms = serde_json::json!(1_684_867_905_000_i64);
        assert_eq!(extract_integer(&ms, true), Some(1_684_867_905));
        assert_eq!(extract_integer(&ms, false), Some(1_684_867_905_000));

        let sec = serde_json::json!(1_684_867_905_i64);
        assert_eq!(extract_integer(&sec, true), Some(1_684_867));
        assert_eq!(extract_integer(&sec, false), Some(1_684_867_905));
    }

    #[test]
    fn test_json_to_stored_doc_string() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "url".into(),
                target: "url".into(),
                value_type: FieldValueType::String,
                fallback: None,
                string_map: None,
                doc_only: true,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        let json: serde_json::Value = serde_json::json!({"id": 1, "url": "http://example.com"});
        let doc = json_to_stored_doc(&json, &schema);
        assert_eq!(
            doc.fields.get("url"),
            Some(&FieldValue::Single(Value::String(
                "http://example.com".into()
            )))
        );
    }

    #[test]
    fn test_json_to_stored_doc_missing_field_skipped() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "missing".into(),
                target: "val".into(),
                value_type: FieldValueType::Integer,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        let json: serde_json::Value = serde_json::json!({"id": 1});
        let doc = json_to_stored_doc(&json, &schema);
        assert!(doc.fields.get("val").is_none());
    }

    #[test]
    fn test_json_to_stored_doc_null_field_skipped() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "val".into(),
                target: "val".into(),
                value_type: FieldValueType::Integer,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        let json: serde_json::Value = serde_json::json!({"id": 1, "val": null});
        let doc = json_to_stored_doc(&json, &schema);
        assert!(doc.fields.get("val").is_none());
    }

    #[test]
    fn test_json_to_stored_doc_empty_array_skipped() {
        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "tags".into(),
                target: "tags".into(),
                value_type: FieldValueType::IntegerArray,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };
        let json: serde_json::Value = serde_json::json!({"id": 1, "tags": []});
        let doc = json_to_stored_doc(&json, &schema);
        assert!(doc.fields.get("tags").is_none());
    }

    // -----------------------------------------------------------------------
    // LowCardinalityString tests
    // -----------------------------------------------------------------------

    #[test]
    fn test_low_cardinality_string_auto_assignment() {
        use crate::dictionary::FieldDictionary;

        let dict = FieldDictionary::new();
        let mut dicts = HashMap::new();
        dicts.insert("baseModel".to_string(), dict);

        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "baseModel".into(),
                target: "baseModel".into(),
                value_type: FieldValueType::LowCardinalityString,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };

        // First document — "SD 1.5" gets assigned a key
        let json1 = serde_json::json!({"id": 1, "baseModel": "SD 1.5"});
        let (slot1, doc1) = json_to_document_with_dicts(&json1, &schema, Some(&dicts)).unwrap();
        assert_eq!(slot1, 1);
        let k1 = match doc1.fields.get("baseModel") {
            Some(FieldValue::Single(Value::Integer(n))) => *n,
            _ => panic!("expected integer"),
        };
        assert!(k1 >= 1, "auto-assigned key should be >= 1");

        // Second document — same string gets same key
        let json2 = serde_json::json!({"id": 2, "baseModel": "SD 1.5"});
        let (_, doc2) = json_to_document_with_dicts(&json2, &schema, Some(&dicts)).unwrap();
        let k2 = match doc2.fields.get("baseModel") {
            Some(FieldValue::Single(Value::Integer(n))) => *n,
            _ => panic!("expected integer"),
        };
        assert_eq!(k1, k2, "same string should get same key");

        // Third document — different string gets different key
        let json3 = serde_json::json!({"id": 3, "baseModel": "SDXL 1.0"});
        let (_, doc3) = json_to_document_with_dicts(&json3, &schema, Some(&dicts)).unwrap();
        let k3 = match doc3.fields.get("baseModel") {
            Some(FieldValue::Single(Value::Integer(n))) => *n,
            _ => panic!("expected integer"),
        };
        assert_ne!(k1, k3, "different string should get different key");
    }

    #[test]
    fn test_low_cardinality_string_case_insensitive() {
        use crate::dictionary::FieldDictionary;

        let dict = FieldDictionary::new();
        let mut dicts = HashMap::new();
        dicts.insert("type".to_string(), dict);

        let schema = DataSchema {
            id_field: "id".into(),
            schema_version: 1,
            fields: vec![FieldMapping {
                source: "type".into(),
                target: "type".into(),
                value_type: FieldValueType::LowCardinalityString,
                fallback: None,
                string_map: None,
                doc_only: false,
                filter_only: false,
                ms_to_seconds: false,
                truncate_u32: false,
                case_sensitive: false,
                default_value: None,
                nullable: false,
            }],
        };

        let json1 = serde_json::json!({"id": 1, "type": "Image"});
        let (_, doc1) = json_to_document_with_dicts(&json1, &schema, Some(&dicts)).unwrap();
        let k1 = match doc1.fields.get("type") {
            Some(FieldValue::Single(Value::Integer(n))) => *n,
            _ => panic!("expected integer"),
        };

        // Different casing should get same key
        let json2 = serde_json::json!({"id": 2, "type": "IMAGE"});
        let (_, doc2) = json_to_document_with_dicts(&json2, &schema, Some(&dicts)).unwrap();
        let k2 = match doc2.fields.get("type") {
            Some(FieldValue::Single(Value::Integer(n))) => *n,
            _ => panic!("expected integer"),
        };
        assert_eq!(k1, k2, "case-insensitive: same key for different casing");

        // Original casing preserved in dictionary
        let dict = dicts.get("type").unwrap();
        let snap = dict.snapshot();
        assert_eq!(snap.originals.get("image"), Some(&"Image".to_string()));
    }

    #[test]
    fn test_low_cardinality_string_extract_filter_value() {
        use crate::dictionary::FieldDictionary;

        let dict = FieldDictionary::new();
        let mapping = FieldMapping {
            source: "color".into(),
            target: "color".into(),
            value_type: FieldValueType::LowCardinalityString,
            fallback: None,
            string_map: None,
            doc_only: false,
            filter_only: false,
            ms_to_seconds: false,
            truncate_u32: false,
            case_sensitive: false,
            default_value: None,
            nullable: false,
        };

        let mut field_map: HashMap<u64, RoaringBitmap> = HashMap::new();

        let raw1 = serde_json::json!("Red");
        extract_filter_value_with_dict(&raw1, &mapping, 100, &mut field_map, false, Some(&dict));

        let raw2 = serde_json::json!("Blue");
        extract_filter_value_with_dict(&raw2, &mapping, 200, &mut field_map, false, Some(&dict));

        let raw3 = serde_json::json!("red"); // same as "Red" (case insensitive)
        extract_filter_value_with_dict(&raw3, &mapping, 300, &mut field_map, false, Some(&dict));

        // "Red" and "red" should have the same key
        let red_key = dict.get("Red").unwrap() as u64;
        let blue_key = dict.get("Blue").unwrap() as u64;
        assert_ne!(red_key, blue_key);

        let red_bm = field_map.get(&red_key).unwrap();
        assert!(red_bm.contains(100));
        assert!(red_bm.contains(300)); // "red" maps to same key as "Red"
        assert!(!red_bm.contains(200));

        let blue_bm = field_map.get(&blue_key).unwrap();
        assert!(blue_bm.contains(200));
        assert!(!blue_bm.contains(100));
    }

    #[test]
    fn test_low_cardinality_string_dictionary_persistence() {
        use crate::dictionary::{FieldDictionary, save_dictionary, load_dictionary};

        let dict = FieldDictionary::new();
        dict.get_or_insert("Alpha");
        dict.get_or_insert("Beta");
        dict.get_or_insert("Gamma");

        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("test_field.dict");

        let snap = dict.snapshot();
        save_dictionary(&snap, &path).unwrap();

        let loaded_snap = load_dictionary(&path).unwrap().unwrap();
        let dict2 = FieldDictionary::from_snapshot(&loaded_snap);

        // Same mappings after reload
        assert_eq!(dict2.get("alpha"), dict.get("alpha"));
        assert_eq!(dict2.get("beta"), dict.get("beta"));
        assert_eq!(dict2.get("gamma"), dict.get("gamma"));

        // Original casing preserved
        assert_eq!(loaded_snap.originals.get("alpha"), Some(&"Alpha".to_string()));
    }
}

#[cfg(test)]
mod checkpoint_tests {
    use super::*;

    #[test]
    fn test_checkpoint_roundtrip() {
        let filter_names: Vec<String> = vec!["nsfwLevel", "userId", "tagIds"]
            .into_iter().map(String::from).collect();
        let sort_configs: Vec<(String, u8)> = vec![("sortAt".to_string(), 32), ("id".to_string(), 32)];

        let mut accum = BitmapAccum::new(&filter_names, &sort_configs);

        // Add alive bits
        for i in [100u32, 200, 300, 50000] {
            accum.alive.insert(i);
        }

        // Add filter values
        if let Some(fm) =
accum.filter_maps.get_mut("nsfwLevel") { - fm.entry(1).or_insert_with(RoaringBitmap::new).insert(100); - fm.entry(1).or_insert_with(RoaringBitmap::new).insert(200); - fm.entry(8).or_insert_with(RoaringBitmap::new).insert(300); - } - if let Some(fm) = accum.filter_maps.get_mut("userId") { - fm.entry(42).or_insert_with(RoaringBitmap::new).insert(100); - fm.entry(42).or_insert_with(RoaringBitmap::new).insert(300); - fm.entry(99).or_insert_with(RoaringBitmap::new).insert(200); - } - if let Some(fm) = accum.filter_maps.get_mut("tagIds") { - fm.entry(1000).or_insert_with(RoaringBitmap::new).insert(100); - fm.entry(1000).or_insert_with(RoaringBitmap::new).insert(200); - fm.entry(2000).or_insert_with(RoaringBitmap::new).insert(300); - } - - // Add sort bits (sortAt = 1700000000 for slot 100) - let val: u32 = 1700000000; - if let Some(sm) = accum.sort_maps.get_mut("sortAt") { - for bit in 0..32usize { - if (val >> bit) & 1 == 1 { - sm.entry(bit).or_insert_with(RoaringBitmap::new).insert(100); - } - } - } - - // Save checkpoint - let dir = tempfile::tempdir().unwrap(); - let path = dir.path().join("test.ckpt"); - accum.save_checkpoint(&path).unwrap(); - - // Load checkpoint - let loaded = BitmapAccum::load_checkpoint(&path).unwrap(); - - // Verify alive - assert_eq!(loaded.alive.len(), 4); - assert!(loaded.alive.contains(100)); - assert!(loaded.alive.contains(200)); - assert!(loaded.alive.contains(300)); - assert!(loaded.alive.contains(50000)); - - // Verify filters - let nsfw = loaded.filter_maps.get("nsfwLevel").unwrap(); - assert_eq!(nsfw.get(&1).unwrap().len(), 2); - assert_eq!(nsfw.get(&8).unwrap().len(), 1); - - let users = loaded.filter_maps.get("userId").unwrap(); - assert_eq!(users.get(&42).unwrap().len(), 2); - assert_eq!(users.get(&99).unwrap().len(), 1); - - let tags = loaded.filter_maps.get("tagIds").unwrap(); - assert_eq!(tags.get(&1000).unwrap().len(), 2); - assert_eq!(tags.get(&2000).unwrap().len(), 1); - - // Verify sort bits - let sort_at = 
loaded.sort_maps.get("sortAt").unwrap(); - // Reconstruct the value from bits - let mut reconstructed: u32 = 0; - for bit in 0..32usize { - if let Some(bm) = sort_at.get(&bit) { - if bm.contains(100) { - reconstructed |= 1 << bit; - } - } - } - assert_eq!(reconstructed, 1700000000); - } - - #[test] - fn test_checkpoint_empty_accum() { - let filter_names: Vec = vec!["field1".to_string()]; - let sort_configs: Vec<(String, u8)> = vec![("sort1".to_string(), 16)]; - - let accum = BitmapAccum::new(&filter_names, &sort_configs); - - let dir = tempfile::tempdir().unwrap(); - let path = dir.path().join("empty.ckpt"); - accum.save_checkpoint(&path).unwrap(); - - let loaded = BitmapAccum::load_checkpoint(&path).unwrap(); - assert_eq!(loaded.alive.len(), 0); - assert!(loaded.filter_maps.get("field1").unwrap().is_empty()); - assert!(loaded.sort_maps.get("sort1").unwrap().is_empty()); - } - - #[test] - fn test_filter_only_excluded_from_document() { - // filter_only fields should be bitmap-indexed but NOT stored in the Document - let schema = DataSchema { - id_field: "id".into(), - schema_version: 1, - fields: vec![ - FieldMapping { - source: "tagIds".into(), - target: "tagIds".into(), - value_type: FieldValueType::IntegerArray, - fallback: None, - string_map: None, - doc_only: false, - filter_only: false, - ms_to_seconds: false, - truncate_u32: false, - case_sensitive: false, - default_value: None, - nullable: false, - }, - FieldMapping { - source: "collectionIds".into(), - target: "collectionIds".into(), - value_type: FieldValueType::IntegerArray, - fallback: None, - string_map: None, - doc_only: false, - filter_only: true, - ms_to_seconds: false, - truncate_u32: false, - case_sensitive: false, - default_value: None, - nullable: false, - }, - ], - }; - - let json = serde_json::json!({ - "id": 42, - "tagIds": [10, 20], - "collectionIds": [100, 200] - }); - - // Document should have tagIds but NOT collectionIds - let (slot, doc) = json_to_document(&json, &schema).unwrap(); - 
assert_eq!(slot, 42); - assert!(doc.fields.contains_key("tagIds"), "tagIds should be in Document"); - assert!(!doc.fields.contains_key("collectionIds"), "filter_only field should be excluded from Document"); - - // StoredDoc should also exclude filter_only fields - let stored = json_to_stored_doc(&json, &schema); - assert!(stored.fields.contains_key("tagIds")); - assert!(!stored.fields.contains_key("collectionIds")); - } - - #[test] - fn test_filter_only_still_indexed_in_bitmaps() { - // filter_only fields should still be bitmap-indexed - let schema = DataSchema { - id_field: "id".into(), - schema_version: 1, - fields: vec![FieldMapping { - source: "collectionIds".into(), - target: "collectionIds".into(), - value_type: FieldValueType::IntegerArray, - fallback: None, - string_map: None, - doc_only: false, - filter_only: true, - ms_to_seconds: false, - truncate_u32: false, - case_sensitive: false, - default_value: None, - nullable: false, - }], - }; - - let json = serde_json::json!({ - "id": 42, - "collectionIds": [100, 200] - }); - - let filter_set: HashSet = ["collectionIds".to_string()].into(); - let sort_bits: HashMap = HashMap::new(); - let mut filter_maps: HashMap> = HashMap::new(); - filter_maps.insert("collectionIds".to_string(), HashMap::new()); - let mut sort_maps: HashMap> = HashMap::new(); - - extract_bitmaps(&json, &schema, &filter_set, &sort_bits, 42, &mut filter_maps, &mut sort_maps); - - let coll_map = filter_maps.get("collectionIds").unwrap(); - assert!(coll_map.get(&100).unwrap().contains(42), "slot 42 should be in bitmap for collectionId 100"); - assert!(coll_map.get(&200).unwrap().contains(42), "slot 42 should be in bitmap for collectionId 200"); - } - - #[test] - fn test_filter_only_and_doc_only_mutually_exclusive() { - let schema = DataSchema { - id_field: "id".into(), - schema_version: 1, - fields: vec![FieldMapping { - source: "x".into(), - target: "x".into(), - value_type: FieldValueType::Integer, - fallback: None, - string_map: None, - 
doc_only: true, - filter_only: true, - ms_to_seconds: false, - truncate_u32: false, - case_sensitive: false, - default_value: None, - nullable: false, - }], - }; - assert!(schema.validate().is_err(), "doc_only + filter_only should fail validation"); - } -} diff --git a/src/sync/mod.rs b/src/sync/mod.rs index db1d0d3..a0ab68a 100644 --- a/src/sync/mod.rs +++ b/src/sync/mod.rs @@ -5,19 +5,15 @@ pub mod bitdex_client; pub mod bulk_loader; pub mod config; -pub mod copy_queries; pub mod dump; pub mod dump_enrichment; pub mod dump_expression; pub mod dump_processor; pub mod ingester; -pub mod loader; pub mod metrics_poller; pub mod op_dedup; pub mod ops; pub mod ops_poller; pub mod trigger_gen; -pub mod progress; pub mod queries; -pub mod slot_arena; pub mod sync_config; diff --git a/src/sync/progress.rs b/src/sync/progress.rs deleted file mode 100644 index 5ddd8c0..0000000 --- a/src/sync/progress.rs +++ /dev/null @@ -1,113 +0,0 @@ -//! Shared load progress state and HTTP endpoint for monitoring bulk loads. - -use std::sync::atomic::{AtomicU8, AtomicU64, Ordering}; -use std::sync::Arc; -use std::time::Instant; - -use axum::extract::State; -use axum::routing::get; -use axum::{Json, Router}; -use serde_json::json; - -/// Shared progress state for bulk load monitoring. -/// All fields are atomic for concurrent access from multiple stream tasks. 
-pub struct LoadProgress { - /// Load phase: 0=setup, 1=streaming, 2=cleanup, 3=applying, 4=finalizing, 5=saving, 6=done - pub phase: AtomicU8, - /// Wall clock start time - start_time: Instant, - /// Per-stream row counters - pub image_rows: AtomicU64, - pub tag_rows: AtomicU64, - pub tool_rows: AtomicU64, - pub technique_rows: AtomicU64, - pub resource_rows: AtomicU64, - /// Number of streams that have completed (out of 5) - pub streams_done: AtomicU8, -} - -impl LoadProgress { - pub fn new() -> Self { - Self { - phase: AtomicU8::new(0), - start_time: Instant::now(), - image_rows: AtomicU64::new(0), - tag_rows: AtomicU64::new(0), - tool_rows: AtomicU64::new(0), - technique_rows: AtomicU64::new(0), - resource_rows: AtomicU64::new(0), - streams_done: AtomicU8::new(0), - } - } - - pub fn elapsed_secs(&self) -> f64 { - self.start_time.elapsed().as_secs_f64() - } - - pub fn set_phase(&self, phase: u8) { - self.phase.store(phase, Ordering::Release); - } -} - -async fn status_handler(State(progress): State>) -> Json { - let elapsed = progress.elapsed_secs(); - let phase = progress.phase.load(Ordering::Acquire); - let images = progress.image_rows.load(Ordering::Relaxed); - let tags = progress.tag_rows.load(Ordering::Relaxed); - let tools = progress.tool_rows.load(Ordering::Relaxed); - let techniques = progress.technique_rows.load(Ordering::Relaxed); - let resources = progress.resource_rows.load(Ordering::Relaxed); - let done = progress.streams_done.load(Ordering::Relaxed); - - let phase_name = match phase { - 0 => "setup", - 1 => "streaming", - 2 => "cleanup", - 3 => "applying", - 4 => "finalizing", - 5 => "saving", - 6 => "done", - _ => "unknown", - }; - - Json(json!({ - "phase": phase_name, - "elapsed_secs": (elapsed * 10.0).round() / 10.0, - "streams_done": done, - "streams": { - "images": { "rows": images, "rate": if elapsed > 0.0 { (images as f64 / elapsed).round() } else { 0.0 } }, - "tags": { "rows": tags, "rate": if elapsed > 0.0 { (tags as f64 / elapsed).round() 
} else { 0.0 } }, - "tools": { "rows": tools, "rate": if elapsed > 0.0 { (tools as f64 / elapsed).round() } else { 0.0 } }, - "techniques": { "rows": techniques, "rate": if elapsed > 0.0 { (techniques as f64 / elapsed).round() } else { 0.0 } }, - "resources": { "rows": resources, "rate": if elapsed > 0.0 { (resources as f64 / elapsed).round() } else { 0.0 } }, - } - })) -} - -/// Spawn the progress HTTP server in a background tokio task. -/// Returns the shutdown sender — send `()` to gracefully stop the server. -pub fn spawn_progress_server( - port: u16, - progress: Arc, -) -> tokio::sync::oneshot::Sender<()> { - let (tx, rx) = tokio::sync::oneshot::channel::<()>(); - - tokio::spawn(async move { - let app = Router::new() - .route("/status", get(status_handler)) - .with_state(progress); - - let addr = std::net::SocketAddr::from(([0, 0, 0, 0], port)); - let listener = tokio::net::TcpListener::bind(addr).await.unwrap(); - eprintln!("Progress server listening on {addr}"); - - axum::serve(listener, app) - .with_graceful_shutdown(async { - rx.await.ok(); - }) - .await - .ok(); - }); - - tx -} diff --git a/src/sync/queries.rs b/src/sync/queries.rs index 7e2b169..cebe73d 100644 --- a/src/sync/queries.rs +++ b/src/sync/queries.rs @@ -1,5 +1,4 @@ -use chrono::{DateTime, Utc}; -use sqlx::{FromRow, PgPool}; +use sqlx::PgPool; // --------------------------------------------------------------------------- // Setup SQL — creates BitdexOutbox table + all triggers @@ -307,101 +306,6 @@ pub async fn get_max_ops_id(pool: &PgPool) -> Result { Ok(row.0.unwrap_or(0)) } -// --------------------------------------------------------------------------- -// Row types -// --------------------------------------------------------------------------- - -#[derive(Debug, FromRow)] -pub struct ImageRow { - pub id: i64, - #[sqlx(rename = "postId")] - pub post_id: i64, - pub url: Option, - #[sqlx(rename = "nsfwLevel")] - pub nsfw_level: Option, - pub hash: Option, - #[sqlx(rename = "hideMeta")] - pub 
hide_meta: Option, - #[sqlx(rename = "type")] - pub image_type: Option, - #[sqlx(rename = "userId")] - pub user_id: Option, - pub minor: Option, - pub poi: Option, - #[sqlx(rename = "blockedFor")] - pub blocked_for: Option, - #[sqlx(rename = "scannedAt")] - pub scanned_at: Option>, - #[sqlx(rename = "createdAt")] - pub created_at: Option>, - pub meta: Option, - #[sqlx(rename = "publishedAt")] - pub published_at: Option>, - pub availability: Option, - #[sqlx(rename = "postedToId")] - pub posted_to_id: Option, - #[sqlx(rename = "sortAt")] - pub sort_at: Option>, - pub width: Option, - pub height: Option, -} - -#[derive(Debug, FromRow)] -pub struct TagRow { - #[sqlx(rename = "imageId")] - pub image_id: i32, - #[sqlx(rename = "tagId")] - pub tag_id: i32, -} - -#[derive(Debug, FromRow)] -pub struct ToolRow { - #[sqlx(rename = "imageId")] - pub image_id: i32, - #[sqlx(rename = "toolId")] - pub tool_id: i32, -} - -#[derive(Debug, FromRow)] -pub struct TechniqueRow { - #[sqlx(rename = "imageId")] - pub image_id: i32, - #[sqlx(rename = "techniqueId")] - pub technique_id: i32, -} - -#[derive(Debug, FromRow)] -pub struct ResourceRow { - #[sqlx(rename = "imageId")] - pub image_id: i32, - #[sqlx(rename = "baseModel")] - pub base_model: Option, - #[sqlx(rename = "modelVersionIds")] - pub model_version_ids: Vec, - #[sqlx(rename = "modelVersionIdsManual")] - pub model_version_ids_manual: Vec, - #[sqlx(rename = "resourcePoi")] - pub resource_poi: Option, -} - -#[derive(Debug, FromRow)] -pub struct OutboxRow { - pub id: i64, - pub entity_id: i64, - pub event: String, -} - -#[derive(Debug, FromRow)] -pub struct MetricRow { - pub id: i64, - #[sqlx(rename = "reactionCount")] - pub reaction_count: i64, - #[sqlx(rename = "commentCount")] - pub comment_count: i64, - #[sqlx(rename = "collectedCount")] - pub collected_count: i64, -} - // --------------------------------------------------------------------------- // Query functions // 
--------------------------------------------------------------------------- @@ -431,158 +335,6 @@ async fn check_triggers_exist(pool: &PgPool) -> Result { Ok(row.0 >= 9) } -/// Get the max image ID for range-based bulk loading. -pub async fn get_max_image_id(pool: &PgPool) -> Result { - let row: (i64,) = sqlx::query_as("SELECT COALESCE(MAX(id)::int8, 0) FROM \"Image\"") - .fetch_one(pool) - .await?; - Ok(row.0) -} - -/// Fetch images by ID range (for bulk loading). -pub async fn fetch_images_by_range( - pool: &PgPool, - start: i64, - end: i64, -) -> Result, sqlx::Error> { - sqlx::query_as::<_, ImageRow>( - r#"SELECT i.id::int8, i."postId"::int8, i.url, i."nsfwLevel", i.hash, - i."hideMeta", i.type::text, i."userId"::int8, - i.minor, i.poi, i."blockedFor", i."scannedAt"::timestamptz, i."createdAt"::timestamptz, - i.meta, - p."publishedAt"::timestamptz, p.availability::text, p."modelVersionId"::int8 as "postedToId", - GREATEST(p."publishedAt", i."scannedAt", i."createdAt")::timestamptz as "sortAt" - FROM "Image" i - JOIN "Post" p ON p.id = i."postId" - WHERE i.id >= $1 AND i.id < $2"#, - ) - .bind(start) - .bind(end) - .fetch_all(pool) - .await -} - -/// Fetch images by ID list (for sync/streaming). -pub async fn fetch_images_by_ids( - pool: &PgPool, - ids: &[i64], -) -> Result, sqlx::Error> { - sqlx::query_as::<_, ImageRow>( - r#"SELECT i.id::int8, i."postId"::int8, i.url, i."nsfwLevel", i.hash, - i."hideMeta", i.type::text, i."userId"::int8, - i.minor, i.poi, i."blockedFor", i."scannedAt"::timestamptz, i."createdAt"::timestamptz, - i.meta, - p."publishedAt"::timestamptz, p.availability::text, p."modelVersionId"::int8 as "postedToId", - GREATEST(p."publishedAt", i."scannedAt", i."createdAt")::timestamptz as "sortAt", - i.width, i.height - FROM "Image" i - JOIN "Post" p ON p.id = i."postId" - WHERE i.id = ANY($1)"#, - ) - .bind(ids) - .fetch_all(pool) - .await -} - -/// Fetch tags for a batch of image IDs. 
-pub async fn fetch_tags(pool: &PgPool, image_ids: &[i64]) -> Result, sqlx::Error> { - sqlx::query_as::<_, TagRow>( - r#"SELECT "imageId", "tagId" FROM "TagsOnImageDetails" - WHERE "imageId" = ANY($1) AND disabled = false"#, - ) - .bind(image_ids) - .fetch_all(pool) - .await -} - -/// Fetch tools for a batch of image IDs. -pub async fn fetch_tools(pool: &PgPool, image_ids: &[i64]) -> Result, sqlx::Error> { - sqlx::query_as::<_, ToolRow>( - r#"SELECT "imageId", "toolId" FROM "ImageTool" WHERE "imageId" = ANY($1)"#, - ) - .bind(image_ids) - .fetch_all(pool) - .await -} - -/// Fetch techniques for a batch of image IDs. -pub async fn fetch_techniques( - pool: &PgPool, - image_ids: &[i64], -) -> Result, sqlx::Error> { - sqlx::query_as::<_, TechniqueRow>( - r#"SELECT "imageId", "techniqueId" FROM "ImageTechnique" WHERE "imageId" = ANY($1)"#, - ) - .bind(image_ids) - .fetch_all(pool) - .await -} - -/// Fetch resources + model versions for a batch of image IDs. -pub async fn fetch_resources( - pool: &PgPool, - image_ids: &[i64], -) -> Result, sqlx::Error> { - sqlx::query_as::<_, ResourceRow>( - r#"SELECT ir."imageId", - string_agg(CASE WHEN m.type = 'Checkpoint' THEN mv."baseModel" ELSE NULL END, '') as "baseModel", - coalesce(array_agg(mv.id::int8) FILTER (WHERE ir.detected), '{}') as "modelVersionIds", - coalesce(array_agg(mv.id::int8) FILTER (WHERE NOT ir.detected), '{}') as "modelVersionIdsManual", - bool_or(m.poi) as "resourcePoi" - FROM "ImageResourceNew" ir - JOIN "ModelVersion" mv ON ir."modelVersionId" = mv.id - JOIN "Model" m ON mv."modelId" = m.id - WHERE ir."imageId" = ANY($1) - GROUP BY ir."imageId""#, - ) - .bind(image_ids) - .fetch_all(pool) - .await -} - -/// Row type for CollectionItem enrichment. -#[derive(Debug, FromRow)] -pub struct CollectionItemRow { - #[sqlx(rename = "imageId")] - pub image_id: i64, - #[sqlx(rename = "collectionId")] - pub collection_id: i64, -} - -/// Fetch accepted collection memberships for a batch of image IDs. 
-pub async fn fetch_collections( - pool: &PgPool, - image_ids: &[i64], -) -> Result, sqlx::Error> { - sqlx::query_as::<_, CollectionItemRow>( - r#"SELECT "imageId"::int8, "collectionId"::int8 FROM "CollectionItem" - WHERE "imageId" = ANY($1) AND status = 'ACCEPTED'"#, - ) - .bind(image_ids) - .fetch_all(pool) - .await -} - -// V1 poll_outbox and delete_outbox removed — V2 uses ops_poller with BitdexOps table. - -/// Poll outbox rows after a cursor position (FIFO — oldest first). -pub async fn poll_outbox_from_cursor( - pool: &PgPool, - cursor: i64, - limit: i64, -) -> Result, sqlx::Error> { - sqlx::query_as::<_, OutboxRow>( - r#"SELECT id, entity_id, event FROM "BitdexOutbox" - WHERE id > $1 - ORDER BY id ASC - LIMIT $2"#, - ) - .bind(cursor) - .bind(limit) - .fetch_all(pool) - .await -} - /// Report a replica's cursor to PG for outbox cleanup tracking. pub async fn upsert_cursor( pool: &PgPool, @@ -611,177 +363,3 @@ pub async fn get_max_outbox_id(pool: &PgPool) -> Result { .await?; Ok(row.0.unwrap_or(0)) } - -// --------------------------------------------------------------------------- -// Streaming bulk queries — table-at-a-time loading -// --------------------------------------------------------------------------- - -/// Row type for streaming tags ordered by tagId (for bitmap-efficient insertion). -#[derive(Debug, FromRow)] -pub struct StreamTagRow { - #[sqlx(rename = "tagId")] - pub tag_id: i64, - #[sqlx(rename = "imageId")] - pub image_id: i64, -} - -/// Row type for streaming resources (one row per imageId, pre-aggregated). -#[derive(Debug, FromRow)] -pub struct StreamResourceRow { - #[sqlx(rename = "imageId")] - pub image_id: i64, - #[sqlx(rename = "baseModel")] - pub base_model: Option, - #[sqlx(rename = "modelVersionIds")] - pub model_version_ids: Vec, - #[sqlx(rename = "modelVersionIdsManual")] - pub model_version_ids_manual: Vec, - #[sqlx(rename = "resourcePoi")] - pub resource_poi: Option, -} - -/// Get max tag ID for range iteration. 
-pub async fn get_max_tag_id(pool: &PgPool) -> Result { - let row: (i64,) = sqlx::query_as( - r#"SELECT COALESCE(MAX("tagId")::int8, 0) FROM "TagsOnImageDetails""#, - ) - .fetch_one(pool) - .await?; - Ok(row.0) -} - -/// Fetch tags by tagId range, ordered by tagId then imageId. -/// This produces bitmap-optimal ordering: all images for one tagId together. -pub async fn fetch_tags_by_tag_range( - pool: &PgPool, - start: i64, - end: i64, -) -> Result, sqlx::Error> { - sqlx::query_as::<_, StreamTagRow>( - r#"SELECT "tagId", "imageId" FROM "TagsOnImageDetails" - WHERE "tagId" >= $1 AND "tagId" < $2 - AND disabled = false - ORDER BY "tagId", "imageId""#, - ) - .bind(start) - .bind(end) - .fetch_all(pool) - .await -} - -/// Get max tool ID for range iteration. -pub async fn get_max_tool_id(pool: &PgPool) -> Result { - let row: (i64,) = sqlx::query_as( - r#"SELECT COALESCE(MAX("toolId")::int8, 0) FROM "ImageTool""#, - ) - .fetch_one(pool) - .await?; - Ok(row.0) -} - -/// Fetch tools by toolId range, ordered by toolId then imageId. -pub async fn fetch_tools_by_tool_range( - pool: &PgPool, - start: i64, - end: i64, -) -> Result, sqlx::Error> { - sqlx::query_as::<_, ToolRow>( - r#"SELECT "imageId", "toolId" FROM "ImageTool" - WHERE "toolId" >= $1 AND "toolId" < $2 - ORDER BY "toolId", "imageId""#, - ) - .bind(start) - .bind(end) - .fetch_all(pool) - .await -} - -/// Get max technique ID for range iteration. -pub async fn get_max_technique_id(pool: &PgPool) -> Result { - let row: (i64,) = sqlx::query_as( - r#"SELECT COALESCE(MAX("techniqueId")::int8, 0) FROM "ImageTechnique""#, - ) - .fetch_one(pool) - .await?; - Ok(row.0) -} - -/// Fetch techniques by techniqueId range, ordered by techniqueId then imageId. 
-pub async fn fetch_techniques_by_technique_range( - pool: &PgPool, - start: i64, - end: i64, -) -> Result, sqlx::Error> { - sqlx::query_as::<_, TechniqueRow>( - r#"SELECT "imageId", "techniqueId" FROM "ImageTechnique" - WHERE "techniqueId" >= $1 AND "techniqueId" < $2 - ORDER BY "techniqueId", "imageId""#, - ) - .bind(start) - .bind(end) - .fetch_all(pool) - .await -} - -/// Fetch resources by imageId range (pre-aggregated per imageId). -pub async fn fetch_resources_by_range( - pool: &PgPool, - start: i64, - end: i64, -) -> Result, sqlx::Error> { - sqlx::query_as::<_, StreamResourceRow>( - r#"SELECT ir."imageId", - string_agg(CASE WHEN m.type = 'Checkpoint' THEN mv."baseModel" ELSE NULL END, '') as "baseModel", - coalesce(array_agg(mv.id) FILTER (WHERE ir.detected), '{}') as "modelVersionIds", - coalesce(array_agg(mv.id) FILTER (WHERE NOT ir.detected), '{}') as "modelVersionIdsManual", - bool_or(m.poi) as "resourcePoi" - FROM "ImageResourceNew" ir - JOIN "ModelVersion" mv ON ir."modelVersionId" = mv.id - JOIN "Model" m ON mv."modelId" = m.id - WHERE ir."imageId" >= $1 AND ir."imageId" < $2 - GROUP BY ir."imageId""#, - ) - .bind(start) - .bind(end) - .fetch_all(pool) - .await -} - -/// Row type for streaming collection items ordered by collectionId. -#[derive(Debug, FromRow)] -pub struct StreamCollectionRow { - #[sqlx(rename = "collectionId")] - pub collection_id: i64, - #[sqlx(rename = "imageId")] - pub image_id: i64, -} - -/// Get max collection ID for range iteration. -pub async fn get_max_collection_id(pool: &PgPool) -> Result { - let row: (i32,) = sqlx::query_as( - r#"SELECT COALESCE(MAX("collectionId"), 0) FROM "CollectionItem" WHERE "imageId" IS NOT NULL"#, - ) - .fetch_one(pool) - .await?; - Ok(row.0) -} - -/// Fetch collection items by collectionId range, ordered by collectionId then imageId. -/// Filters on imageId IS NOT NULL (image collections only) and status = 'ACCEPTED'. 
-pub async fn fetch_collections_by_range( - pool: &PgPool, - start: i32, - end: i32, -) -> Result, sqlx::Error> { - sqlx::query_as::<_, StreamCollectionRow>( - r#"SELECT "collectionId", "imageId" FROM "CollectionItem" - WHERE "collectionId" >= $1 AND "collectionId" < $2 - AND "imageId" IS NOT NULL - AND status = 'ACCEPTED' - ORDER BY "collectionId", "imageId""#, - ) - .bind(start) - .bind(end) - .fetch_all(pool) - .await -} diff --git a/src/sync/slot_arena.rs b/src/sync/slot_arena.rs deleted file mode 100644 index 2ee48ec..0000000 --- a/src/sync/slot_arena.rs +++ /dev/null @@ -1,1016 +0,0 @@ -//! Pre-allocated memory-mapped slot arena for bulk loading. -//! -//! Each document gets a fixed 512-byte slot in a memory-mapped file. -//! Multiple table streams write to different field offsets concurrently. -//! After all streams complete, a finalization pass reads populated slots, -//! serializes to msgpack, and writes to the docstore. -//! -//! Slot layout (512 bytes): -//! ```text -//! Offset Size Field -//! ------ ---- ----- -//! 0 8 present_mask (AtomicU64 LE) -//! 8 8 image_id (u64 LE) -//! 16 1 nsfw_level (u8) -//! 17 8 user_id (u64 LE) -//! 25 1 image_type_enum (u8: 0=image, 1=video, 2=audio) -//! 26 8 sort_at (u64 LE, unix seconds) -//! 34 1 poi (u8 bool) -//! 35 1 minor (u8 bool) -//! 36 80 url ([u8; 80], first byte = length, rest = UTF-8) -//! 116 40 hash ([u8; 40], first byte = length) -//! 156 1 tag_count (u8, max 48 inline) -//! 157 192 tag_ids ([u32; 48] LE) -//! 349 1 mv_count (u8, max 8 inline) -//! 350 32 model_version_ids ([u32; 8] LE) -//! 382 1 tool_count (u8, max 8) -//! 383 32 tool_ids ([u32; 8] LE) -//! 415 1 technique_count (u8, max 4) -//! 416 16 technique_ids ([u32; 4] LE) -//! 432 1 has_meta (u8 bool) -//! 433 1 on_site (u8 bool) -//! 434 8 post_id (u64 LE) -//! 442 8 posted_to_id (u64 LE) -//! 450 1 availability_enum (u8: 0=Public, 1=Private, 2=Unsearchable) -//! 451 1 blocked_for_enum (u8: 0=none, 1=CSAM, 2=TOS, ...) -//! 
452 4 reaction_count (u32 LE) -//! 456 4 comment_count (u32 LE) -//! 460 4 collected_count (u32 LE) -//! 464 1 mv_manual_count (u8, max 8) -//! 465 32 model_version_ids_manual ([u32; 8] LE) -//! 497 1 base_model_enum (u8) -//! 498 1 resource_poi (u8 bool, OR'd with image poi) -//! 499 8 published_at_unix (u64 LE, milliseconds) -//! 507 5 _padding -//! --- --- -//! 512 total -//! ``` - -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicU64, Ordering}; -use std::sync::Mutex; - -use memmap2::MmapMut; -use roaring::RoaringBitmap; - -use crate::config::DataSchema; -use crate::error::Result; - -// --------------------------------------------------------------------------- -// Constants — slot layout -// --------------------------------------------------------------------------- - -pub const SLOT_SIZE: usize = 512; - -// Field offsets -pub const OFF_PRESENT: usize = 0; -pub const OFF_IMAGE_ID: usize = 8; -pub const OFF_NSFW: usize = 16; -pub const OFF_USER_ID: usize = 17; -pub const OFF_IMAGE_TYPE: usize = 25; -pub const OFF_SORT_AT: usize = 26; -pub const OFF_POI: usize = 34; -pub const OFF_MINOR: usize = 35; -pub const OFF_URL: usize = 36; -pub const OFF_HASH: usize = 116; -pub const OFF_TAG_COUNT: usize = 156; -pub const OFF_TAG_IDS: usize = 157; -pub const OFF_MV_COUNT: usize = 349; -pub const OFF_MV_IDS: usize = 350; -pub const OFF_TOOL_COUNT: usize = 382; -pub const OFF_TOOL_IDS: usize = 383; -pub const OFF_TECH_COUNT: usize = 415; -pub const OFF_TECH_IDS: usize = 416; -pub const OFF_HAS_META: usize = 432; -pub const OFF_ON_SITE: usize = 433; -pub const OFF_POST_ID: usize = 434; -pub const OFF_POSTED_TO_ID: usize = 442; -pub const OFF_AVAILABILITY: usize = 450; -pub const OFF_BLOCKED_FOR: usize = 451; -pub const OFF_REACTION_COUNT: usize = 452; -pub const OFF_COMMENT_COUNT: usize = 456; -pub const OFF_COLLECTED_COUNT: usize = 460; -pub const OFF_MV_MANUAL_COUNT: usize = 464; -pub const OFF_MV_MANUAL_IDS: usize = 465; -pub const OFF_BASE_MODEL: usize = 
497; -pub const OFF_RESOURCE_POI: usize = 498; -pub const OFF_PUBLISHED_AT: usize = 499; - -// Inline capacity limits -pub const MAX_INLINE_TAGS: usize = 48; -pub const MAX_INLINE_MVS: usize = 8; -pub const MAX_INLINE_TOOLS: usize = 8; -pub const MAX_INLINE_TECHNIQUES: usize = 4; -pub const MAX_URL_LEN: usize = 79; // first byte = length -pub const MAX_HASH_LEN: usize = 39; - -// Present mask bits -pub const MASK_IMAGE_ID: u64 = 1 << 0; -pub const MASK_NSFW: u64 = 1 << 1; -pub const MASK_USER_ID: u64 = 1 << 2; -pub const MASK_IMAGE_TYPE: u64 = 1 << 3; -pub const MASK_SORT_AT: u64 = 1 << 4; -pub const MASK_POI: u64 = 1 << 5; -pub const MASK_MINOR: u64 = 1 << 6; -pub const MASK_URL: u64 = 1 << 7; -pub const MASK_HASH: u64 = 1 << 8; -pub const MASK_TAGS: u64 = 1 << 9; -pub const MASK_MV: u64 = 1 << 10; -pub const MASK_TOOLS: u64 = 1 << 11; -pub const MASK_TECHNIQUES: u64 = 1 << 12; -pub const MASK_HAS_META: u64 = 1 << 13; -pub const MASK_ON_SITE: u64 = 1 << 14; -pub const MASK_POST_ID: u64 = 1 << 15; -pub const MASK_POSTED_TO_ID: u64 = 1 << 16; -pub const MASK_AVAILABILITY: u64 = 1 << 17; -pub const MASK_BLOCKED_FOR: u64 = 1 << 18; -pub const MASK_METRICS: u64 = 1 << 19; -pub const MASK_MV_MANUAL: u64 = 1 << 20; -pub const MASK_BASE_MODEL: u64 = 1 << 21; -pub const MASK_RESOURCE_POI: u64 = 1 << 22; -pub const MASK_PUBLISHED_AT: u64 = 1 << 23; - -// --------------------------------------------------------------------------- -// Overflow for fields that exceed inline capacity -// --------------------------------------------------------------------------- - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum OverflowField { - Tags, - ModelVersionIds, - ModelVersionIdsManual, - Tools, - Techniques, -} - -#[derive(Debug)] -pub(crate) struct OverflowEntry { - pub(crate) slot: u32, - pub(crate) field: OverflowField, - pub(crate) data: Vec, -} - -// --------------------------------------------------------------------------- -// SlotData — assembled from slot + overflow 
for finalization -// --------------------------------------------------------------------------- - -/// Complete data read from a slot, including overflow. -#[derive(Debug)] -pub struct SlotData { - pub image_id: u64, - pub nsfw_level: u8, - pub user_id: u64, - pub image_type: u8, - pub sort_at: u64, - pub poi: bool, - pub minor: bool, - pub url: Option, - pub hash: Option, - pub tag_ids: Vec, - pub model_version_ids: Vec, - pub model_version_ids_manual: Vec, - pub tool_ids: Vec, - pub technique_ids: Vec, - pub has_meta: bool, - pub on_site: bool, - pub post_id: u64, - pub posted_to_id: u64, - pub availability: u8, - pub blocked_for: u8, - pub reaction_count: u32, - pub comment_count: u32, - pub collected_count: u32, - pub base_model: u8, - pub resource_poi: bool, - pub published_at_unix: u64, -} - -// --------------------------------------------------------------------------- -// SlotArena -// --------------------------------------------------------------------------- - -/// Memory-mapped slot arena for bulk loading. -/// -/// Pre-allocates a file with `max_slot * 512` bytes. Each table stream writes -/// to different field offsets concurrently using atomic present_mask updates. -/// After all streams finish, `finalize_to_docstore` reads populated slots and -/// writes compressed docstore shards. -pub struct SlotArena { - mmap: MmapMut, - overflow: Mutex>, - max_slot: u32, - arena_path: PathBuf, - _file: std::fs::File, // keep file handle alive for the mmap -} - -impl SlotArena { - /// Create a new slot arena backed by a memory-mapped file. - /// - /// The file is pre-allocated to `(max_slot + 1) * SLOT_SIZE` bytes and - /// zero-initialized. 
- pub fn new(max_slot: u32, path: &Path) -> Result { - let file_size = (max_slot as u64 + 1) * SLOT_SIZE as u64; - let file = std::fs::OpenOptions::new() - .read(true) - .write(true) - .create(true) - .truncate(true) - .open(path) - .map_err(|e| crate::error::BitdexError::Storage( - format!("SlotArena: create file {}: {e}", path.display()), - ))?; - file.set_len(file_size).map_err(|e| { - crate::error::BitdexError::Storage(format!("SlotArena: set_len {file_size}: {e}")) - })?; - - let mmap = unsafe { - MmapMut::map_mut(&file).map_err(|e| { - crate::error::BitdexError::Storage(format!("SlotArena: mmap: {e}")) - })? - }; - // Random hint: write phase has each rayon thread writing to arbitrary slot - // offsets determined by document ID — access pattern is uniformly scattered. - #[cfg(unix)] let _ = mmap.advise(memmap2::Advice::Random); - - eprintln!( - "SlotArena: allocated {} MB for {} slots at {}", - file_size / (1024 * 1024), - max_slot + 1, - path.display() - ); - - Ok(SlotArena { - mmap, - overflow: Mutex::new(Vec::new()), - max_slot, - arena_path: path.to_path_buf(), - _file: file, - }) - } - - /// Report memory usage: (arena_bytes, overflow_bytes). - pub fn memory_usage(&self) -> (usize, usize) { - let arena = (self.max_slot as usize + 1) * SLOT_SIZE; - let overflow_entries = self.overflow.lock().unwrap(); - let overflow: usize = overflow_entries - .iter() - .map(|e| e.data.len() * 4 + std::mem::size_of::()) - .sum(); - (arena, overflow) - } - - // ---- Atomic present_mask helpers ---- - - /// Atomically OR bits into the present_mask for a slot. - /// Safe for concurrent access from multiple table streams. - fn or_present_mask(&self, slot: u32, bits: u64) { - let base = slot as usize * SLOT_SIZE + OFF_PRESENT; - // SAFETY: We're treating the 8 bytes at the present_mask offset as an AtomicU64. - // The mmap is aligned to page boundaries (4KB), and each slot is 512 bytes, - // so the 8-byte present_mask is always naturally aligned. 
- unsafe { - let ptr = self.mmap.as_ptr().add(base) as *const AtomicU64; - (*ptr).fetch_or(bits, Ordering::Release); - } - } - - // ---- Low-level write helpers ---- - - #[inline] - fn slot_base(&self, slot: u32) -> usize { - slot as usize * SLOT_SIZE - } - - #[inline] - fn write_u64(&self, offset: usize, val: u64) { - // SAFETY: We need &self (not &mut self) for concurrent access. - // Different streams write to non-overlapping field offsets within each slot. - unsafe { - let ptr = self.mmap.as_ptr().add(offset) as *mut u8; - std::ptr::copy_nonoverlapping(val.to_le_bytes().as_ptr(), ptr, 8); - } - } - - #[inline] - fn write_u32(&self, offset: usize, val: u32) { - unsafe { - let ptr = self.mmap.as_ptr().add(offset) as *mut u8; - std::ptr::copy_nonoverlapping(val.to_le_bytes().as_ptr(), ptr, 4); - } - } - - #[inline] - fn write_u8(&self, offset: usize, val: u8) { - unsafe { - let ptr = self.mmap.as_ptr().add(offset) as *mut u8; - *ptr = val; - } - } - - #[inline] - fn write_inline_str(&self, offset: usize, max_len: usize, s: &[u8]) { - let len = s.len().min(max_len); - self.write_u8(offset, len as u8); - if len > 0 { - unsafe { - let ptr = self.mmap.as_ptr().add(offset + 1) as *mut u8; - std::ptr::copy_nonoverlapping(s.as_ptr(), ptr, len); - } - } - } - - // ---- Read helpers (for finalization) ---- - - #[inline] - fn read_u64(&self, offset: usize) -> u64 { - let bytes: [u8; 8] = self.mmap[offset..offset + 8].try_into().unwrap(); - u64::from_le_bytes(bytes) - } - - #[inline] - fn read_u32(&self, offset: usize) -> u32 { - let bytes: [u8; 4] = self.mmap[offset..offset + 4].try_into().unwrap(); - u32::from_le_bytes(bytes) - } - - #[inline] - fn read_u8(&self, offset: usize) -> u8 { - self.mmap[offset] - } - - fn read_inline_str(&self, offset: usize) -> Option { - let len = self.read_u8(offset) as usize; - if len == 0 { - return None; - } - let bytes = &self.mmap[offset + 1..offset + 1 + len]; - Some(String::from_utf8_lossy(bytes).into_owned()) - } - - // ---- Public 
write methods (called by table streams) ---- - - /// Write Image+Post scalar fields to a slot. - /// - /// Called by the image stream. Sets all scalar fields and their present bits. - pub fn write_scalars( - &self, - slot: u32, - image_id: u64, - nsfw_level: u8, - user_id: u64, - image_type: u8, - sort_at: u64, - poi: bool, - minor: bool, - url: Option<&[u8]>, - hash: Option<&[u8]>, - has_meta: bool, - on_site: bool, - post_id: u64, - posted_to_id: u64, - availability: u8, - blocked_for: u8, - published_at_unix: u64, - ) { - let base = self.slot_base(slot); - - self.write_u64(base + OFF_IMAGE_ID, image_id); - self.write_u8(base + OFF_NSFW, nsfw_level); - self.write_u64(base + OFF_USER_ID, user_id); - self.write_u8(base + OFF_IMAGE_TYPE, image_type); - self.write_u64(base + OFF_SORT_AT, sort_at); - self.write_u8(base + OFF_POI, poi as u8); - self.write_u8(base + OFF_MINOR, minor as u8); - - let mut mask = MASK_IMAGE_ID | MASK_NSFW | MASK_USER_ID | MASK_IMAGE_TYPE - | MASK_SORT_AT | MASK_POI | MASK_MINOR | MASK_POST_ID | MASK_POSTED_TO_ID - | MASK_AVAILABILITY | MASK_PUBLISHED_AT; - - if let Some(url_bytes) = url { - self.write_inline_str(base + OFF_URL, MAX_URL_LEN, url_bytes); - mask |= MASK_URL; - } - if let Some(hash_bytes) = hash { - self.write_inline_str(base + OFF_HASH, MAX_HASH_LEN, hash_bytes); - mask |= MASK_HASH; - } - - self.write_u8(base + OFF_HAS_META, has_meta as u8); - self.write_u8(base + OFF_ON_SITE, on_site as u8); - self.write_u64(base + OFF_POST_ID, post_id); - self.write_u64(base + OFF_POSTED_TO_ID, posted_to_id); - self.write_u8(base + OFF_AVAILABILITY, availability); - self.write_u8(base + OFF_BLOCKED_FOR, blocked_for); - self.write_u64(base + OFF_PUBLISHED_AT, published_at_unix); - - if has_meta { mask |= MASK_HAS_META; } - if on_site { mask |= MASK_ON_SITE; } - if blocked_for > 0 { mask |= MASK_BLOCKED_FOR; } - - // Metrics default to 0 — set mask so finalization knows they're present - self.write_u32(base + OFF_REACTION_COUNT, 0); - 
self.write_u32(base + OFF_COMMENT_COUNT, 0); - self.write_u32(base + OFF_COLLECTED_COUNT, 0); - mask |= MASK_METRICS; - - self.or_present_mask(slot, mask); - } - - /// Append tag IDs to a slot. Inline up to 48, overflow the rest. - /// - /// Called by the tag stream. Tags arrive ordered by tagId, so the same - /// slot may be written to multiple times as different tagIds are processed. - /// Each call appends to the existing tag list in the slot. - pub fn write_tags(&self, slot: u32, tag_ids: &[u32]) { - if tag_ids.is_empty() { - return; - } - let base = self.slot_base(slot); - - // Read current count atomically-enough (single byte, no torn reads) - let current_count = self.read_u8(base + OFF_TAG_COUNT) as usize; - let remaining_inline = MAX_INLINE_TAGS.saturating_sub(current_count); - - // Write as many as fit inline - let inline_count = tag_ids.len().min(remaining_inline); - for (i, &tag_id) in tag_ids[..inline_count].iter().enumerate() { - self.write_u32(base + OFF_TAG_IDS + (current_count + i) * 4, tag_id); - } - self.write_u8(base + OFF_TAG_COUNT, (current_count + inline_count) as u8); - - // Overflow the rest - if inline_count < tag_ids.len() { - let overflow_data: Vec = tag_ids[inline_count..].to_vec(); - self.overflow.lock().unwrap().push(OverflowEntry { - slot, - field: OverflowField::Tags, - data: overflow_data, - }); - } - - self.or_present_mask(slot, MASK_TAGS); - } - - /// Write model version IDs to a slot. Inline up to 8, overflow the rest. 
- pub fn write_model_version_ids(&self, slot: u32, mv_ids: &[u32]) { - if mv_ids.is_empty() { - return; - } - let base = self.slot_base(slot); - let current = self.read_u8(base + OFF_MV_COUNT) as usize; - let remaining = MAX_INLINE_MVS.saturating_sub(current); - let inline_n = mv_ids.len().min(remaining); - - for (i, &id) in mv_ids[..inline_n].iter().enumerate() { - self.write_u32(base + OFF_MV_IDS + (current + i) * 4, id); - } - self.write_u8(base + OFF_MV_COUNT, (current + inline_n) as u8); - - if inline_n < mv_ids.len() { - self.overflow.lock().unwrap().push(OverflowEntry { - slot, - field: OverflowField::ModelVersionIds, - data: mv_ids[inline_n..].to_vec(), - }); - } - - self.or_present_mask(slot, MASK_MV); - } - - /// Write manual model version IDs to a slot. - pub fn write_model_version_ids_manual(&self, slot: u32, mv_ids: &[u32]) { - if mv_ids.is_empty() { - return; - } - let base = self.slot_base(slot); - let current = self.read_u8(base + OFF_MV_MANUAL_COUNT) as usize; - let remaining = MAX_INLINE_MVS.saturating_sub(current); - let inline_n = mv_ids.len().min(remaining); - - for (i, &id) in mv_ids[..inline_n].iter().enumerate() { - self.write_u32(base + OFF_MV_MANUAL_IDS + (current + i) * 4, id); - } - self.write_u8(base + OFF_MV_MANUAL_COUNT, (current + inline_n) as u8); - - if inline_n < mv_ids.len() { - self.overflow.lock().unwrap().push(OverflowEntry { - slot, - field: OverflowField::ModelVersionIdsManual, - data: mv_ids[inline_n..].to_vec(), - }); - } - - self.or_present_mask(slot, MASK_MV_MANUAL); - } - - /// Write tool IDs to a slot. 
- pub fn write_tools(&self, slot: u32, tool_ids: &[u32]) { - if tool_ids.is_empty() { - return; - } - let base = self.slot_base(slot); - let current = self.read_u8(base + OFF_TOOL_COUNT) as usize; - let remaining = MAX_INLINE_TOOLS.saturating_sub(current); - let inline_n = tool_ids.len().min(remaining); - - for (i, &id) in tool_ids[..inline_n].iter().enumerate() { - self.write_u32(base + OFF_TOOL_IDS + (current + i) * 4, id); - } - self.write_u8(base + OFF_TOOL_COUNT, (current + inline_n) as u8); - - if inline_n < tool_ids.len() { - self.overflow.lock().unwrap().push(OverflowEntry { - slot, - field: OverflowField::Tools, - data: tool_ids[inline_n..].to_vec(), - }); - } - - self.or_present_mask(slot, MASK_TOOLS); - } - - /// Write technique IDs to a slot. - pub fn write_techniques(&self, slot: u32, technique_ids: &[u32]) { - if technique_ids.is_empty() { - return; - } - let base = self.slot_base(slot); - let current = self.read_u8(base + OFF_TECH_COUNT) as usize; - let remaining = MAX_INLINE_TECHNIQUES.saturating_sub(current); - let inline_n = technique_ids.len().min(remaining); - - for (i, &id) in technique_ids[..inline_n].iter().enumerate() { - self.write_u32(base + OFF_TECH_IDS + (current + i) * 4, id); - } - self.write_u8(base + OFF_TECH_COUNT, (current + inline_n) as u8); - - if inline_n < technique_ids.len() { - self.overflow.lock().unwrap().push(OverflowEntry { - slot, - field: OverflowField::Techniques, - data: technique_ids[inline_n..].to_vec(), - }); - } - - self.or_present_mask(slot, MASK_TECHNIQUES); - } - - /// Write base model enum to a slot. - pub fn write_base_model(&self, slot: u32, base_model: u8) { - let base = self.slot_base(slot); - self.write_u8(base + OFF_BASE_MODEL, base_model); - self.or_present_mask(slot, MASK_BASE_MODEL); - } - - /// OR resource_poi into the slot's poi field. - /// - /// Image stream sets poi from Image.poi. - /// Resource stream OR's in resource_poi if true (idempotent). 
- pub fn set_resource_poi(&self, slot: u32) { - let base = self.slot_base(slot); - self.write_u8(base + OFF_RESOURCE_POI, 1); - self.or_present_mask(slot, MASK_RESOURCE_POI); - } - - // ---- Read methods (for finalization) ---- - - /// Read a complete slot, merging any overflow data. - /// - /// Returns `None` if the slot has no present bits set (never written). - pub(crate) fn read_slot(&self, slot: u32, overflow_map: &std::collections::HashMap>) -> Option { - let base = self.slot_base(slot); - let mask = self.read_u64(base + OFF_PRESENT); - - if mask == 0 { - return None; - } - - // Read inline tag IDs - let tag_count = self.read_u8(base + OFF_TAG_COUNT) as usize; - let mut tag_ids: Vec = (0..tag_count) - .map(|i| self.read_u32(base + OFF_TAG_IDS + i * 4)) - .collect(); - - // Read inline MV IDs - let mv_count = self.read_u8(base + OFF_MV_COUNT) as usize; - let mut model_version_ids: Vec = (0..mv_count) - .map(|i| self.read_u32(base + OFF_MV_IDS + i * 4)) - .collect(); - - // Read inline manual MV IDs - let mv_manual_count = self.read_u8(base + OFF_MV_MANUAL_COUNT) as usize; - let mut model_version_ids_manual: Vec = (0..mv_manual_count) - .map(|i| self.read_u32(base + OFF_MV_MANUAL_IDS + i * 4)) - .collect(); - - // Read inline tool IDs - let tool_count = self.read_u8(base + OFF_TOOL_COUNT) as usize; - let mut tool_ids: Vec = (0..tool_count) - .map(|i| self.read_u32(base + OFF_TOOL_IDS + i * 4)) - .collect(); - - // Read inline technique IDs - let tech_count = self.read_u8(base + OFF_TECH_COUNT) as usize; - let mut technique_ids: Vec = (0..tech_count) - .map(|i| self.read_u32(base + OFF_TECH_IDS + i * 4)) - .collect(); - - // Merge overflow - if let Some(entries) = overflow_map.get(&slot) { - for entry in entries { - match entry.field { - OverflowField::Tags => tag_ids.extend_from_slice(&entry.data), - OverflowField::ModelVersionIds => model_version_ids.extend_from_slice(&entry.data), - OverflowField::ModelVersionIdsManual => 
model_version_ids_manual.extend_from_slice(&entry.data), - OverflowField::Tools => tool_ids.extend_from_slice(&entry.data), - OverflowField::Techniques => technique_ids.extend_from_slice(&entry.data), - } - } - } - - // Resolve poi: image poi OR resource_poi - let image_poi = self.read_u8(base + OFF_POI) != 0; - let resource_poi = self.read_u8(base + OFF_RESOURCE_POI) != 0; - - Some(SlotData { - image_id: self.read_u64(base + OFF_IMAGE_ID), - nsfw_level: self.read_u8(base + OFF_NSFW), - user_id: self.read_u64(base + OFF_USER_ID), - image_type: self.read_u8(base + OFF_IMAGE_TYPE), - sort_at: self.read_u64(base + OFF_SORT_AT), - poi: image_poi || resource_poi, - minor: self.read_u8(base + OFF_MINOR) != 0, - url: self.read_inline_str(base + OFF_URL), - hash: self.read_inline_str(base + OFF_HASH), - tag_ids, - model_version_ids, - model_version_ids_manual, - tool_ids, - technique_ids, - has_meta: self.read_u8(base + OFF_HAS_META) != 0, - on_site: self.read_u8(base + OFF_ON_SITE) != 0, - post_id: self.read_u64(base + OFF_POST_ID), - posted_to_id: self.read_u64(base + OFF_POSTED_TO_ID), - availability: self.read_u8(base + OFF_AVAILABILITY), - blocked_for: self.read_u8(base + OFF_BLOCKED_FOR), - reaction_count: self.read_u32(base + OFF_REACTION_COUNT), - comment_count: self.read_u32(base + OFF_COMMENT_COUNT), - collected_count: self.read_u32(base + OFF_COLLECTED_COUNT), - base_model: self.read_u8(base + OFF_BASE_MODEL), - resource_poi, - published_at_unix: self.read_u64(base + OFF_PUBLISHED_AT), - }) - } - - /// Finalize all populated slots to the docstore. 
- /// TODO: Rewrite for DataSilo ParallelWriter - pub fn finalize_to_docstore( - &self, - _schema: &DataSchema, - _alive: &RoaringBitmap, - ) -> Result<(u64, u64)> { - // TODO(madvise): when implemented, switch hint to Sequential before the - // 0..max_slot scan: `let _ = self.mmap.advise(memmap2::Advice::Sequential);` - Err(crate::error::BitdexError::Storage( - "finalize_to_docstore: not yet ported to DataSilo".to_string() - )) - } - - /// Clean up the arena file. - pub fn cleanup(self) -> std::io::Result<()> { - // DONTNEED before drop: immediately reclaims RSS on Linux (up to ~54 GB at - // 107M slots) before the OS-level munmap completes. - #[cfg(target_os = "linux")] - let _ = unsafe { self.mmap.unchecked_advise(memmap2::UncheckedAdvice::DontNeed) }; - // (On non-Linux Unix, the drop/munmap itself frees pages promptly enough.) - drop(self.mmap); - drop(self._file); - std::fs::remove_file(&self.arena_path) - } -} - -// --------------------------------------------------------------------------- -// Enum encoding helpers -// --------------------------------------------------------------------------- - -/// Encode image type string to enum byte. -pub fn encode_image_type(s: Option<&str>) -> u8 { - match s { - Some("video") => 1, - Some("audio") => 2, - _ => 0, // "image" or unknown - } -} - -/// Decode image type enum byte to string. -pub fn decode_image_type(v: u8) -> &'static str { - match v { - 1 => "video", - 2 => "audio", - _ => "image", - } -} - -/// Encode availability string to enum byte. -pub fn encode_availability(s: Option<&str>) -> u8 { - match s { - Some("Private") => 1, - Some("Unsearchable") => 2, - _ => 0, // "Public" or unknown - } -} - -/// Decode availability enum byte to string. -pub fn decode_availability(v: u8) -> &'static str { - match v { - 1 => "Private", - 2 => "Unsearchable", - _ => "Public", - } -} - -/// Encode blocked_for string to enum byte. 
-pub fn encode_blocked_for(s: Option<&str>) -> u8 { - match s { - None => 0, - Some("") => 0, - Some(_) => 1, // any non-empty value - } -} - -// Well-known base model strings → enum byte (for filter bitmaps). -// Not exhaustive — new base models get 0 (unknown). -pub fn encode_base_model(s: Option<&str>) -> u8 { - match s { - None | Some("") => 0, - Some("SD 1.5") => 1, - Some("SD 2.1") => 2, - Some("SDXL 1.0") => 3, - Some("Pony") => 4, - Some("Flux.1 D") => 5, - Some("Flux.1 S") => 6, - Some("SD 3.5 Large") => 7, - Some("Illustrious") => 8, - Some("Hunyuan 1") => 9, - Some("SD 3.5 Medium") => 10, - Some(_) => 255, // known but unmapped - } -} - -pub fn decode_base_model(v: u8) -> &'static str { - match v { - 0 => "", - 1 => "SD 1.5", - 2 => "SD 2.1", - 3 => "SDXL 1.0", - 4 => "Pony", - 5 => "Flux.1 D", - 6 => "Flux.1 S", - 7 => "SD 3.5 Large", - 8 => "Illustrious", - 9 => "Hunyuan 1", - 10 => "SD 3.5 Medium", - _ => "Other", - } -} - -// --------------------------------------------------------------------------- -// SlotData → JSON for docstore encoding -// --------------------------------------------------------------------------- - -/// Convert SlotData to a serde_json::Value matching the Bitdex data schema. -/// Used during finalization to produce docstore-compatible documents. 
-fn slot_data_to_json(slot: &SlotData) -> serde_json::Value { - let mut doc = serde_json::json!({ - "id": slot.image_id as i64, - "nsfwLevel": slot.nsfw_level as i64, - "userId": slot.user_id as i64, - "postId": slot.post_id as i64, - "postedToId": slot.posted_to_id as i64, - "type": decode_image_type(slot.image_type), - "baseModel": decode_base_model(slot.base_model), - "availability": decode_availability(slot.availability), - "tagIds": slot.tag_ids.iter().map(|&t| t as i64).collect::>(), - "modelVersionIds": slot.model_version_ids.iter().map(|&t| t as i64).collect::>(), - "modelVersionIdsManual": slot.model_version_ids_manual.iter().map(|&t| t as i64).collect::>(), - "toolIds": slot.tool_ids.iter().map(|&t| t as i64).collect::>(), - "techniqueIds": slot.technique_ids.iter().map(|&t| t as i64).collect::>(), - "reactionCount": slot.reaction_count as i64, - "commentCount": slot.comment_count as i64, - "collectedCount": slot.collected_count as i64, - "sortAt": slot.sort_at as i64, - "publishedAt": (slot.published_at_unix / 1000) as i64, - }); - - if let Some(obj) = doc.as_object_mut() { - // Exists-boolean: isPublished = publishedAt is non-zero (matches outbox row_assembler) - if slot.published_at_unix > 0 { - obj.insert("isPublished".into(), serde_json::json!(true)); - } - if slot.has_meta { - obj.insert("hasMeta".into(), serde_json::json!(true)); - } - if slot.on_site { - obj.insert("onSite".into(), serde_json::json!(true)); - } - if slot.poi { - obj.insert("poi".into(), serde_json::json!(true)); - } - if slot.minor { - obj.insert("minor".into(), serde_json::json!(true)); - } - if let Some(ref url) = slot.url { - obj.insert("url".into(), serde_json::json!(url)); - } - if let Some(ref hash) = slot.hash { - obj.insert("hash".into(), serde_json::json!(hash)); - } - if slot.blocked_for > 0 { - obj.insert("blockedFor".into(), serde_json::json!("blocked")); - } - } - - doc -} - -// --------------------------------------------------------------------------- -// Tests -// 
--------------------------------------------------------------------------- - -#[cfg(test)] -mod tests { - use super::*; - use tempfile::tempdir; - - #[test] - fn test_write_read_scalars() { - let dir = tempdir().unwrap(); - let arena = SlotArena::new(100, &dir.path().join("slots.bin")).unwrap(); - - arena.write_scalars( - 42, 12345, 16, 999, 0, 1700000000, true, false, - Some(b"https://example.com/img.jpg"), - Some(b"abc123hash"), - true, false, - 100, 200, 0, 0, 1700000000000, - ); - - let overflow = std::collections::HashMap::new(); - let slot = arena.read_slot(42, &overflow).unwrap(); - - assert_eq!(slot.image_id, 12345); - assert_eq!(slot.nsfw_level, 16); - assert_eq!(slot.user_id, 999); - assert_eq!(slot.image_type, 0); - assert_eq!(slot.sort_at, 1700000000); - assert!(slot.poi); - assert!(!slot.minor); - assert_eq!(slot.url.as_deref(), Some("https://example.com/img.jpg")); - assert_eq!(slot.hash.as_deref(), Some("abc123hash")); - assert!(slot.has_meta); - assert!(!slot.on_site); - assert_eq!(slot.post_id, 100); - assert_eq!(slot.posted_to_id, 200); - assert_eq!(slot.availability, 0); - assert_eq!(slot.published_at_unix, 1700000000000); - } - - #[test] - fn test_write_tags_inline() { - let dir = tempdir().unwrap(); - let arena = SlotArena::new(10, &dir.path().join("slots.bin")).unwrap(); - - let tags: Vec = (100..110).collect(); - arena.write_tags(5, &tags); - - let overflow = std::collections::HashMap::new(); - let slot = arena.read_slot(5, &overflow).unwrap(); - assert_eq!(slot.tag_ids, tags); - } - - #[test] - fn test_write_tags_overflow() { - let dir = tempdir().unwrap(); - let arena = SlotArena::new(10, &dir.path().join("slots.bin")).unwrap(); - - // Write 60 tags — 48 inline + 12 overflow - let tags: Vec = (100..160).collect(); - arena.write_tags(3, &tags); - - // Build overflow map - let overflow_entries = arena.overflow.lock().unwrap(); - let mut overflow_map: std::collections::HashMap> = - std::collections::HashMap::new(); - for entry in 
overflow_entries.iter() { - overflow_map.entry(entry.slot).or_default().push(entry); - } - - let slot = arena.read_slot(3, &overflow_map).unwrap(); - assert_eq!(slot.tag_ids.len(), 60); - assert_eq!(slot.tag_ids, tags); - } - - #[test] - fn test_write_tags_incremental() { - let dir = tempdir().unwrap(); - let arena = SlotArena::new(10, &dir.path().join("slots.bin")).unwrap(); - - // Write tags incrementally (simulating tag stream) - arena.write_tags(2, &[100, 101, 102]); - arena.write_tags(2, &[200, 201]); - - let overflow = std::collections::HashMap::new(); - let slot = arena.read_slot(2, &overflow).unwrap(); - assert_eq!(slot.tag_ids, vec![100, 101, 102, 200, 201]); - } - - #[test] - fn test_concurrent_field_writes() { - let dir = tempdir().unwrap(); - let arena = SlotArena::new(1000, &dir.path().join("slots.bin")).unwrap(); - - std::thread::scope(|s| { - // Thread 1: write scalars - s.spawn(|| { - for slot in 0..1000u32 { - arena.write_scalars( - slot, slot as u64, 16, slot as u64 * 7, 0, - 1700000000, false, false, - Some(b"url"), Some(b"hash"), - false, false, 100, 0, 0, 0, 0, - ); - } - }); - - // Thread 2: write tags - s.spawn(|| { - for slot in 0..1000u32 { - arena.write_tags(slot, &[slot + 1000, slot + 2000]); - } - }); - - // Thread 3: write tools - s.spawn(|| { - for slot in 0..1000u32 { - arena.write_tools(slot, &[slot + 5000]); - } - }); - }); - - // Verify all fields present - let overflow = std::collections::HashMap::new(); - for slot in 0..1000u32 { - let data = arena.read_slot(slot, &overflow).unwrap(); - assert_eq!(data.image_id, slot as u64); - assert_eq!(data.tag_ids.len(), 2); - assert_eq!(data.tool_ids.len(), 1); - } - } - - #[test] - fn test_present_mask_atomic() { - let dir = tempdir().unwrap(); - let arena = SlotArena::new(10, &dir.path().join("slots.bin")).unwrap(); - - // Multiple threads OR-ing different bits into the same slot - std::thread::scope(|s| { - for bit in 0..20u64 { - let arena_ref = &arena; - s.spawn(move || { - 
arena_ref.or_present_mask(5, 1 << bit); - }); - } - }); - - let base = 5 * SLOT_SIZE + OFF_PRESENT; - let mask = arena.read_u64(base); - // All 20 bits should be set - for bit in 0..20u64 { - assert!(mask & (1 << bit) != 0, "bit {bit} not set"); - } - } - - #[test] - fn test_empty_slot_returns_none() { - let dir = tempdir().unwrap(); - let arena = SlotArena::new(10, &dir.path().join("slots.bin")).unwrap(); - - let overflow = std::collections::HashMap::new(); - assert!(arena.read_slot(7, &overflow).is_none()); - } - - #[test] - fn test_memory_usage_reporting() { - let dir = tempdir().unwrap(); - let arena = SlotArena::new(100, &dir.path().join("slots.bin")).unwrap(); - - let (arena_bytes, overflow_bytes) = arena.memory_usage(); - assert_eq!(arena_bytes, 101 * SLOT_SIZE); - assert_eq!(overflow_bytes, 0); - - // Add some overflow - arena.write_tags(1, &vec![0u32; 60]); // 48 inline + 12 overflow - let (_, overflow_bytes) = arena.memory_usage(); - assert!(overflow_bytes > 0); - } -}