From d3dfc5c4466f7d90db1c237c63e83c6b220fcdf9 Mon Sep 17 00:00:00 2001 From: johnny Date: Thu, 19 Feb 2026 23:30:26 -0500 Subject: [PATCH 1/4] add order book implementation --- Cargo.lock | 13 + Cargo.toml | 4 + crates/aedb-orderbook/Cargo.toml | 18 + .../aedb-orderbook/src/bin/orderbook_perf.rs | 243 ++ .../src/bin/orderbook_perf_guard.rs | 114 + crates/aedb-orderbook/src/lib.rs | 630 +++++ .../tests/correctness_matrix.rs | 69 + .../aedb-orderbook/tests/simulation_smoke.rs | 206 ++ src/catalog/mod.rs | 7 +- src/commit/apply.rs | 145 ++ src/commit/executor/internals.rs | 271 ++- src/commit/executor/mod.rs | 76 +- src/commit/validation.rs | 355 +++ src/lib.rs | 508 ++++ src/order_book.rs | 2164 +++++++++++++++++ src/preflight/mod.rs | 168 ++ src/storage/keyspace.rs | 127 +- tests/order_book_native.rs | 697 ++++++ tests/order_book_simulation.rs | 322 +++ 19 files changed, 6098 insertions(+), 39 deletions(-) create mode 100644 crates/aedb-orderbook/Cargo.toml create mode 100644 crates/aedb-orderbook/src/bin/orderbook_perf.rs create mode 100644 crates/aedb-orderbook/src/bin/orderbook_perf_guard.rs create mode 100644 crates/aedb-orderbook/src/lib.rs create mode 100644 crates/aedb-orderbook/tests/correctness_matrix.rs create mode 100644 crates/aedb-orderbook/tests/simulation_smoke.rs create mode 100644 src/order_book.rs create mode 100644 tests/order_book_native.rs create mode 100644 tests/order_book_simulation.rs diff --git a/Cargo.lock b/Cargo.lock index 460ad15..756838b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -48,6 +48,19 @@ dependencies = [ "zstd", ] +[[package]] +name = "aedb-orderbook" +version = "0.1.0" +dependencies = [ + "aedb", + "primitive-types", + "rand 0.8.5", + "rmp-serde", + "serde", + "tempfile", + "tokio", +] + [[package]] name = "aes" version = "0.8.4" diff --git a/Cargo.toml b/Cargo.toml index 66842cd..2cebd57 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -17,6 +17,10 @@ exclude = [ ".github/*", ] +[workspace] +members = [".", 
"crates/aedb-orderbook"] +resolver = "2" + [dependencies] serde = { version = "1", features = ["derive"] } serde_json = "1" diff --git a/crates/aedb-orderbook/Cargo.toml b/crates/aedb-orderbook/Cargo.toml new file mode 100644 index 0000000..a926c96 --- /dev/null +++ b/crates/aedb-orderbook/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "aedb-orderbook" +version = "0.1.0" +edition = "2024" +description = "Order book simulation and stress utilities for AEDB" +license = "MIT OR Apache-2.0" + +[dependencies] +aedb = { path = "../.." } +rand = "0.8" +primitive-types = "0.12" +tokio = { version = "1", features = ["full"] } +tempfile = "3" +serde = { version = "1", features = ["derive"] } +rmp-serde = "1" + +[dev-dependencies] +tokio = { version = "1", features = ["macros", "rt-multi-thread"] } diff --git a/crates/aedb-orderbook/src/bin/orderbook_perf.rs b/crates/aedb-orderbook/src/bin/orderbook_perf.rs new file mode 100644 index 0000000..b4c96b5 --- /dev/null +++ b/crates/aedb-orderbook/src/bin/orderbook_perf.rs @@ -0,0 +1,243 @@ +use aedb::config::DurabilityMode; +use aedb_orderbook::{ + MatchWorkload, OrderFlowProfile, SimulationConfig, TableProfile, + run_hft_simulation_with_config, tuned_simulation_config_with_durability, +}; + +#[derive(Debug, Clone)] +struct Scenario { + name: &'static str, + assets: Vec, + traders: usize, + ops_per_trader: usize, + flow: OrderFlowProfile, + table: TableProfile, + durability: DurabilityMode, + collect_latency: bool, + lifecycle_every_ops: usize, + orders_per_commit: usize, + match_workload: MatchWorkload, +} + +fn scenario_matrix(scale: &str) -> Vec { + if scale == "max" { + let assets = vec![ + "BTC-USD".into(), + "ETH-USD".into(), + "SOL-USD".into(), + "DOGE-USD".into(), + "XRP-USD".into(), + "ADA-USD".into(), + "LTC-USD".into(), + "BNB-USD".into(), + ]; + let mut out = Vec::new(); + for traders in [8usize, 12, 16] { + for orders_per_commit in [16usize, 32, 64] { + out.push(Scenario { + name: Box::leak( + 
format!("max_tps_batch_limit_multi_asset_t{traders}_b{orders_per_commit}") + .into_boxed_str(), + ), + assets: assets.clone(), + traders, + ops_per_trader: 1_000, + flow: OrderFlowProfile::LimitOnlyIoc, + table: TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + durability: DurabilityMode::Batch, + collect_latency: false, + lifecycle_every_ops: 0, + orders_per_commit, + match_workload: MatchWorkload::NoCrossIoc, + }); + } + } + return out; + } + let (traders, ops) = if scale == "stress" { + (20, 1_000) + } else { + (10, 400) + }; + vec![ + Scenario { + name: "per_asset_limit_full", + assets: vec!["BTC-USD".into(), "ETH-USD".into(), "SOL-USD".into()], + traders, + ops_per_trader: ops, + flow: OrderFlowProfile::LimitOnlyIoc, + table: TableProfile::PerAssetTable, + durability: DurabilityMode::Full, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + Scenario { + name: "per_asset_limit_batch", + assets: vec!["BTC-USD".into(), "ETH-USD".into(), "SOL-USD".into()], + traders, + ops_per_trader: ops, + flow: OrderFlowProfile::LimitOnlyIoc, + table: TableProfile::PerAssetTable, + durability: DurabilityMode::Batch, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + Scenario { + name: "per_asset_mixed_full", + assets: vec!["BTC-USD".into(), "ETH-USD".into(), "SOL-USD".into()], + traders, + ops_per_trader: ops, + flow: OrderFlowProfile::MixedMarketAndLimit, + table: TableProfile::PerAssetTable, + durability: DurabilityMode::Full, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + Scenario { + name: "per_asset_mixed_batch", + assets: vec!["BTC-USD".into(), "ETH-USD".into(), "SOL-USD".into()], + traders, + ops_per_trader: ops, + flow: OrderFlowProfile::MixedMarketAndLimit, + table: 
TableProfile::PerAssetTable, + durability: DurabilityMode::Batch, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + Scenario { + name: "multi_asset_mixed_full", + assets: vec![ + "BTC-USD".into(), + "ETH-USD".into(), + "SOL-USD".into(), + "DOGE-USD".into(), + ], + traders, + ops_per_trader: ops, + flow: OrderFlowProfile::MixedMarketAndLimit, + table: TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + durability: DurabilityMode::Full, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + Scenario { + name: "multi_asset_mixed_batch", + assets: vec![ + "BTC-USD".into(), + "ETH-USD".into(), + "SOL-USD".into(), + "DOGE-USD".into(), + ], + traders, + ops_per_trader: ops, + flow: OrderFlowProfile::MixedMarketAndLimit, + table: TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + durability: DurabilityMode::Batch, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + Scenario { + name: "single_asset_contention_mixed_full", + assets: vec!["BTC-USD".into()], + traders: traders.saturating_mul(2), + ops_per_trader: ops, + flow: OrderFlowProfile::MixedMarketAndLimit, + table: TableProfile::PerAssetTable, + durability: DurabilityMode::Full, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + Scenario { + name: "single_asset_contention_mixed_batch", + assets: vec!["BTC-USD".into()], + traders: traders.saturating_mul(2), + ops_per_trader: ops, + flow: OrderFlowProfile::MixedMarketAndLimit, + table: TableProfile::PerAssetTable, + durability: DurabilityMode::Batch, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + ] +} + +fn 
cfg_for_scenario(s: &Scenario) -> (SimulationConfig, aedb::config::AedbConfig) { + let db_cfg = tuned_simulation_config_with_durability(s.durability); + let sim_cfg = SimulationConfig { + assets: s.assets.clone(), + traders: s.traders, + ops_per_trader: s.ops_per_trader, + seed: 42, + flow_profile: s.flow.clone(), + table_profile: s.table.clone(), + collect_latency: s.collect_latency, + lifecycle_every_ops: s.lifecycle_every_ops, + orders_per_commit: s.orders_per_commit, + match_workload: s.match_workload.clone(), + }; + (sim_cfg, db_cfg) +} + +#[tokio::main(flavor = "multi_thread")] +async fn main() { + let scale = std::env::args() + .nth(1) + .unwrap_or_else(|| "quick".to_string()); + if scale != "quick" && scale != "stress" && scale != "max" { + eprintln!("usage: cargo run -p aedb-orderbook --bin orderbook_perf [quick|stress|max]"); + std::process::exit(2); + } + + println!( + "scenario,attempted,accepted,rejected,elapsed_ms,attempted_ops_s,accepted_ops_s,rejected_ops_s,lat_avg_us,lat_p50_us,lat_p95_us,lat_p99_us,lat_max_us,max_finality_gap,visible_head,durable_head,zero_dropped,durability" + ); + for scenario in scenario_matrix(&scale) { + let (sim_cfg, db_cfg) = cfg_for_scenario(&scenario); + let report = run_hft_simulation_with_config(sim_cfg, db_cfg) + .await + .unwrap_or_else(|e| panic!("scenario {} failed: {e}", scenario.name)); + println!( + "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}", + scenario.name, + report.simulation.attempted_orders, + report.simulation.accepted_orders, + report.simulation.rejected_orders, + report.elapsed_ms, + report.attempted_ops_per_sec, + report.accepted_ops_per_sec, + report.rejected_ops_per_sec, + report.latency.avg_us, + report.latency.p50_us, + report.latency.p95_us, + report.latency.p99_us, + report.latency.max_us, + report.max_commit_finality_gap, + report.simulation.visible_head_seq, + report.simulation.durable_head_seq, + report.zero_dropped_orders, + report.durability_mode, + ); + } +} diff --git 
a/crates/aedb-orderbook/src/bin/orderbook_perf_guard.rs b/crates/aedb-orderbook/src/bin/orderbook_perf_guard.rs new file mode 100644 index 0000000..5230837 --- /dev/null +++ b/crates/aedb-orderbook/src/bin/orderbook_perf_guard.rs @@ -0,0 +1,114 @@ +use aedb::config::DurabilityMode; +use aedb_orderbook::{ + MatchWorkload, OrderFlowProfile, SimulationConfig, TableProfile, + run_hft_simulation_with_config, tuned_simulation_config_with_durability, +}; + +#[derive(Clone)] +struct Scenario { + name: &'static str, + cfg: SimulationConfig, + min_attempted_tps: u64, +} + +fn scenarios() -> Vec { + vec![ + Scenario { + name: "per_asset_mixed_batch", + cfg: SimulationConfig { + assets: vec!["BTC-USD".into(), "ETH-USD".into(), "SOL-USD".into()], + traders: 10, + ops_per_trader: 400, + seed: 42, + flow_profile: OrderFlowProfile::MixedMarketAndLimit, + table_profile: TableProfile::PerAssetTable, + collect_latency: false, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + min_attempted_tps: 3_500, + }, + Scenario { + name: "multi_asset_mixed_batch", + cfg: SimulationConfig { + assets: vec![ + "BTC-USD".into(), + "ETH-USD".into(), + "SOL-USD".into(), + "DOGE-USD".into(), + ], + traders: 10, + ops_per_trader: 400, + seed: 42, + flow_profile: OrderFlowProfile::MixedMarketAndLimit, + table_profile: TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + collect_latency: false, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + min_attempted_tps: 3_000, + }, + Scenario { + name: "single_asset_contention_mixed_batch", + cfg: SimulationConfig { + assets: vec!["BTC-USD".into()], + traders: 20, + ops_per_trader: 400, + seed: 42, + flow_profile: OrderFlowProfile::MixedMarketAndLimit, + table_profile: TableProfile::PerAssetTable, + collect_latency: false, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: 
MatchWorkload::CrossingNearTouch, + }, + min_attempted_tps: 1_800, + }, + ] +} + +#[tokio::main(flavor = "multi_thread")] +async fn main() { + let db_cfg = tuned_simulation_config_with_durability(DurabilityMode::Batch); + let mut failures = Vec::new(); + + println!( + "scenario,attempted_tps,accepted_tps,rejected_tps,min_attempted_tps,zero_dropped,max_finality_gap" + ); + for s in scenarios() { + let report = run_hft_simulation_with_config(s.cfg.clone(), db_cfg.clone()) + .await + .unwrap_or_else(|e| panic!("scenario {} failed to run: {e}", s.name)); + println!( + "{},{},{},{},{},{},{}", + s.name, + report.attempted_ops_per_sec, + report.accepted_ops_per_sec, + report.rejected_ops_per_sec, + s.min_attempted_tps, + report.zero_dropped_orders, + report.max_commit_finality_gap + ); + if report.attempted_ops_per_sec < s.min_attempted_tps || !report.zero_dropped_orders { + failures.push(format!( + "{}: attempted_tps={} (min={}), zero_dropped={}", + s.name, + report.attempted_ops_per_sec, + s.min_attempted_tps, + report.zero_dropped_orders + )); + } + } + + if !failures.is_empty() { + eprintln!("performance/correctness guard failed:"); + for f in failures { + eprintln!(" - {f}"); + } + std::process::exit(1); + } +} diff --git a/crates/aedb-orderbook/src/lib.rs b/crates/aedb-orderbook/src/lib.rs new file mode 100644 index 0000000..4ea1701 --- /dev/null +++ b/crates/aedb-orderbook/src/lib.rs @@ -0,0 +1,630 @@ +use aedb::AedbInstance; +use aedb::commit::validation::Mutation; +use aedb::config::{AedbConfig, DurabilityMode}; +use aedb::error::AedbError; +use aedb::order_book::{ + ExecInstruction, InstrumentConfig, OrderBookTableMode, OrderRequest, OrderSide, OrderStatus, + OrderType, TimeInForce, parse_plqty_price, scoped_instrument, +}; +use aedb::query::plan::ConsistencyMode; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; +use serde::{Deserialize, Serialize}; +use std::collections::BTreeMap; +use std::sync::Arc; +use std::time::Instant; +use tempfile::tempdir; + 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SimulationConfig { + pub assets: Vec, + pub traders: usize, + pub ops_per_trader: usize, + pub seed: u64, + pub flow_profile: OrderFlowProfile, + pub table_profile: TableProfile, + pub collect_latency: bool, + pub lifecycle_every_ops: usize, + pub orders_per_commit: usize, + pub match_workload: MatchWorkload, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum OrderFlowProfile { + LimitOnlyIoc, + MixedMarketAndLimit, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum TableProfile { + NativeInstrument, + PerAssetTable, + MultiAssetTable { table_id: String }, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum MatchWorkload { + CrossingNearTouch, + NoCrossIoc, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct SimulationReport { + pub assets: Vec, + pub instruments: Vec, + pub traders: usize, + pub ops_per_trader: usize, + pub attempted_orders: usize, + pub accepted_orders: usize, + pub rejected_orders: usize, + pub max_commit_seq: u64, + pub visible_head_seq: u64, + pub durable_head_seq: u64, + pub flow_profile: OrderFlowProfile, + pub table_profile: TableProfile, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct LatencyStats { + pub samples: usize, + pub avg_us: u64, + pub p50_us: u64, + pub p95_us: u64, + pub p99_us: u64, + pub max_us: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct ProfiledSimulationReport { + pub simulation: SimulationReport, + pub elapsed_ms: u64, + pub attempted_ops_per_sec: u64, + pub accepted_ops_per_sec: u64, + pub rejected_ops_per_sec: u64, + pub max_commit_finality_gap: u64, + pub zero_dropped_orders: bool, + pub latency: LatencyStats, + pub durability_mode: String, +} + +pub fn high_throughput_simulation_config() -> AedbConfig { + AedbConfig { + durability_mode: DurabilityMode::Batch, + 
batch_interval_ms: 20, + batch_max_bytes: 2 * 1024 * 1024, + max_inflight_commits: 4096, + max_commit_queue_bytes: 256 * 1024 * 1024, + epoch_max_wait_us: 250, + epoch_min_commits: 16, + epoch_max_commits: 1024, + adaptive_epoch_min_commits_floor: 8, + adaptive_epoch_min_commits_ceiling: 1024, + adaptive_epoch_wait_us_floor: 25, + adaptive_epoch_wait_us_ceiling: 5_000, + adaptive_epoch_target_latency_us: 5_000, + prestage_shards: 16, + ..AedbConfig::default() + } +} + +pub fn tuned_simulation_config_with_durability(durability_mode: DurabilityMode) -> AedbConfig { + AedbConfig { + durability_mode, + ..high_throughput_simulation_config() + } +} + +fn u256_be(v: u64) -> [u8; 32] { + let mut out = [0u8; 32]; + out[24..].copy_from_slice(&v.to_be_bytes()); + out +} + +fn decode_u256_u64(bytes: [u8; 32]) -> u64 { + let mut out = [0u8; 8]; + out.copy_from_slice(&bytes[24..]); + u64::from_be_bytes(out) +} + +fn decode_u256_bytes_to_u64(bytes: &[u8]) -> u64 { + assert_eq!(bytes.len(), 32); + let mut out = [0u8; 8]; + out.copy_from_slice(&bytes[24..]); + u64::from_be_bytes(out) +} + +fn request( + instrument: &str, + owner: &str, + client_order_id: String, + side: OrderSide, + order_type: OrderType, + tif: TimeInForce, + post_only: bool, + price_ticks: i64, + qty: u64, + nonce: u64, +) -> OrderRequest { + OrderRequest { + instrument: instrument.to_string(), + client_order_id, + side, + order_type, + time_in_force: tif, + exec_instructions: ExecInstruction(if post_only { + ExecInstruction::POST_ONLY + } else { + 0 + }), + self_trade_prevention: aedb::order_book::SelfTradePrevention::None, + price_ticks, + qty_be: u256_be(qty), + owner: owner.to_string(), + account: None, + nonce, + price_limit_ticks: None, + } +} + +async fn setup_books( + db: &AedbInstance, + assets: &[String], + table_profile: &TableProfile, +) -> Result, AedbError> { + db.create_project("p").await.expect("project"); + let mut instruments = Vec::with_capacity(assets.len()); + let multi_table_id = match 
table_profile { + TableProfile::MultiAssetTable { table_id } => Some(table_id.clone()), + _ => None, + }; + if let Some(table_id) = &multi_table_id { + db.order_book_define_table("p", "app", table_id, OrderBookTableMode::MultiAsset) + .await?; + } + for asset in assets { + let instrument = match table_profile { + TableProfile::NativeInstrument => asset.clone(), + TableProfile::PerAssetTable => { + db.order_book_define_table("p", "app", asset, OrderBookTableMode::PerAsset) + .await?; + scoped_instrument(asset, asset) + } + TableProfile::MultiAssetTable { table_id } => scoped_instrument(table_id, asset), + }; + instruments.push(instrument.clone()); + db.order_book_set_instrument_config( + "p", + "app", + &instrument, + InstrumentConfig { + instrument: instrument.clone(), + tick_size: 1, + lot_size_be: u256_be(1), + min_price_ticks: 1, + max_price_ticks: 1_000_000, + market_order_price_band: Some(50), + halted: false, + balance_config: None, + }, + ) + .await?; + + for i in 0..20_u64 { + let ask_owner = format!("seed_ask_{}_{}", instrument, i); + db.order_book_new( + "p", + "app", + request( + &instrument, + &ask_owner, + format!("seed-a-{i}"), + OrderSide::Ask, + OrderType::Limit, + TimeInForce::Gtc, + false, + 1_000 + i as i64, + 10, + 1, + ), + ) + .await?; + + let bid_owner = format!("seed_bid_{}_{}", instrument, i); + db.order_book_new( + "p", + "app", + request( + &instrument, + &bid_owner, + format!("seed-b-{i}"), + OrderSide::Bid, + OrderType::Limit, + TimeInForce::Gtc, + false, + 999 - i as i64, + 10, + 1, + ), + ) + .await?; + } + } + Ok(instruments) +} + +async fn assert_book_invariants(db: &AedbInstance, instruments: &[String]) { + for instrument in instruments { + let mut from_orders: BTreeMap<(u8, i64), u64> = BTreeMap::new(); + + let rows = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{instrument}:ord:").as_bytes(), + 1_000_000, + ConsistencyMode::AtLatest, + ) + .await + .expect("scan orders"); + + for (_, entry) in rows { + let order: 
aedb::order_book::OrderRecord = + rmp_serde::from_slice(&entry.value).expect("decode order"); + let original = decode_u256_u64(order.original_qty_be); + let remaining = decode_u256_u64(order.remaining_qty_be); + let filled = decode_u256_u64(order.filled_qty_be); + assert!( + remaining + filled <= original, + "quantity accounting invariant" + ); + if remaining > 0 + && matches!( + order.status, + OrderStatus::Open | OrderStatus::PartiallyFilled + ) + { + *from_orders + .entry((order.side as u8, order.price_ticks)) + .or_insert(0) += remaining; + } + } + + let mut from_levels: BTreeMap<(u8, i64), u64> = BTreeMap::new(); + for side in [OrderSide::Bid, OrderSide::Ask] { + let levels = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{instrument}:plqty:{}:", side as u8).as_bytes(), + 1_000_000, + ConsistencyMode::AtLatest, + ) + .await + .expect("scan levels"); + for (k, v) in levels { + let qty = decode_u256_bytes_to_u64(&v.value); + if qty == 0 { + continue; + } + let price = parse_plqty_price(side, &k).expect("parse level price"); + from_levels.insert((side as u8, price), qty); + } + } + + assert_eq!( + from_orders, from_levels, + "price-level aggregates must match open orders for {instrument}" + ); + } +} + +pub async fn run_hft_simulation(cfg: SimulationConfig) -> Result { + run_hft_simulation_with_config(cfg, high_throughput_simulation_config()) + .await + .map(|r| r.simulation) +} + +pub async fn run_hft_simulation_with_config( + cfg: SimulationConfig, + db_cfg: AedbConfig, +) -> Result { + let durability_mode_name = match db_cfg.durability_mode { + DurabilityMode::Full => "full", + DurabilityMode::Batch => "batch", + DurabilityMode::OsBuffered => "os_buffered", + } + .to_string(); + let dir = tempdir().expect("temp"); + let db = Arc::new(AedbInstance::open(db_cfg, dir.path()).expect("open")); + let instruments = setup_books(&db, &cfg.assets, &cfg.table_profile).await?; + let run_started = Instant::now(); + + let mut tasks = 
Vec::with_capacity(cfg.traders); + for t in 0..cfg.traders { + let db_clone = Arc::clone(&db); + let instruments_clone = instruments.clone(); + let flow_profile = cfg.flow_profile.clone(); + let match_workload = cfg.match_workload.clone(); + let seed = cfg.seed; + let ops_per_trader = cfg.ops_per_trader; + let orders_per_commit = cfg.orders_per_commit.max(1); + tasks.push(tokio::spawn(async move { + let owner = format!("trader_{t}"); + let mut nonces: BTreeMap = BTreeMap::new(); + let mut rng = StdRng::seed_from_u64(seed + t as u64); + let mut accepted = 0usize; + let mut rejected = 0usize; + let mut max_commit_seq = 0u64; + let mut max_finality_gap = 0u64; + let mut latencies_us = if cfg.collect_latency { + Vec::with_capacity(ops_per_trader) + } else { + Vec::new() + }; + let mut pending_mutations = Vec::with_capacity(orders_per_commit); + let mut pending_orders = 0usize; + let mut pending_started: Option = None; + + for op in 0..ops_per_trader { + let instrument = &instruments_clone[rng.gen_range(0..instruments_clone.len())]; + let nonce = nonces.entry(instrument.clone()).or_insert(0); + *nonce += 1; + + let side = if rng.gen_bool(0.5) { + OrderSide::Bid + } else { + OrderSide::Ask + }; + let mut price = match match_workload { + MatchWorkload::CrossingNearTouch => 995 + rng.gen_range(0..12) as i64, + MatchWorkload::NoCrossIoc => { + if side == OrderSide::Bid { + 900 + } else { + 1_100 + } + } + }; + let qty = 1 + rng.gen_range(0..5) as u64; + let (order_type, tif) = match flow_profile { + OrderFlowProfile::LimitOnlyIoc => (OrderType::Limit, TimeInForce::Ioc), + OrderFlowProfile::MixedMarketAndLimit => { + let tif = if rng.gen_bool(0.7) { + TimeInForce::Ioc + } else { + TimeInForce::Fok + }; + if rng.gen_bool(0.15) { + (OrderType::Market, tif) + } else { + (OrderType::Limit, tif) + } + } + }; + let post_only = matches!(match_workload, MatchWorkload::CrossingNearTouch) + && order_type == OrderType::Limit + && rng.gen_bool(0.05); + if order_type == OrderType::Market 
{ + price = 0; + } + + if pending_mutations.is_empty() { + pending_started = Some(Instant::now()); + } + pending_mutations.push(Mutation::OrderBookNew { + project_id: "p".to_string(), + scope_id: "app".to_string(), + request: request( + instrument, + &owner, + format!("{owner}-{op}"), + side, + order_type, + tif, + post_only, + price, + qty, + *nonce, + ), + }); + pending_orders += 1; + if pending_orders >= orders_per_commit { + if let Err(other) = flush_pending_orders( + db_clone.as_ref(), + &mut pending_mutations, + &mut pending_orders, + &mut pending_started, + cfg.collect_latency, + &mut latencies_us, + &mut accepted, + &mut rejected, + &mut max_commit_seq, + &mut max_finality_gap, + ) + .await + { + panic!("unexpected simulation error: {other:?}"); + } + } + + if cfg.lifecycle_every_ops > 0 && op % cfg.lifecycle_every_ops == 0 { + *nonce += 1; + let cid = format!("gtc-{owner}-{op}"); + let _ = db_clone + .order_book_new( + "p", + "app", + request( + instrument, + &owner, + cid.clone(), + side, + OrderType::Limit, + TimeInForce::Gtc, + false, + price, + qty, + *nonce, + ), + ) + .await; + let _ = db_clone + .order_book_cancel_by_client_id("p", "app", instrument, &cid, &owner) + .await; + } + } + if let Err(other) = flush_pending_orders( + db_clone.as_ref(), + &mut pending_mutations, + &mut pending_orders, + &mut pending_started, + cfg.collect_latency, + &mut latencies_us, + &mut accepted, + &mut rejected, + &mut max_commit_seq, + &mut max_finality_gap, + ) + .await + { + panic!("unexpected simulation error: {other:?}"); + } + ( + accepted, + rejected, + max_commit_seq, + max_finality_gap, + latencies_us, + ) + })); + } + + let mut accepted_orders = 0usize; + let mut rejected_orders = 0usize; + let mut max_commit_seq = 0u64; + let mut max_commit_finality_gap = 0u64; + let mut all_latencies_us = Vec::new(); + for task in tasks { + let (a, r, max_seq, max_gap, mut latencies) = task.await.expect("task join"); + accepted_orders += a; + rejected_orders += r; + 
max_commit_seq = max_commit_seq.max(max_seq); + max_commit_finality_gap = max_commit_finality_gap.max(max_gap); + all_latencies_us.append(&mut latencies); + } + + assert_book_invariants(&db, &instruments).await; + let elapsed_ms = run_started.elapsed().as_millis().max(1) as u64; + db.force_fsync().await?; + let heads = db.head_state().await; + assert!( + heads.visible_head_seq >= max_commit_seq, + "visible head must include all accepted commits" + ); + assert!( + heads.durable_head_seq >= max_commit_seq, + "durable head must include all accepted commits after final fsync" + ); + + all_latencies_us.sort_unstable(); + let latency = summarize_latency(&all_latencies_us); + let attempted_orders = cfg.traders * cfg.ops_per_trader; + let attempted_ops_per_sec = (attempted_orders as u64).saturating_mul(1000) / elapsed_ms; + let accepted_ops_per_sec = (accepted_orders as u64).saturating_mul(1000) / elapsed_ms; + let rejected_ops_per_sec = (rejected_orders as u64).saturating_mul(1000) / elapsed_ms; + + let simulation = SimulationReport { + assets: cfg.assets, + instruments, + traders: cfg.traders, + ops_per_trader: cfg.ops_per_trader, + attempted_orders, + accepted_orders, + rejected_orders, + max_commit_seq, + visible_head_seq: heads.visible_head_seq, + durable_head_seq: heads.durable_head_seq, + flow_profile: cfg.flow_profile, + table_profile: cfg.table_profile, + }; + Ok(ProfiledSimulationReport { + zero_dropped_orders: simulation.accepted_orders + simulation.rejected_orders + == simulation.attempted_orders, + simulation, + elapsed_ms, + attempted_ops_per_sec, + accepted_ops_per_sec, + rejected_ops_per_sec, + max_commit_finality_gap, + latency, + durability_mode: durability_mode_name, + }) +} + +fn summarize_latency(sorted_latencies_us: &[u64]) -> LatencyStats { + if sorted_latencies_us.is_empty() { + return LatencyStats { + samples: 0, + avg_us: 0, + p50_us: 0, + p95_us: 0, + p99_us: 0, + max_us: 0, + }; + } + let samples = sorted_latencies_us.len(); + let sum: u128 = 
sorted_latencies_us.iter().map(|v| *v as u128).sum(); + let percentile = |p: f64| -> u64 { + let idx = ((samples as f64 - 1.0) * p).round() as usize; + sorted_latencies_us[idx] + }; + LatencyStats { + samples, + avg_us: (sum / samples as u128) as u64, + p50_us: percentile(0.50), + p95_us: percentile(0.95), + p99_us: percentile(0.99), + max_us: *sorted_latencies_us.last().expect("non-empty"), + } +} + +async fn flush_pending_orders( + db: &AedbInstance, + pending_mutations: &mut Vec, + pending_orders: &mut usize, + pending_started: &mut Option, + collect_latency: bool, + latencies_us: &mut Vec, + accepted: &mut usize, + rejected: &mut usize, + max_commit_seq: &mut u64, + max_finality_gap: &mut u64, +) -> Result<(), AedbError> { + if pending_mutations.is_empty() { + return Ok(()); + } + let started = pending_started.take().unwrap_or_else(Instant::now); + let batch_len = *pending_orders; + let res = db + .commit_many_atomic(std::mem::take(pending_mutations)) + .await; + let elapsed = started.elapsed().as_micros() as u64; + if collect_latency && batch_len > 0 { + let per_order = (elapsed / batch_len as u64).max(1); + latencies_us.extend(std::iter::repeat_n(per_order, batch_len)); + } + *pending_orders = 0; + match res { + Ok(commit) => { + let gap = commit.commit_seq.saturating_sub(commit.durable_head_seq); + *max_finality_gap = (*max_finality_gap).max(gap); + *max_commit_seq = (*max_commit_seq).max(commit.commit_seq); + *accepted += batch_len; + } + Err(err) => match err { + AedbError::Validation(_) => *rejected += batch_len, + other => return Err(other), + }, + } + Ok(()) +} diff --git a/crates/aedb-orderbook/tests/correctness_matrix.rs b/crates/aedb-orderbook/tests/correctness_matrix.rs new file mode 100644 index 0000000..e939236 --- /dev/null +++ b/crates/aedb-orderbook/tests/correctness_matrix.rs @@ -0,0 +1,69 @@ +use aedb_orderbook::{ + MatchWorkload, OrderFlowProfile, SimulationConfig, TableProfile, + high_throughput_simulation_config, 
run_hft_simulation_with_config, +}; + +fn cfg( + assets: Vec<&str>, + traders: usize, + ops_per_trader: usize, + seed: u64, + flow_profile: OrderFlowProfile, + table_profile: TableProfile, +) -> SimulationConfig { + SimulationConfig { + assets: assets.into_iter().map(|s| s.to_string()).collect(), + traders, + ops_per_trader, + seed, + flow_profile, + table_profile, + collect_latency: false, + lifecycle_every_ops: 50, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + } +} + +#[tokio::test] +async fn correctness_matrix_multiple_seeds_and_layouts() { + let scenarios = vec![ + ( + vec!["BTC-USD", "ETH-USD", "SOL-USD"], + OrderFlowProfile::LimitOnlyIoc, + TableProfile::PerAssetTable, + ), + ( + vec!["BTC-USD", "ETH-USD", "SOL-USD", "DOGE-USD"], + OrderFlowProfile::MixedMarketAndLimit, + TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + ), + ]; + + for seed in [3_u64, 7, 11, 19, 23] { + for (assets, flow, table) in &scenarios { + let report = run_hft_simulation_with_config( + cfg(assets.clone(), 8, 300, seed, flow.clone(), table.clone()), + high_throughput_simulation_config(), + ) + .await + .expect("simulation matrix run should succeed"); + + assert!(report.zero_dropped_orders, "no dropped orders allowed"); + assert_eq!( + report.simulation.accepted_orders + report.simulation.rejected_orders, + report.simulation.attempted_orders + ); + assert!( + report.simulation.visible_head_seq >= report.simulation.max_commit_seq, + "visible head must include all accepted commits" + ); + assert!( + report.simulation.durable_head_seq >= report.simulation.max_commit_seq, + "durable head must include all accepted commits after fsync" + ); + } + } +} diff --git a/crates/aedb-orderbook/tests/simulation_smoke.rs b/crates/aedb-orderbook/tests/simulation_smoke.rs new file mode 100644 index 0000000..e2071f2 --- /dev/null +++ b/crates/aedb-orderbook/tests/simulation_smoke.rs @@ -0,0 +1,206 @@ +use aedb_orderbook::{ + MatchWorkload, 
OrderFlowProfile, SimulationConfig, TableProfile, run_hft_simulation, +}; + +fn cfg( + assets: Vec<&str>, + traders: usize, + ops_per_trader: usize, + seed: u64, + flow_profile: OrderFlowProfile, + table_profile: TableProfile, +) -> SimulationConfig { + SimulationConfig { + assets: assets.into_iter().map(|s| s.to_string()).collect(), + traders, + ops_per_trader, + seed, + flow_profile, + table_profile, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + } +} + +#[tokio::test] +async fn simulation_smoke_runs() { + let report = run_hft_simulation(cfg( + vec!["BTC-USD", "ETH-USD"], + 4, + 150, + 7, + OrderFlowProfile::LimitOnlyIoc, + TableProfile::NativeInstrument, + )) + .await + .expect("simulation should run"); + + assert_eq!(report.attempted_orders, 600); + assert_eq!( + report.accepted_orders + report.rejected_orders, + report.attempted_orders + ); + assert!(report.durable_head_seq >= report.max_commit_seq); + assert!(report.visible_head_seq >= report.max_commit_seq); +} + +#[tokio::test] +async fn simulation_high_throughput_per_asset_limit_only() { + let report = run_hft_simulation(cfg( + vec!["BTC-USD", "ETH-USD", "SOL-USD"], + 12, + 700, + 11, + OrderFlowProfile::LimitOnlyIoc, + TableProfile::PerAssetTable, + )) + .await + .expect("high-throughput per-asset limit-only simulation should run"); + + assert_eq!(report.attempted_orders, 8_400); + assert_eq!( + report.accepted_orders + report.rejected_orders, + report.attempted_orders + ); + assert!(report.accepted_orders > 0); + assert!(report.durable_head_seq >= report.max_commit_seq); +} + +#[tokio::test] +async fn simulation_high_throughput_per_asset_mixed_market_limit() { + let report = run_hft_simulation(cfg( + vec!["BTC-USD", "ETH-USD", "SOL-USD"], + 12, + 700, + 13, + OrderFlowProfile::MixedMarketAndLimit, + TableProfile::PerAssetTable, + )) + .await + .expect("high-throughput per-asset mixed simulation should run"); + + 
assert_eq!(report.attempted_orders, 8_400); + assert_eq!( + report.accepted_orders + report.rejected_orders, + report.attempted_orders + ); + assert!(report.rejected_orders > 0); + assert!(report.durable_head_seq >= report.max_commit_seq); +} + +#[tokio::test] +async fn simulation_high_throughput_multi_asset_table_limit_only() { + let report = run_hft_simulation(cfg( + vec!["BTC-USD", "ETH-USD", "SOL-USD", "DOGE-USD"], + 14, + 650, + 17, + OrderFlowProfile::LimitOnlyIoc, + TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + )) + .await + .expect("high-throughput multi-asset table limit-only simulation should run"); + + assert_eq!(report.attempted_orders, 9_100); + assert_eq!( + report.accepted_orders + report.rejected_orders, + report.attempted_orders + ); + assert!(report.durable_head_seq >= report.max_commit_seq); +} + +#[tokio::test] +async fn simulation_high_throughput_multi_asset_table_mixed_market_limit() { + let report = run_hft_simulation(cfg( + vec!["BTC-USD", "ETH-USD", "SOL-USD", "DOGE-USD"], + 14, + 650, + 19, + OrderFlowProfile::MixedMarketAndLimit, + TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + )) + .await + .expect("high-throughput multi-asset table mixed simulation should run"); + + assert_eq!(report.attempted_orders, 9_100); + assert_eq!( + report.accepted_orders + report.rejected_orders, + report.attempted_orders + ); + assert!(report.rejected_orders > 0); + assert!(report.durable_head_seq >= report.max_commit_seq); +} + +#[tokio::test] +async fn simulation_high_contention_single_asset_mixed_market_limit() { + let report = run_hft_simulation(cfg( + vec!["BTC-USD"], + 20, + 1_200, + 23, + OrderFlowProfile::MixedMarketAndLimit, + TableProfile::PerAssetTable, + )) + .await + .expect("high-contention single-asset mixed simulation should run"); + + assert_eq!(report.attempted_orders, 24_000); + assert_eq!( + report.accepted_orders + report.rejected_orders, + report.attempted_orders + ); + 
assert!(report.rejected_orders > 0); + assert!(report.durable_head_seq >= report.max_commit_seq); +} + +#[tokio::test] +#[ignore = "extended soak test"] +async fn simulation_soak_multi_asset_mixed() { + let report = run_hft_simulation(cfg( + vec!["BTC-USD", "ETH-USD", "SOL-USD", "DOGE-USD"], + 24, + 2_000, + 29, + OrderFlowProfile::MixedMarketAndLimit, + TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + )) + .await + .expect("soak simulation should run"); + + assert_eq!(report.attempted_orders, 48_000); + assert_eq!( + report.accepted_orders + report.rejected_orders, + report.attempted_orders + ); + assert!(report.durable_head_seq >= report.max_commit_seq); +} + +#[tokio::test] +#[ignore = "extended contention soak test"] +async fn simulation_soak_single_asset_contention_limit() { + let report = run_hft_simulation(cfg( + vec!["BTC-USD"], + 32, + 2_000, + 31, + OrderFlowProfile::LimitOnlyIoc, + TableProfile::PerAssetTable, + )) + .await + .expect("single-asset soak simulation should run"); + + assert_eq!(report.attempted_orders, 64_000); + assert_eq!( + report.accepted_orders + report.rejected_orders, + report.attempted_orders + ); + assert!(report.durable_head_seq >= report.max_commit_seq); +} diff --git a/src/catalog/mod.rs b/src/catalog/mod.rs index 8a83de1..1f65552 100644 --- a/src/catalog/mod.rs +++ b/src/catalog/mod.rs @@ -1902,7 +1902,12 @@ fn now_micros() -> u64 { } pub fn namespace_key(project_id: &str, scope_id: &str) -> String { - format!("{project_id}{NAMESPACE_KEY_SEPARATOR}{scope_id}") + let mut key = + String::with_capacity(project_id.len() + NAMESPACE_KEY_SEPARATOR.len() + scope_id.len()); + key.push_str(project_id); + key.push_str(NAMESPACE_KEY_SEPARATOR); + key.push_str(scope_id); + key } pub const DEFAULT_SCOPE_ID: &str = "app"; diff --git a/src/commit/apply.rs b/src/commit/apply.rs index e9bb9b8..e337581 100644 --- a/src/commit/apply.rs +++ b/src/commit/apply.rs @@ -7,6 +7,12 @@ use crate::commit::validation::{ }; use 
crate::error::AedbError; use crate::error::ResourceType as ErrorResourceType; +use crate::order_book::{ + apply_order_book_cancel, apply_order_book_cancel_replace, apply_order_book_define_table, + apply_order_book_drop_table, apply_order_book_mass_cancel, apply_order_book_match, + apply_order_book_new, apply_order_book_reduce, apply_set_instrument_config, + apply_set_instrument_halted, u256_from_be, +}; use crate::query::operators::{compile_expr, eval_compiled_expr_public}; use crate::storage::encoded_key::EncodedKey; use crate::storage::index::extract_index_key_encoded; @@ -542,6 +548,145 @@ pub fn apply_mutation( false, commit_seq, )?, + Mutation::OrderBookNew { + project_id, + scope_id, + request, + } => apply_order_book_new(keyspace, &project_id, &scope_id, &request, commit_seq)?, + Mutation::OrderBookCancel { + project_id, + scope_id, + instrument, + order_id, + client_order_id, + owner, + .. + } => apply_order_book_cancel( + keyspace, + &project_id, + &scope_id, + &instrument, + order_id, + client_order_id.as_deref(), + &owner, + commit_seq, + )?, + Mutation::OrderBookCancelReplace { + project_id, + scope_id, + instrument, + order_id, + owner, + new_price_ticks, + new_qty_be, + new_time_in_force, + new_exec_instructions, + } => apply_order_book_cancel_replace( + keyspace, + &project_id, + &scope_id, + &instrument, + order_id, + &owner, + new_price_ticks, + new_qty_be, + new_time_in_force, + new_exec_instructions, + commit_seq, + )?, + Mutation::OrderBookMassCancel { + project_id, + scope_id, + instrument, + owner, + side, + owner_filter, + price_range_ticks, + } => apply_order_book_mass_cancel( + keyspace, + &project_id, + &scope_id, + &instrument, + &owner, + side, + owner_filter.as_deref(), + price_range_ticks, + commit_seq, + )?, + Mutation::OrderBookReduce { + project_id, + scope_id, + instrument, + order_id, + owner, + reduce_by_be, + } => apply_order_book_reduce( + keyspace, + &project_id, + &scope_id, + &instrument, + order_id, + &owner, + 
u256_from_be(reduce_by_be), + commit_seq, + )?, + Mutation::OrderBookMatch { + project_id, + scope_id, + instrument, + fills, + } => apply_order_book_match( + keyspace, + &project_id, + &scope_id, + &instrument, + &fills, + commit_seq, + )?, + Mutation::OrderBookDefineTable { + project_id, + scope_id, + table_id, + mode, + } => apply_order_book_define_table( + keyspace, + &project_id, + &scope_id, + &table_id, + mode, + commit_seq, + )?, + Mutation::OrderBookDropTable { + project_id, + scope_id, + table_id, + } => apply_order_book_drop_table(keyspace, &project_id, &scope_id, &table_id, commit_seq)?, + Mutation::OrderBookSetInstrumentConfig { + project_id, + scope_id, + instrument, + config, + } => apply_set_instrument_config( + keyspace, + &project_id, + &scope_id, + &instrument, + &config, + commit_seq, + )?, + Mutation::OrderBookSetInstrumentHalted { + project_id, + scope_id, + instrument, + halted, + } => apply_set_instrument_halted( + keyspace, + &project_id, + &scope_id, + &instrument, + halted, + commit_seq, + )?, } Ok(()) } diff --git a/src/commit/executor/internals.rs b/src/commit/executor/internals.rs index e180fbe..efbd18b 100644 --- a/src/commit/executor/internals.rs +++ b/src/commit/executor/internals.rs @@ -7,6 +7,8 @@ use primitive_types::U256; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::mpsc as std_mpsc; +const MEMORY_ESTIMATE_INTERVAL_MICROS: u64 = 250_000; + pub(super) fn pre_stage_validate( validation_catalog: &Arc>, envelope: &TransactionEnvelope, @@ -18,16 +20,23 @@ pub(super) fn pre_stage_validate( async move { let catalog = validation_catalog.read().snapshot(); validate_assertions(&catalog, &envelope.assertions)?; - let mut staged_catalog = catalog.clone(); + let mut staged_catalog: Option = None; for mutation in &envelope.write_intent.mutations { - validate_permissions(&staged_catalog, envelope.caller.as_ref(), mutation)?; - validate_mutation(&staged_catalog, mutation)?; + let active_catalog = 
staged_catalog.as_ref().unwrap_or(&catalog); + validate_permissions(active_catalog, envelope.caller.as_ref(), mutation)?; + validate_mutation(active_catalog, mutation)?; if let Mutation::Ddl(ddl) = mutation { - staged_catalog.apply_ddl(ddl.clone())?; + if staged_catalog.is_none() { + staged_catalog = Some(catalog.clone()); + } + if let Some(next) = staged_catalog.as_mut() { + next.apply_ddl(ddl.clone())?; + } } } + let partition_catalog = staged_catalog.as_ref().unwrap_or(&catalog); let write_partitions = derive_write_partitions_with_fk_expansion( - &staged_catalog, + partition_catalog, &envelope.write_intent.mutations, ); let read_partitions = derive_read_partitions(&envelope); @@ -142,6 +151,64 @@ pub(super) fn scope_shard_key(mutations: &[Mutation]) -> String { scope_id, .. } => format!("k:{project_id}:{scope_id}"), + Mutation::OrderBookNew { + project_id, + scope_id, + request, + } => format!("ob:{project_id}:{scope_id}:{}", request.instrument), + Mutation::OrderBookCancel { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookCancelReplace { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookMassCancel { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookReduce { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookMatch { + project_id, + scope_id, + instrument, + .. + } => format!("ob:{project_id}:{scope_id}:{instrument}"), + Mutation::OrderBookDefineTable { + project_id, + scope_id, + table_id, + .. + } + | Mutation::OrderBookDropTable { + project_id, + scope_id, + table_id, + } => format!("obdef:{project_id}:{scope_id}:{table_id}"), + Mutation::OrderBookSetInstrumentConfig { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookSetInstrumentHalted { + project_id, + scope_id, + instrument, + .. 
+ } => format!("obcfg:{project_id}:{scope_id}:{instrument}"), Mutation::Ddl(ddl) => format!("ddl:{ddl:?}"), } } @@ -276,6 +343,7 @@ pub(super) fn process_commit_epoch( let mut deferred_parallel_commits = Vec::new(); let mut deferred_parallel_namespaces = HashSet::new(); let mut next_seq = state.current_seq; + let mut catalog_changed = false; for request in requests { if request.envelope.write_intent.mutations.is_empty() { @@ -495,6 +563,9 @@ pub(super) fn process_commit_epoch( seq: commit_seq, mutations: mutations.clone(), }; + if !catalog_changed && mutations.iter().any(|m| matches!(m, Mutation::Ddl(_))) { + catalog_changed = true; + } sequenced.push(SequencedCommit { request, seq: commit_seq, @@ -517,6 +588,7 @@ pub(super) fn process_commit_epoch( coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + catalog_changed, }; } @@ -549,6 +621,7 @@ pub(super) fn process_commit_epoch( coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + catalog_changed, }; } @@ -603,6 +676,7 @@ pub(super) fn process_commit_epoch( coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + catalog_changed, }; } } @@ -627,6 +701,7 @@ pub(super) fn process_commit_epoch( coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + catalog_changed, }; } @@ -690,13 +765,19 @@ pub(super) fn process_commit_epoch( state.last_full_snapshot_micros = now_micros(); } - let mem_estimate = state.keyspace.estimate_memory_bytes(); - if mem_estimate > state.config.max_memory_estimate_bytes { - warn!( - mem_estimate, - max_memory_estimate_bytes = state.config.max_memory_estimate_bytes, - "aedb memory estimate exceeded threshold" - ); + let now_micros = now_micros(); + if now_micros.saturating_sub(state.last_memory_estimate_micros) + >= MEMORY_ESTIMATE_INTERVAL_MICROS + { + state.last_memory_estimate_micros = now_micros; + let mem_estimate = state.keyspace.estimate_memory_bytes(); + if mem_estimate > 
state.config.max_memory_estimate_bytes { + warn!( + mem_estimate, + max_memory_estimate_bytes = state.config.max_memory_estimate_bytes, + "aedb memory estimate exceeded threshold" + ); + } } if state.wal.should_rotate().is_some() { let _ = state @@ -721,6 +802,7 @@ pub(super) fn process_commit_epoch( coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + catalog_changed, } } @@ -1029,6 +1111,46 @@ pub(super) fn namespace_id_for_parallel_mutation(mutation: &Mutation) -> Option< project_id, scope_id, .. + } + | Mutation::OrderBookNew { + project_id, + scope_id, + .. + } + | Mutation::OrderBookCancel { + project_id, + scope_id, + .. + } + | Mutation::OrderBookCancelReplace { + project_id, + scope_id, + .. + } + | Mutation::OrderBookMassCancel { + project_id, + scope_id, + .. + } + | Mutation::OrderBookReduce { + project_id, + scope_id, + .. + } + | Mutation::OrderBookMatch { + project_id, + scope_id, + .. + } + | Mutation::OrderBookDefineTable { + project_id, + scope_id, + .. + } + | Mutation::OrderBookDropTable { + project_id, + scope_id, + .. } => Some(NamespaceId::project_scope(project_id, scope_id)), _ => None, } @@ -1039,7 +1161,15 @@ pub(super) fn is_parallel_mutation_safe(catalog: &Catalog, mutation: &Mutation) Mutation::KvSet { .. } | Mutation::KvDel { .. } | Mutation::KvIncU256 { .. } - | Mutation::KvDecU256 { .. } => true, + | Mutation::KvDecU256 { .. } + | Mutation::OrderBookNew { .. } + | Mutation::OrderBookCancel { .. } + | Mutation::OrderBookCancelReplace { .. } + | Mutation::OrderBookMassCancel { .. } + | Mutation::OrderBookReduce { .. } + | Mutation::OrderBookMatch { .. } + | Mutation::OrderBookDefineTable { .. } + | Mutation::OrderBookDropTable { .. } => true, Mutation::Insert { project_id, scope_id, @@ -1611,6 +1741,14 @@ pub(super) fn payload_type_for_mutations(mutations: &[Mutation]) -> u8 { | Mutation::KvDel { .. } | Mutation::KvIncU256 { .. } | Mutation::KvDecU256 { .. } + | Mutation::OrderBookNew { .. 
} + | Mutation::OrderBookCancel { .. } + | Mutation::OrderBookCancelReplace { .. } + | Mutation::OrderBookMassCancel { .. } + | Mutation::OrderBookReduce { .. } + | Mutation::OrderBookMatch { .. } + | Mutation::OrderBookDefineTable { .. } + | Mutation::OrderBookDropTable { .. } ) }); if all_kv { 0x04 } else { 0x01 } @@ -1727,6 +1865,76 @@ pub(super) fn derive_write_partitions_with_fk_expansion( let ns = namespace_key(project_id, scope_id); out.insert(kv_key_partition_token(&ns, key)); } + Mutation::OrderBookNew { + project_id, + scope_id, + request, + } => { + let ns = namespace_key(project_id, scope_id); + out.insert(order_book_partition_token(&ns, &request.instrument)); + } + Mutation::OrderBookCancel { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookCancelReplace { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookMassCancel { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookReduce { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookMatch { + project_id, + scope_id, + instrument, + .. + } => { + let ns = namespace_key(project_id, scope_id); + out.insert(order_book_partition_token(&ns, instrument)); + } + Mutation::OrderBookDefineTable { + project_id, + scope_id, + table_id, + .. + } + | Mutation::OrderBookDropTable { + project_id, + scope_id, + table_id, + } => { + let ns = namespace_key(project_id, scope_id); + out.insert(order_book_meta_partition_token(&ns, "table", table_id)); + } + Mutation::OrderBookSetInstrumentConfig { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookSetInstrumentHalted { + project_id, + scope_id, + instrument, + .. 
+ } => { + let ns = namespace_key(project_id, scope_id); + out.insert(order_book_meta_partition_token(&ns, "cfg", instrument)); + } Mutation::Ddl(_) => { out.insert(GLOBAL_PARTITION_TOKEN.to_string()); } @@ -1749,11 +1957,28 @@ pub(super) fn derive_write_partitions_with_fk_expansion( out } -#[cfg(test)] pub(super) fn derive_write_partitions(mutations: &[Mutation]) -> HashSet { derive_write_partitions_with_fk_expansion(&Catalog::default(), mutations) } +pub(super) fn mutation_requires_fk_expansion(mutation: &Mutation) -> bool { + matches!( + mutation, + Mutation::Insert { .. } + | Mutation::InsertBatch { .. } + | Mutation::Upsert { .. } + | Mutation::UpsertBatch { .. } + | Mutation::UpsertOnConflict { .. } + | Mutation::UpsertBatchOnConflict { .. } + | Mutation::Delete { .. } + | Mutation::DeleteWhere { .. } + | Mutation::UpdateWhere { .. } + | Mutation::UpdateWhereExpr { .. } + | Mutation::TableIncU256 { .. } + | Mutation::TableDecU256 { .. } + ) +} + pub(super) fn derive_read_partitions(envelope: &TransactionEnvelope) -> HashSet { let mut out = HashSet::new(); for entry in &envelope.read_set.points { @@ -1908,6 +2133,14 @@ fn kv_namespace_partition_token(namespace: &str) -> String { format!("kns:{namespace}") } +fn order_book_partition_token(namespace: &str, instrument: &str) -> String { + format!("ob:{namespace}:{instrument}") +} + +fn order_book_meta_partition_token(namespace: &str, kind: &str, id: &str) -> String { + format!("obm:{namespace}:{kind}:{id}") +} + fn is_cross_partition_write_set(write_set: &HashSet) -> bool { if write_set.contains(GLOBAL_PARTITION_TOKEN) { return true; @@ -1942,6 +2175,16 @@ fn token_namespace(token: &str) -> Option<&str> { { return Some(ns); } + if let Some(rest) = token.strip_prefix("ob:") + && let Some((ns, _instrument)) = rest.rsplit_once(':') + { + return Some(ns); + } + if let Some(rest) = token.strip_prefix("obm:") + && let Some((ns, _tail)) = rest.split_once(':') + { + return Some(ns); + } None } diff --git 
a/src/commit/executor/mod.rs b/src/commit/executor/mod.rs index 6a06695..e10f4ce 100644 --- a/src/commit/executor/mod.rs +++ b/src/commit/executor/mod.rs @@ -97,6 +97,7 @@ struct EpochProcessResult { coordinator_apply_attempts: u64, coordinator_apply_micros: u64, read_set_conflicts: u64, + catalog_changed: bool, } struct ExecutorState { @@ -116,6 +117,7 @@ struct ExecutorState { idempotency: HashMap, version_store: VersionStore, last_full_snapshot_micros: u64, + last_memory_estimate_micros: u64, } #[derive(Clone)] @@ -244,6 +246,7 @@ impl CommitExecutor { idempotency, version_store, last_full_snapshot_micros: now_micros(), + last_memory_estimate_micros: 0, })); let (apply_tx, mut rx) = tokio_mpsc::channel::(max_inflight_commits); let queued_bytes = Arc::new(AtomicUsize::new(0)); @@ -323,6 +326,7 @@ impl CommitExecutor { let mut s = loop_state.lock().await; let epoch_started = Instant::now(); let epoch_result = process_commit_epoch(&mut s, epoch_requests); + let catalog_changed = epoch_result.catalog_changed; let outcomes = epoch_result.outcomes; let had_error = outcomes.iter().any(|o| o.result.is_err()); s.adaptive_epoch.observe_epoch( @@ -359,7 +363,9 @@ impl CommitExecutor { .epoch_failures .fetch_add(1, Ordering::Relaxed); } - *loop_validation_catalog.write() = s.catalog.clone(); + if catalog_changed { + *loop_validation_catalog.write() = s.catalog.clone(); + } drop(s); for outcome in outcomes { @@ -433,14 +439,22 @@ impl CommitExecutor { let handle = tokio::spawn(async move { while let Some(mut req) = ingress_rx.recv().await { let (write_partitions, read_partitions) = if req.prevalidated { - let catalog = pre_validation_catalog.read().snapshot(); - ( + let write_partitions = if req + .envelope + .write_intent + .mutations + .iter() + .any(mutation_requires_fk_expansion) + { + let catalog = pre_validation_catalog.read().snapshot(); derive_write_partitions_with_fk_expansion( &catalog, &req.envelope.write_intent.mutations, - ), - 
derive_read_partitions(&req.envelope), - ) + ) + } else { + derive_write_partitions(&req.envelope.write_intent.mutations) + }; + (write_partitions, derive_read_partitions(&req.envelope)) } else { match pre_stage_validate(&pre_validation_catalog, &req.envelope).await { Ok(partitions) => partitions, @@ -535,12 +549,33 @@ impl CommitExecutor { self.submit_as(None, mutation).await } + pub(crate) async fn submit_prevalidated( + &self, + mutation: Mutation, + ) -> Result { + self.submit_envelope_with_mode( + TransactionEnvelope { + caller: None, + idempotency_key: None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: Default::default(), + write_intent: WriteIntent { + mutations: vec![mutation], + }, + base_seq: 0, + }, + true, + false, + ) + .await + } + pub async fn submit_as( &self, caller: Option, mutation: Mutation, ) -> Result { - let base_seq = self.current_seq().await; self.submit_envelope(TransactionEnvelope { caller, idempotency_key: None, @@ -550,7 +585,9 @@ impl CommitExecutor { write_intent: WriteIntent { mutations: vec![mutation], }, - base_seq, + // No read set/assertions in single-mutation submit path. + // A fixed base_seq avoids an extra state lock on the hot write path. 
+ base_seq: 0, }) .await } @@ -706,12 +743,23 @@ impl CommitExecutor { pub async fn snapshot_state(&self) -> (KeyspaceSnapshot, Catalog, u64) { let mut state = self.state.lock().await; - let view = state - .version_store - .acquire_latest() - .expect("version store should always have a latest view") - .into_view(); - ((*view.keyspace).clone(), (*view.catalog).clone(), view.seq) + match state.version_store.acquire_latest() { + Ok(view) => { + let view = view.into_view(); + ((*view.keyspace).clone(), (*view.catalog).clone(), view.seq) + } + Err(err) => { + warn!( + error = ?err, + "version store latest view unavailable; falling back to executor state snapshot" + ); + ( + state.keyspace.snapshot(), + state.catalog.snapshot(), + state.visible_head_seq, + ) + } + } } pub async fn snapshot_at_seq(&self, seq: u64) -> Result { diff --git a/src/commit/validation.rs b/src/commit/validation.rs index bf0fb92..dcd352b 100644 --- a/src/commit/validation.rs +++ b/src/commit/validation.rs @@ -4,6 +4,10 @@ use crate::catalog::{Catalog, DdlOperation, KV_INDEX_TABLE, ResourceType, namesp use crate::config::AedbConfig; use crate::error::AedbError; use crate::error::ResourceType as ErrorResourceType; +use crate::order_book::{ + ExecInstruction, FillSpec, InstrumentConfig, OrderBookTableMode, OrderRequest, OrderSide, + TimeInForce, +}; use crate::permission::{CallerContext, Permission}; use crate::query::plan::Expr; use primitive_types::U256; @@ -166,6 +170,76 @@ pub enum Mutation { column: String, amount_be: [u8; 32], }, + OrderBookNew { + project_id: String, + scope_id: String, + request: OrderRequest, + }, + OrderBookCancel { + project_id: String, + scope_id: String, + instrument: String, + order_id: u64, + client_order_id: Option, + owner: String, + }, + OrderBookCancelReplace { + project_id: String, + scope_id: String, + instrument: String, + order_id: u64, + owner: String, + new_price_ticks: Option, + new_qty_be: Option<[u8; 32]>, + new_time_in_force: Option, + 
new_exec_instructions: Option, + }, + OrderBookMassCancel { + project_id: String, + scope_id: String, + instrument: String, + owner: String, + side: Option, + owner_filter: Option, + price_range_ticks: Option<(i64, i64)>, + }, + OrderBookReduce { + project_id: String, + scope_id: String, + instrument: String, + order_id: u64, + owner: String, + reduce_by_be: [u8; 32], + }, + OrderBookMatch { + project_id: String, + scope_id: String, + instrument: String, + fills: Vec, + }, + OrderBookDefineTable { + project_id: String, + scope_id: String, + table_id: String, + mode: OrderBookTableMode, + }, + OrderBookDropTable { + project_id: String, + scope_id: String, + table_id: String, + }, + OrderBookSetInstrumentConfig { + project_id: String, + scope_id: String, + instrument: String, + config: InstrumentConfig, + }, + OrderBookSetInstrumentHalted { + project_id: String, + scope_id: String, + instrument: String, + halted: bool, + }, } /// Early validation of KV mutation sizes to prevent DoS via oversized keys/values. @@ -461,6 +535,137 @@ pub fn validate_mutation_with_config( primary_key, column, ), + Mutation::OrderBookNew { request, .. 
} => { + if request.instrument.trim().is_empty() { + return Err(AedbError::Validation("instrument cannot be empty".into())); + } + if request.client_order_id.trim().is_empty() { + return Err(AedbError::Validation( + "client_order_id cannot be empty".into(), + )); + } + if request.owner.trim().is_empty() { + return Err(AedbError::Validation("owner cannot be empty".into())); + } + if primitive_types::U256::from_big_endian(&request.qty_be).is_zero() { + return Err(AedbError::Validation("qty must be > 0".into())); + } + if request.exec_instructions.post_only() + && !matches!(request.order_type, crate::order_book::OrderType::Limit) + { + return Err(AedbError::Validation( + "post_only requires limit order".into(), + )); + } + if request.exec_instructions.post_only() + && matches!(request.time_in_force, TimeInForce::Fok) + { + return Err(AedbError::Validation( + "post_only cannot be combined with FOK".into(), + )); + } + Ok(()) + } + Mutation::OrderBookCancel { + instrument, owner, .. + } => { + if instrument.trim().is_empty() || owner.trim().is_empty() { + return Err(AedbError::Validation( + "instrument and owner cannot be empty".into(), + )); + } + Ok(()) + } + Mutation::OrderBookCancelReplace { + instrument, owner, .. + } => { + if instrument.trim().is_empty() || owner.trim().is_empty() { + return Err(AedbError::Validation( + "instrument and owner cannot be empty".into(), + )); + } + Ok(()) + } + Mutation::OrderBookMassCancel { + instrument, + owner, + price_range_ticks, + .. + } => { + if instrument.trim().is_empty() || owner.trim().is_empty() { + return Err(AedbError::Validation( + "instrument and owner cannot be empty".into(), + )); + } + if let Some((min_price, max_price)) = price_range_ticks + && min_price > max_price + { + return Err(AedbError::Validation("invalid price range".into())); + } + Ok(()) + } + Mutation::OrderBookReduce { + instrument, + owner, + reduce_by_be, + .. 
+ } => { + if instrument.trim().is_empty() || owner.trim().is_empty() { + return Err(AedbError::Validation( + "instrument and owner cannot be empty".into(), + )); + } + if primitive_types::U256::from_big_endian(reduce_by_be).is_zero() { + return Err(AedbError::Validation("reduce_by must be > 0".into())); + } + Ok(()) + } + Mutation::OrderBookMatch { + instrument, fills, .. + } => { + if instrument.trim().is_empty() { + return Err(AedbError::Validation("instrument cannot be empty".into())); + } + if fills.is_empty() { + return Err(AedbError::Validation("fills cannot be empty".into())); + } + if fills + .iter() + .any(|fill| primitive_types::U256::from_big_endian(&fill.qty_be).is_zero()) + { + return Err(AedbError::Validation("fill qty must be > 0".into())); + } + Ok(()) + } + Mutation::OrderBookDefineTable { table_id, .. } + | Mutation::OrderBookDropTable { table_id, .. } => { + if table_id.trim().is_empty() { + return Err(AedbError::Validation("table_id cannot be empty".into())); + } + Ok(()) + } + Mutation::OrderBookSetInstrumentConfig { + instrument, config, .. + } => { + if instrument.trim().is_empty() { + return Err(AedbError::Validation("instrument cannot be empty".into())); + } + if config.instrument != *instrument { + return Err(AedbError::Validation( + "instrument config instrument mismatch".into(), + )); + } + if primitive_types::U256::from_big_endian(&config.lot_size_be).is_zero() { + return Err(AedbError::Validation("lot_size must be > 0".into())); + } + Ok(()) + } + Mutation::OrderBookSetInstrumentHalted { instrument, .. } => { + if instrument.trim().is_empty() { + return Err(AedbError::Validation("instrument cannot be empty".into())); + } + Ok(()) + } } } @@ -607,6 +812,35 @@ pub fn validate_permissions( let Some(caller) = caller else { return Ok(()); }; + if matches!(mutation, Mutation::OrderBookMatch { .. 
}) { + if caller.is_internal_system() { + return Ok(()); + } + return Err(AedbError::PermissionDenied( + "OrderBookMatch is system-only".into(), + )); + } + let is_admin = catalog.has_permission(&caller.caller_id, &Permission::GlobalAdmin); + match mutation { + Mutation::OrderBookNew { request, .. } => { + if !is_admin && request.owner != caller.caller_id { + return Err(AedbError::PermissionDenied( + "order owner must match caller".into(), + )); + } + } + Mutation::OrderBookCancel { owner, .. } + | Mutation::OrderBookCancelReplace { owner, .. } + | Mutation::OrderBookReduce { owner, .. } + | Mutation::OrderBookMassCancel { owner, .. } => { + if !is_admin && owner != &caller.caller_id { + return Err(AedbError::PermissionDenied( + "order owner must match caller".into(), + )); + } + } + _ => {} + } if let Some((project_id, scope_id, key)) = kv_write_target(mutation) { if catalog.has_kv_write_permission(&caller.caller_id, project_id, scope_id, key) { return Ok(()); @@ -703,6 +937,77 @@ fn kv_write_target(mutation: &Mutation) -> Option<(&str, &str, &[u8])> { key, .. } => Some((project_id.as_str(), scope_id.as_str(), key.as_slice())), + Mutation::OrderBookNew { + project_id, + scope_id, + request, + } => Some(( + project_id.as_str(), + scope_id.as_str(), + request.instrument.as_bytes(), + )), + Mutation::OrderBookCancel { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookCancelReplace { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookMassCancel { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookReduce { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookMatch { + project_id, + scope_id, + instrument, + .. + } => Some(( + project_id.as_str(), + scope_id.as_str(), + instrument.as_bytes(), + )), + Mutation::OrderBookDefineTable { + project_id, + scope_id, + table_id, + .. + } + | Mutation::OrderBookDropTable { + project_id, + scope_id, + table_id, + .. 
+ } => Some((project_id.as_str(), scope_id.as_str(), table_id.as_bytes())), + Mutation::OrderBookSetInstrumentConfig { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookSetInstrumentHalted { + project_id, + scope_id, + instrument, + .. + } => Some(( + project_id.as_str(), + scope_id.as_str(), + instrument.as_bytes(), + )), _ => None, } } @@ -857,6 +1162,56 @@ pub fn required_permission(mutation: &Mutation) -> Result project_id, scope_id, .. + } + | Mutation::OrderBookNew { + project_id, + scope_id, + .. + } + | Mutation::OrderBookCancel { + project_id, + scope_id, + .. + } + | Mutation::OrderBookCancelReplace { + project_id, + scope_id, + .. + } + | Mutation::OrderBookMassCancel { + project_id, + scope_id, + .. + } + | Mutation::OrderBookReduce { + project_id, + scope_id, + .. + } + | Mutation::OrderBookMatch { + project_id, + scope_id, + .. + } + | Mutation::OrderBookDefineTable { + project_id, + scope_id, + .. + } + | Mutation::OrderBookDropTable { + project_id, + scope_id, + .. + } + | Mutation::OrderBookSetInstrumentConfig { + project_id, + scope_id, + .. + } + | Mutation::OrderBookSetInstrumentHalted { + project_id, + scope_id, + .. 
} => Ok(Permission::KvWrite { project_id: project_id.clone(), scope_id: Some(scope_id.clone()), diff --git a/src/lib.rs b/src/lib.rs index c0c2833..7cfec7f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -11,6 +11,7 @@ mod lib_tests; pub mod manifest; pub mod migration; pub mod offline; +pub mod order_book; pub mod permission; pub mod preflight; pub mod query; @@ -44,6 +45,11 @@ use crate::manifest::schema::{Manifest, SegmentMeta}; use crate::migration::{ Migration, MigrationRecord, checksum_hex, decode_record, encode_record, migration_key, }; +use crate::order_book::{ + ExecInstruction, FillSpec, InstrumentConfig, OrderBookDepth, OrderBookTableMode, OrderRecord, + OrderRequest, OrderSide, Spread, TimeInForce, read_last_execution_report, read_open_orders, + read_order_status, read_recent_trades, read_spread, read_top_n, scoped_instrument, +}; use crate::permission::{CallerContext, Permission}; use crate::preflight::{PreflightResult, preflight, preflight_plan}; use crate::query::error::QueryError; @@ -699,6 +705,27 @@ impl AedbInstance { Ok(result) } + async fn commit_prevalidated_internal( + &self, + op_name: &'static str, + mutation: Mutation, + ) -> Result { + let started = Instant::now(); + if self.require_authenticated_calls { + return Err(AedbError::PermissionDenied( + "authenticated caller required; use commit_as in secure mode".into(), + )); + } + crate::commit::validation::validate_kv_sizes_early(&mutation, &self._config)?; + if self.checkpoint_in_progress.load(Ordering::Acquire) { + return Err(AedbError::CheckpointInProgress); + } + let _permit = self.acquire_checkpoint_permit().await?; + let result = self.executor.submit_prevalidated(mutation).await; + self.emit_commit_telemetry(op_name, started, &result); + result + } + pub async fn commit_with_finality( &self, mutation: Mutation, @@ -2740,6 +2767,487 @@ impl AedbInstance { .await } + pub async fn order_book_new( + &self, + project_id: &str, + scope_id: &str, + request: OrderRequest, + ) -> Result { + 
self.commit_prevalidated_internal( + "order_book_new", + Mutation::OrderBookNew { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + request, + }, + ) + .await + } + + pub async fn order_book_define_table( + &self, + project_id: &str, + scope_id: &str, + table_id: &str, + mode: OrderBookTableMode, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_define_table", + Mutation::OrderBookDefineTable { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + table_id: table_id.to_string(), + mode, + }, + ) + .await + } + + pub async fn order_book_drop_table( + &self, + project_id: &str, + scope_id: &str, + table_id: &str, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_drop_table", + Mutation::OrderBookDropTable { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + table_id: table_id.to_string(), + }, + ) + .await + } + + pub async fn order_book_set_instrument_config( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + config: InstrumentConfig, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_set_instrument_config", + Mutation::OrderBookSetInstrumentConfig { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + config, + }, + ) + .await + } + + pub async fn order_book_set_instrument_halted( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + halted: bool, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_set_instrument_halted", + Mutation::OrderBookSetInstrumentHalted { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + halted, + }, + ) + .await + } + + pub async fn order_book_new_in_table( + &self, + project_id: &str, + scope_id: &str, + table_id: &str, + asset_id: &str, + mut request: OrderRequest, + ) -> Result { + request.instrument = scoped_instrument(table_id, asset_id); + 
self.order_book_new(project_id, scope_id, request).await + } + + pub async fn order_book_new_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + request: OrderRequest, + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookNew { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + request, + }, + ) + .await + } + + pub async fn order_book_cancel( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_cancel", + Mutation::OrderBookCancel { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + client_order_id: None, + owner: owner.to_string(), + }, + ) + .await + } + + pub async fn order_book_cancel_by_client_id( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + client_order_id: &str, + owner: &str, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_cancel_by_client_id", + Mutation::OrderBookCancel { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id: 0, + client_order_id: Some(client_order_id.to_string()), + owner: owner.to_string(), + }, + ) + .await + } + + pub async fn order_book_cancel_replace( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + new_price_ticks: Option, + new_qty_be: Option<[u8; 32]>, + new_time_in_force: Option, + new_exec_instructions: Option, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_cancel_replace", + Mutation::OrderBookCancelReplace { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + owner: owner.to_string(), + new_price_ticks, + new_qty_be, + new_time_in_force, + new_exec_instructions, + }, + ) + .await + } + + pub async fn order_book_mass_cancel( + &self, + 
project_id: &str, + scope_id: &str, + instrument: &str, + owner: &str, + side: Option, + owner_filter: Option, + price_range_ticks: Option<(i64, i64)>, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_mass_cancel", + Mutation::OrderBookMassCancel { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + owner: owner.to_string(), + side, + owner_filter, + price_range_ticks, + }, + ) + .await + } + + pub async fn order_book_reduce( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + reduce_by_be: [u8; 32], + ) -> Result { + self.commit_prevalidated_internal( + "order_book_reduce", + Mutation::OrderBookReduce { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + owner: owner.to_string(), + reduce_by_be, + }, + ) + .await + } + + pub async fn order_book_match_internal( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + fills: Vec, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_match_internal", + Mutation::OrderBookMatch { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + fills, + }, + ) + .await + } + + pub async fn order_book_top_n( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + depth: u32, + consistency: ConsistencyMode, + caller: &CallerContext, + ) -> Result { + ensure_query_caller_allowed(caller)?; + let lease = self + .acquire_snapshot(consistency) + .await + .map_err(QueryError::from)?; + let prefix = format!("ob:{instrument}:"); + if !lease.view.catalog.has_kv_read_permission( + &caller.caller_id, + project_id, + scope_id, + prefix.as_bytes(), + ) { + return Err(QueryError::PermissionDenied { + permission: format!("KvRead({project_id}.{scope_id})"), + scope: caller.caller_id.clone(), + }); + } + read_top_n( + &lease.view.keyspace, + project_id, + 
scope_id, + instrument, + depth as usize, + lease.view.seq, + ) + .map_err(QueryError::from) + } + + pub async fn order_status( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + consistency: ConsistencyMode, + caller: &CallerContext, + ) -> Result, QueryError> { + ensure_query_caller_allowed(caller)?; + let lease = self + .acquire_snapshot(consistency) + .await + .map_err(QueryError::from)?; + let prefix = format!("ob:{instrument}:"); + if !lease.view.catalog.has_kv_read_permission( + &caller.caller_id, + project_id, + scope_id, + prefix.as_bytes(), + ) { + return Err(QueryError::PermissionDenied { + permission: format!("KvRead({project_id}.{scope_id})"), + scope: caller.caller_id.clone(), + }); + } + let order = read_order_status( + &lease.view.keyspace, + project_id, + scope_id, + instrument, + order_id, + ) + .map_err(QueryError::from)?; + if let Some(order) = &order { + let admin = lease + .view + .catalog + .has_permission(&caller.caller_id, &Permission::GlobalAdmin); + if !admin && order.owner != caller.caller_id { + return Err(QueryError::PermissionDenied { + permission: "order_status(owner match)".into(), + scope: caller.caller_id.clone(), + }); + } + } + Ok(order) + } + + pub async fn open_orders( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + owner: &str, + consistency: ConsistencyMode, + caller: &CallerContext, + ) -> Result, QueryError> { + ensure_query_caller_allowed(caller)?; + let lease = self + .acquire_snapshot(consistency) + .await + .map_err(QueryError::from)?; + let admin = lease + .view + .catalog + .has_permission(&caller.caller_id, &Permission::GlobalAdmin); + if !admin && owner != caller.caller_id { + return Err(QueryError::PermissionDenied { + permission: "open_orders(owner match)".into(), + scope: caller.caller_id.clone(), + }); + } + read_open_orders( + &lease.view.keyspace, + project_id, + scope_id, + instrument, + owner, + ) + .map_err(QueryError::from) + } + + pub async fn 
recent_trades( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + limit: u32, + consistency: ConsistencyMode, + caller: &CallerContext, + ) -> Result, QueryError> { + ensure_query_caller_allowed(caller)?; + let lease = self + .acquire_snapshot(consistency) + .await + .map_err(QueryError::from)?; + let prefix = format!("ob:{instrument}:"); + if !lease.view.catalog.has_kv_read_permission( + &caller.caller_id, + project_id, + scope_id, + prefix.as_bytes(), + ) { + return Err(QueryError::PermissionDenied { + permission: format!("KvRead({project_id}.{scope_id})"), + scope: caller.caller_id.clone(), + }); + } + read_recent_trades( + &lease.view.keyspace, + project_id, + scope_id, + instrument, + limit as usize, + ) + .map_err(QueryError::from) + } + + pub async fn spread( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + consistency: ConsistencyMode, + caller: &CallerContext, + ) -> Result { + ensure_query_caller_allowed(caller)?; + let lease = self + .acquire_snapshot(consistency) + .await + .map_err(QueryError::from)?; + let prefix = format!("ob:{instrument}:"); + if !lease.view.catalog.has_kv_read_permission( + &caller.caller_id, + project_id, + scope_id, + prefix.as_bytes(), + ) { + return Err(QueryError::PermissionDenied { + permission: format!("KvRead({project_id}.{scope_id})"), + scope: caller.caller_id.clone(), + }); + } + read_spread( + &lease.view.keyspace, + project_id, + scope_id, + instrument, + lease.view.seq, + ) + .map_err(QueryError::from) + } + + pub async fn order_book_last_execution_report( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + consistency: ConsistencyMode, + caller: &CallerContext, + ) -> Result, QueryError> { + ensure_query_caller_allowed(caller)?; + let lease = self + .acquire_snapshot(consistency) + .await + .map_err(QueryError::from)?; + let prefix = format!("ob:{instrument}:"); + if !lease.view.catalog.has_kv_read_permission( + &caller.caller_id, + project_id, + scope_id, + 
prefix.as_bytes(), + ) { + return Err(QueryError::PermissionDenied { + permission: format!("KvRead({project_id}.{scope_id})"), + scope: caller.caller_id.clone(), + }); + } + read_last_execution_report(&lease.view.keyspace, project_id, scope_id, instrument) + .map_err(QueryError::from) + } + pub async fn compare_and_swap( &self, project_id: &str, diff --git a/src/order_book.rs b/src/order_book.rs new file mode 100644 index 0000000..9c0c24d --- /dev/null +++ b/src/order_book.rs @@ -0,0 +1,2164 @@ +use crate::error::AedbError; +use crate::storage::keyspace::{Keyspace, KeyspaceSnapshot}; +use primitive_types::U256; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)] +#[repr(u8)] +pub enum OrderSide { + Bid = 0, + Ask = 1, +} + +impl OrderSide { + pub fn as_u8(self) -> u8 { + match self { + Self::Bid => 0, + Self::Ask => 1, + } + } + + pub fn opposite(self) -> Self { + match self { + Self::Bid => Self::Ask, + Self::Ask => Self::Bid, + } + } +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[repr(u8)] +pub enum OrderType { + Limit = 0, + Market = 1, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[repr(u8)] +pub enum TimeInForce { + Gtc = 0, + Ioc = 1, + Fok = 2, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[repr(u8)] +pub enum SelfTradePrevention { + None = 0, + CancelResting = 1, + CancelAggressor = 2, + CancelBoth = 3, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[repr(u8)] +pub enum OrderStatus { + Open = 0, + PartiallyFilled = 1, + Filled = 2, + Cancelled = 3, + Rejected = 4, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +#[repr(u8)] +pub enum OrderBookTableMode { + PerAsset = 0, + MultiAsset = 1, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct OrderBookTableSpec { + pub table_id: String, + pub mode: OrderBookTableMode, +} + 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct BalanceConfig { + pub base_balance_key: String, + pub quote_balance_key: String, + pub escrow_on_place: bool, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct InstrumentConfig { + pub instrument: String, + pub tick_size: i64, + pub lot_size_be: [u8; 32], + pub min_price_ticks: i64, + pub max_price_ticks: i64, + pub market_order_price_band: Option, + pub halted: bool, + pub balance_config: Option, +} + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] +pub struct ExecInstruction(pub u16); + +impl ExecInstruction { + pub const POST_ONLY: u16 = 0b0000_0001; + + pub fn post_only(self) -> bool { + (self.0 & Self::POST_ONLY) != 0 + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct OrderRequest { + pub instrument: String, + pub client_order_id: String, + pub side: OrderSide, + pub order_type: OrderType, + pub time_in_force: TimeInForce, + pub exec_instructions: ExecInstruction, + pub self_trade_prevention: SelfTradePrevention, + pub price_ticks: i64, + pub qty_be: [u8; 32], + pub owner: String, + pub account: Option, + pub nonce: u64, + pub price_limit_ticks: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct FillSpec { + pub aggressor_order_id: u64, + pub passive_order_id: u64, + pub price_ticks: i64, + pub qty_be: [u8; 32], +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct FillRecord { + pub fill_id: u64, + pub instrument: String, + pub price_ticks: i64, + pub qty_be: [u8; 32], + pub aggressor_order_id: u64, + pub aggressor_owner: String, + pub aggressor_side: OrderSide, + pub passive_order_id: u64, + pub passive_owner: String, + pub seq: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum RejectReason { + InsufficientBalance, + PostOnlyWouldCross, + FokCannotFill, + SelfTradeBlocked, + InvalidPrice, + 
InvalidQuantity, + DuplicateClientOrderId, + NonceTooLow, + InstrumentNotFound, + InstrumentHalted, + MarketOrderNoLiquidity, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct ExecutionReport { + pub order_id: u64, + pub client_order_id: String, + pub status: OrderStatus, + pub fills: Vec, + pub remaining_qty_be: [u8; 32], + pub total_filled_qty_be: [u8; 32], + pub avg_price_ticks: Option, + pub reject_reason: Option, + pub seq: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct OrderRecord { + pub order_id: u64, + pub instrument: String, + pub client_order_id: String, + pub owner: String, + pub account: Option, + pub side: OrderSide, + pub order_type: OrderType, + pub time_in_force: TimeInForce, + pub exec_instructions: ExecInstruction, + pub self_trade_prevention: SelfTradePrevention, + pub price_ticks: i64, + pub original_qty_be: [u8; 32], + pub remaining_qty_be: [u8; 32], + pub filled_qty_be: [u8; 32], + pub status: OrderStatus, + pub placed_seq: u64, + pub last_modified_seq: u64, + pub nonce: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PriceLevel { + pub price_ticks: i64, + pub total_qty_be: [u8; 32], + pub order_count: u32, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct OrderBookDepth { + pub instrument: String, + pub bids: Vec, + pub asks: Vec, + pub seq: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Spread { + pub best_bid: Option, + pub best_ask: Option, + pub mid: Option, + pub seq: u64, +} + +pub fn u256_from_be(bytes: [u8; 32]) -> U256 { + U256::from_big_endian(&bytes) +} + +pub fn u256_to_be(v: U256) -> [u8; 32] { + let mut out = [0u8; 32]; + v.to_big_endian(&mut out); + out +} + +pub fn encode_i64_ordered(v: i64) -> [u8; 8] { + ((v as u64) ^ (1u64 << 63)).to_be_bytes() +} + +pub fn decode_i64_ordered(bytes: [u8; 8]) -> i64 { + (u64::from_be_bytes(bytes) ^ (1u64 << 63)) as i64 +} + +fn encode_price_component(side: OrderSide, price_ticks: i64) 
-> [u8; 8] { + let mut p = encode_i64_ordered(price_ticks); + if matches!(side, OrderSide::Bid) { + for b in &mut p { + *b = !*b; + } + } + p +} + +fn decode_price_component(side: OrderSide, mut enc: [u8; 8]) -> i64 { + if matches!(side, OrderSide::Bid) { + for b in &mut enc { + *b = !*b; + } + } + decode_i64_ordered(enc) +} + +pub fn encode_order_id(order_id: u64) -> [u8; 8] { + order_id.to_be_bytes() +} + +pub fn decode_order_id(bytes: [u8; 8]) -> u64 { + u64::from_be_bytes(bytes) +} + +pub fn key_next_order_id(instrument: &str) -> Vec { + let mut k = Vec::with_capacity(17 + instrument.len()); + k.extend_from_slice(b"ob:"); + k.extend_from_slice(instrument.as_bytes()); + k.extend_from_slice(b":meta:next_oid"); + k +} + +pub fn key_order_book_table_spec(table_id: &str) -> Vec { + let mut k = Vec::with_capacity(7 + table_id.len()); + k.extend_from_slice(b"ob:def:"); + k.extend_from_slice(table_id.as_bytes()); + k +} + +pub fn scoped_instrument(table_id: &str, asset_id: &str) -> String { + let mut scoped = String::with_capacity(table_id.len() + 1 + asset_id.len()); + scoped.push_str(table_id); + scoped.push('/'); + scoped.push_str(asset_id); + scoped +} + +pub fn parse_scoped_instrument(scoped: &str) -> Option<(&str, &str)> { + scoped.split_once('/') +} + +pub fn apply_order_book_define_table( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + table_id: &str, + mode: OrderBookTableMode, + commit_seq: u64, +) -> Result<(), AedbError> { + let spec = OrderBookTableSpec { + table_id: table_id.to_string(), + mode, + }; + keyspace.kv_set( + project_id, + scope_id, + key_order_book_table_spec(table_id), + serialize(&spec)?, + commit_seq, + ); + Ok(()) +} + +pub fn apply_order_book_drop_table( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + table_id: &str, + commit_seq: u64, +) -> Result<(), AedbError> { + keyspace.kv_del( + project_id, + scope_id, + &key_order_book_table_spec(table_id), + commit_seq, + ); + Ok(()) +} + +pub fn 
apply_set_instrument_config( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + config: &InstrumentConfig, + commit_seq: u64, +) -> Result<(), AedbError> { + if config.instrument != instrument { + return Err(AedbError::Validation( + "instrument config instrument mismatch".into(), + )); + } + if config.tick_size <= 0 { + return Err(AedbError::Validation("tick_size must be > 0".into())); + } + if config.min_price_ticks > config.max_price_ticks { + return Err(AedbError::Validation("invalid min/max price ticks".into())); + } + if u256_from_be(config.lot_size_be).is_zero() { + return Err(AedbError::Validation("lot_size must be > 0".into())); + } + keyspace.kv_set( + project_id, + scope_id, + key_instrument_config(instrument), + serialize(config)?, + commit_seq, + ); + Ok(()) +} + +pub fn apply_set_instrument_halted( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + halted: bool, + commit_seq: u64, +) -> Result<(), AedbError> { + let mut config = load_instrument_config(keyspace, project_id, scope_id, instrument)? 
+        .unwrap_or_else(|| InstrumentConfig {
+            instrument: instrument.to_string(),
+            tick_size: 1,
+            lot_size_be: u256_to_be(U256::one()),
+            min_price_ticks: i64::MIN,
+            max_price_ticks: i64::MAX,
+            market_order_price_band: None,
+            halted: false,
+            balance_config: None,
+        });
+    config.halted = halted;
+    keyspace.kv_set(
+        project_id,
+        scope_id,
+        key_instrument_config(instrument),
+        serialize(&config)?,
+        commit_seq,
+    );
+    Ok(())
+}
+
+pub fn key_next_fill_id(instrument: &str) -> Vec<u8> {
+    let mut k = Vec::with_capacity(18 + instrument.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":meta:next_fill");
+    k
+}
+
+pub fn key_instrument_config(instrument: &str) -> Vec<u8> {
+    let mut k = Vec::with_capacity(7 + instrument.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":cfg");
+    k
+}
+
+pub fn key_execution_report(instrument: &str, commit_seq: u64, order_id: u64) -> Vec<u8> {
+    let mut key = Vec::with_capacity(12 + instrument.len() + 1 + 8 + 1 + 8);
+    key.extend_from_slice(b"ob:");
+    key.extend_from_slice(instrument.as_bytes());
+    key.extend_from_slice(b":report:");
+    key.extend_from_slice(&commit_seq.to_be_bytes());
+    key.push(b':');
+    key.extend_from_slice(&order_id.to_be_bytes());
+    key
+}
+
+pub fn key_execution_report_last(instrument: &str) -> Vec<u8> {
+    let mut k = Vec::with_capacity(15 + instrument.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":report:last");
+    k
+}
+
+pub fn key_owner_nonce(instrument: &str, owner: &str) -> Vec<u8> {
+    let mut k = Vec::with_capacity(10 + instrument.len() + owner.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":nonce:");
+    k.extend_from_slice(owner.as_bytes());
+    k
+}
+
+pub fn key_client_id(instrument: &str, owner: &str, client_order_id: &str) -> Vec<u8> {
+    let mut k = Vec::with_capacity(9 + instrument.len() + 
owner.len() + client_order_id.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":cid:");
+    k.extend_from_slice(owner.as_bytes());
+    k.push(b':');
+    k.extend_from_slice(client_order_id.as_bytes());
+    k
+}
+
+pub fn key_order(instrument: &str, order_id: u64) -> Vec<u8> {
+    let mut k = Vec::with_capacity(8 + instrument.len() + 8);
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":ord:");
+    k.extend_from_slice(&encode_order_id(order_id));
+    k
+}
+
+pub fn key_plqty(instrument: &str, side: OrderSide, price_ticks: i64) -> Vec<u8> {
+    let mut k = Vec::with_capacity(12 + instrument.len() + 8);
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":plqty:");
+    k.push(b'0' + side.as_u8());
+    k.push(b':');
+    k.extend_from_slice(&encode_price_component(side, price_ticks));
+    k
+}
+
+pub fn key_fifo(
+    instrument: &str,
+    side: OrderSide,
+    price_ticks: i64,
+    placed_seq: u64,
+    order_id: u64,
+) -> Vec<u8> {
+    let mut k = Vec::with_capacity(11 + instrument.len() + 8 + 1 + 8 + 1 + 8);
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":fifo:");
+    k.push(b'0' + side.as_u8());
+    k.push(b':');
+    k.extend_from_slice(&encode_price_component(side, price_ticks));
+    k.push(b':');
+    k.extend_from_slice(&placed_seq.to_be_bytes());
+    k.push(b':');
+    k.extend_from_slice(&order_id.to_be_bytes());
+    k
+}
+
+pub fn key_open_order(instrument: &str, owner: &str, order_id: u64) -> Vec<u8> {
+    let mut k = Vec::with_capacity(11 + instrument.len() + owner.len() + 8);
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":open:");
+    k.extend_from_slice(owner.as_bytes());
+    k.push(b':');
+    k.extend_from_slice(&order_id.to_be_bytes());
+    k
+}
+
+pub fn key_trade(instrument: &str, fill_id: u64) -> Vec<u8> {
+    let mut k = Vec::with_capacity(10 + instrument.len() + 8);
+    
k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":trade:");
+    k.extend_from_slice(&fill_id.to_be_bytes());
+    k
+}
+
+pub fn trade_prefix(instrument: &str) -> Vec<u8> {
+    let mut k = Vec::with_capacity(10 + instrument.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":trade:");
+    k
+}
+
+pub fn fifo_prefix(instrument: &str, side: OrderSide, price_ticks: i64) -> Vec<u8> {
+    let mut k = Vec::with_capacity(11 + instrument.len() + 8 + 1);
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":fifo:");
+    k.push(b'0' + side.as_u8());
+    k.push(b':');
+    k.extend_from_slice(&encode_price_component(side, price_ticks));
+    k.push(b':');
+    k
+}
+
+pub fn plqty_prefix(instrument: &str, side: OrderSide) -> Vec<u8> {
+    let mut k = Vec::with_capacity(12 + instrument.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":plqty:");
+    k.push(b'0' + side.as_u8());
+    k.push(b':');
+    k
+}
+
+pub fn open_orders_prefix(instrument: &str, owner: &str) -> Vec<u8> {
+    let mut k = Vec::with_capacity(10 + instrument.len() + owner.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":open:");
+    k.extend_from_slice(owner.as_bytes());
+    k.push(b':');
+    k
+}
+
+pub fn all_orders_prefix(instrument: &str) -> Vec<u8> {
+    let mut k = Vec::with_capacity(9 + instrument.len());
+    k.extend_from_slice(b"ob:");
+    k.extend_from_slice(instrument.as_bytes());
+    k.extend_from_slice(b":ord:");
+    k
+}
+
+pub fn parse_plqty_price(side: OrderSide, key: &[u8]) -> Option<i64> {
+    let marker = {
+        let mut m = Vec::with_capacity(9);
+        m.extend_from_slice(b":plqty:");
+        m.push(b'0' + side.as_u8());
+        m.push(b':');
+        m
+    };
+    let pos = key.windows(marker.len()).position(|w| w == marker)?;
+    let start = pos + marker.len();
+    let bytes: [u8; 8] = key.get(start..start + 
8)?.try_into().ok()?; + Some(decode_price_component(side, bytes)) +} + +pub fn parse_fifo_order_id(key: &[u8]) -> Option { + let bytes: [u8; 8] = key.get(key.len().checked_sub(8)?..)?.try_into().ok()?; + Some(u64::from_be_bytes(bytes)) +} + +pub fn parse_order_id_suffix(key: &[u8]) -> Option { + let bytes: [u8; 8] = key.get(key.len().checked_sub(8)?..)?.try_into().ok()?; + Some(u64::from_be_bytes(bytes)) +} + +pub fn apply_order_book_new( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + request: &OrderRequest, + commit_seq: u64, +) -> Result<(), AedbError> { + validate_instrument_against_table_mode(keyspace, project_id, scope_id, &request.instrument)?; + let config = load_instrument_config(keyspace, project_id, scope_id, &request.instrument)?; + if let Some(cfg) = &config { + if cfg.halted { + return Err(AedbError::Validation("instrument halted".into())); + } + if request.price_ticks < cfg.min_price_ticks || request.price_ticks > cfg.max_price_ticks { + return Err(AedbError::Validation( + "price outside instrument bounds".into(), + )); + } + let lot = u256_from_be(cfg.lot_size_be); + let qty = u256_from_be(request.qty_be); + if !lot.is_zero() && (qty % lot) != U256::zero() { + return Err(AedbError::Validation("quantity violates lot size".into())); + } + } + let effective_request = if let Some(cfg) = &config { + effective_request_for_config(request, cfg, keyspace, project_id, scope_id)? 
+ } else { + request.clone() + }; + let qty = u256_from_be(request.qty_be); + if qty.is_zero() { + return Err(AedbError::Validation("qty must be > 0".into())); + } + if request.exec_instructions.post_only() && matches!(request.order_type, OrderType::Market) { + return Err(AedbError::Validation( + "post_only is invalid for market orders".into(), + )); + } + let nonce_key = key_owner_nonce(&request.instrument, &request.owner); + let last_nonce = load_u64(keyspace, project_id, scope_id, &nonce_key)?.unwrap_or(0); + if request.nonce <= last_nonce { + return Err(AedbError::Validation("nonce too low".into())); + } + let client_key = key_client_id( + &request.instrument, + &request.owner, + &request.client_order_id, + ); + if keyspace.kv_get(project_id, scope_id, &client_key).is_some() { + return Err(AedbError::Validation("duplicate client_order_id".into())); + } + + if effective_request.exec_instructions.post_only() { + if let Some(best_price) = best_price_for_side( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + effective_request.side.opposite(), + ) && crosses( + effective_request.side, + effective_request.price_ticks, + best_price, + ) { + return Err(AedbError::Validation("post_only would cross".into())); + } + } + + if matches!(effective_request.time_in_force, TimeInForce::Fok) + && !can_fok_fill(keyspace, project_id, scope_id, &effective_request)? 
+ { + return Err(AedbError::Validation("fok cannot fill".into())); + } + if matches!(effective_request.order_type, OrderType::Market) + && best_price_for_side( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + effective_request.side.opposite(), + ) + .is_none() + { + return Err(AedbError::Validation( + "market order has no liquidity".into(), + )); + } + + let order_id = allocate_next_id( + keyspace, + project_id, + scope_id, + &key_next_order_id(&effective_request.instrument), + commit_seq, + )?; + + let mut remaining = qty; + let mut filled = U256::zero(); + let mut execution_fills = Vec::new(); + let mut preferred_passive_price: Option = None; + + while !remaining.is_zero() { + let (best_price, mut passive) = if let Some(price) = preferred_passive_price { + if let Some(order) = first_passive_order_at_price( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + effective_request.side.opposite(), + price, + &effective_request, + )? { + (price, order) + } else { + let Some((price, order)) = best_passive_order( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + effective_request.side.opposite(), + &effective_request, + )? + else { + break; + }; + (price, order) + } + } else { + let Some((price, order)) = best_passive_order( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + effective_request.side.opposite(), + &effective_request, + )? 
+ else { + break; + }; + (price, order) + }; + + let passive_remaining = u256_from_be(passive.remaining_qty_be); + if passive_remaining.is_zero() { + clear_open_order(keyspace, project_id, scope_id, &passive)?; + preferred_passive_price = None; + continue; + } + if passive.owner == effective_request.owner { + match effective_request.self_trade_prevention { + SelfTradePrevention::None => {} + SelfTradePrevention::CancelResting => { + apply_order_book_cancel( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + passive.order_id, + Some(passive.client_order_id.as_str()), + &passive.owner, + commit_seq, + )?; + continue; + } + SelfTradePrevention::CancelAggressor => { + break; + } + SelfTradePrevention::CancelBoth => { + apply_order_book_cancel( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + passive.order_id, + Some(passive.client_order_id.as_str()), + &passive.owner, + commit_seq, + )?; + break; + } + } + } + let fill_qty = if remaining < passive_remaining { + remaining + } else { + passive_remaining + }; + + apply_passive_fill( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + &mut passive, + fill_qty, + commit_seq, + )?; + let fill = write_fill( + keyspace, + project_id, + scope_id, + &effective_request.instrument, + &effective_request, + order_id, + &passive, + best_price, + fill_qty, + commit_seq, + )?; + execution_fills.push(fill); + remaining -= fill_qty; + filled += fill_qty; + let level_key = key_plqty( + &effective_request.instrument, + effective_request.side.opposite(), + best_price, + ); + preferred_passive_price = keyspace + .kv_get(project_id, scope_id, &level_key) + .and_then(|entry| decode_u256_bytes(&entry.value).ok()) + .filter(|qty| !qty.is_zero()) + .map(|_| best_price); + } + + let remaining_after_match = remaining; + if matches!(effective_request.order_type, OrderType::Market) && filled.is_zero() { + return Err(AedbError::Validation( + "market order has no liquidity".into(), + 
)); + } + let rest_on_book = matches!(effective_request.order_type, OrderType::Limit) + && matches!(effective_request.time_in_force, TimeInForce::Gtc) + && !remaining_after_match.is_zero(); + let stored_remaining = if rest_on_book { + remaining_after_match + } else { + U256::zero() + }; + + let status = if remaining_after_match.is_zero() { + OrderStatus::Filled + } else if rest_on_book { + if filled.is_zero() { + OrderStatus::Open + } else { + OrderStatus::PartiallyFilled + } + } else { + if filled.is_zero() { + OrderStatus::Cancelled + } else { + OrderStatus::PartiallyFilled + } + }; + + let record = OrderRecord { + order_id, + instrument: effective_request.instrument.clone(), + client_order_id: effective_request.client_order_id.clone(), + owner: effective_request.owner.clone(), + account: effective_request.account.clone(), + side: effective_request.side, + order_type: effective_request.order_type, + time_in_force: effective_request.time_in_force, + exec_instructions: effective_request.exec_instructions, + self_trade_prevention: effective_request.self_trade_prevention, + price_ticks: effective_request.price_ticks, + original_qty_be: effective_request.qty_be, + remaining_qty_be: u256_to_be(stored_remaining), + filled_qty_be: u256_to_be(filled), + status, + placed_seq: commit_seq, + last_modified_seq: commit_seq, + nonce: effective_request.nonce, + }; + store_order(keyspace, project_id, scope_id, &record, commit_seq)?; + + if rest_on_book { + keyspace.kv_set( + project_id, + scope_id, + key_fifo( + &effective_request.instrument, + effective_request.side, + effective_request.price_ticks, + record.placed_seq, + order_id, + ), + vec![1], + commit_seq, + ); + keyspace.kv_inc_u256( + project_id, + scope_id, + key_plqty( + &effective_request.instrument, + effective_request.side, + effective_request.price_ticks, + ), + remaining, + commit_seq, + )?; + keyspace.kv_set( + project_id, + scope_id, + key_open_order( + &effective_request.instrument, + &effective_request.owner, + 
order_id, + ), + vec![1], + commit_seq, + ); + } + + keyspace.kv_set( + project_id, + scope_id, + key_client_id( + &effective_request.instrument, + &effective_request.owner, + &effective_request.client_order_id, + ), + order_id.to_be_bytes().to_vec(), + commit_seq, + ); + keyspace.kv_set( + project_id, + scope_id, + key_owner_nonce(&effective_request.instrument, &effective_request.owner), + effective_request.nonce.to_be_bytes().to_vec(), + commit_seq, + ); + write_execution_report( + keyspace, + project_id, + scope_id, + &ExecutionReport { + order_id, + client_order_id: effective_request.client_order_id.clone(), + status, + fills: execution_fills, + remaining_qty_be: u256_to_be(stored_remaining), + total_filled_qty_be: u256_to_be(filled), + avg_price_ticks: None, + reject_reason: None, + seq: commit_seq, + }, + &effective_request.instrument, + commit_seq, + ) +} + +pub fn apply_order_book_cancel( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + client_order_id: Option<&str>, + owner: &str, + commit_seq: u64, +) -> Result<(), AedbError> { + let resolved_order_id = if let Some(cid) = client_order_id { + match load_client_order_id(keyspace, project_id, scope_id, instrument, owner, cid)? { + Some(id) => id, + None => return Ok(()), + } + } else { + order_id + }; + if resolved_order_id == 0 { + return Ok(()); + } + let Some(mut order) = load_order( + keyspace, + project_id, + scope_id, + instrument, + resolved_order_id, + )? 
+ else { + return Ok(()); + }; + if order.owner != owner { + return Err(AedbError::PermissionDenied( + "order ownership mismatch".into(), + )); + } + let remaining = u256_from_be(order.remaining_qty_be); + if !remaining.is_zero() { + dec_price_level_qty( + keyspace, + project_id, + scope_id, + instrument, + order.side, + order.price_ticks, + remaining, + commit_seq, + )?; + keyspace.kv_del( + project_id, + scope_id, + &key_fifo( + instrument, + order.side, + order.price_ticks, + order.placed_seq, + resolved_order_id, + ), + commit_seq, + ); + keyspace.kv_del( + project_id, + scope_id, + &key_open_order(instrument, &order.owner, order.order_id), + commit_seq, + ); + } + order.remaining_qty_be = u256_to_be(U256::zero()); + order.status = OrderStatus::Cancelled; + order.last_modified_seq = commit_seq; + store_order(keyspace, project_id, scope_id, &order, commit_seq)?; + write_execution_report( + keyspace, + project_id, + scope_id, + &ExecutionReport { + order_id: order.order_id, + client_order_id: order.client_order_id.clone(), + status: OrderStatus::Cancelled, + fills: Vec::new(), + remaining_qty_be: u256_to_be(U256::zero()), + total_filled_qty_be: order.filled_qty_be, + avg_price_ticks: None, + reject_reason: None, + seq: commit_seq, + }, + instrument, + commit_seq, + ) +} + +#[allow(clippy::too_many_arguments)] +pub fn apply_order_book_cancel_replace( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + new_price_ticks: Option, + new_qty_be: Option<[u8; 32]>, + new_time_in_force: Option, + new_exec_instructions: Option, + commit_seq: u64, +) -> Result<(), AedbError> { + let Some(mut order) = load_order(keyspace, project_id, scope_id, instrument, order_id)? 
else { + return Ok(()); + }; + if order.owner != owner { + return Err(AedbError::PermissionDenied( + "order ownership mismatch".into(), + )); + } + let mut remaining = u256_from_be(order.remaining_qty_be); + let old_remaining = remaining; + let old_price = order.price_ticks; + let old_side = order.side; + let old_placed_seq = order.placed_seq; + + if let Some(qty_be) = new_qty_be { + let target = u256_from_be(qty_be); + if target.is_zero() { + return apply_order_book_cancel( + keyspace, project_id, scope_id, instrument, order_id, None, owner, commit_seq, + ); + } + remaining = target; + } + if let Some(price) = new_price_ticks { + order.price_ticks = price; + } + if let Some(tif) = new_time_in_force { + order.time_in_force = tif; + } + if let Some(flags) = new_exec_instructions { + order.exec_instructions = flags; + } + + if order.exec_instructions.post_only() + && would_cross_now( + keyspace, + project_id, + scope_id, + instrument, + order.side, + order.price_ticks, + )? + { + return Err(AedbError::Validation( + "cancel_replace post_only would cross".into(), + )); + } + + let loses_priority = order.price_ticks != old_price || remaining > old_remaining; + + keyspace.kv_del( + project_id, + scope_id, + &key_fifo( + instrument, + old_side, + old_price, + old_placed_seq, + order.order_id, + ), + commit_seq, + ); + if remaining > old_remaining { + keyspace.kv_inc_u256( + project_id, + scope_id, + key_plqty(instrument, old_side, old_price), + remaining - old_remaining, + commit_seq, + )?; + } else if old_remaining > remaining { + dec_price_level_qty( + keyspace, + project_id, + scope_id, + instrument, + old_side, + old_price, + old_remaining - remaining, + commit_seq, + )?; + } + + if old_price != order.price_ticks { + dec_price_level_qty( + keyspace, project_id, scope_id, instrument, old_side, old_price, remaining, commit_seq, + )?; + keyspace.kv_inc_u256( + project_id, + scope_id, + key_plqty(instrument, old_side, order.price_ticks), + remaining, + commit_seq, + )?; + 
} + + order.remaining_qty_be = u256_to_be(remaining); + order.last_modified_seq = commit_seq; + if loses_priority { + order.placed_seq = commit_seq; + } + order.status = if u256_from_be(order.filled_qty_be).is_zero() { + OrderStatus::Open + } else { + OrderStatus::PartiallyFilled + }; + store_order(keyspace, project_id, scope_id, &order, commit_seq)?; + + keyspace.kv_set( + project_id, + scope_id, + key_fifo( + instrument, + order.side, + order.price_ticks, + order.placed_seq, + order.order_id, + ), + vec![1], + commit_seq, + ); + + if matches!(order.time_in_force, TimeInForce::Ioc | TimeInForce::Fok) { + apply_order_book_cancel( + keyspace, project_id, scope_id, instrument, order_id, None, owner, commit_seq, + )?; + } + Ok(()) +} + +#[allow(clippy::too_many_arguments)] +pub fn apply_order_book_mass_cancel( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + owner: &str, + side: Option, + owner_filter: Option<&str>, + price_range_ticks: Option<(i64, i64)>, + commit_seq: u64, +) -> Result<(), AedbError> { + let orders = keyspace.kv_scan_prefix( + project_id, + scope_id, + &all_orders_prefix(instrument), + usize::MAX, + ); + for (_, entry) in orders { + let order: OrderRecord = deserialize(&entry.value)?; + if u256_from_be(order.remaining_qty_be).is_zero() { + continue; + } + if let Some(s) = side + && order.side != s + { + continue; + } + if let Some(filter_owner) = owner_filter + && order.owner != filter_owner + { + continue; + } + if let Some((min_price, max_price)) = price_range_ticks + && (order.price_ticks < min_price || order.price_ticks > max_price) + { + continue; + } + if let Some(filter_owner) = owner_filter { + if filter_owner != owner && order.owner != owner { + continue; + } + } else if order.owner != owner { + continue; + } + apply_order_book_cancel( + keyspace, + project_id, + scope_id, + instrument, + order.order_id, + None, + &order.owner, + commit_seq, + )?; + } + Ok(()) +} + +pub fn apply_order_book_reduce( + 
keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + reduce_by: U256, + commit_seq: u64, +) -> Result<(), AedbError> { + let Some(mut order) = load_order(keyspace, project_id, scope_id, instrument, order_id)? else { + return Ok(()); + }; + if order.owner != owner { + return Err(AedbError::PermissionDenied( + "order ownership mismatch".into(), + )); + } + let remaining = u256_from_be(order.remaining_qty_be); + if remaining.is_zero() { + return Ok(()); + } + let next = if reduce_by >= remaining { + U256::zero() + } else { + remaining - reduce_by + }; + let delta = remaining - next; + dec_price_level_qty( + keyspace, + project_id, + scope_id, + instrument, + order.side, + order.price_ticks, + delta, + commit_seq, + )?; + if next.is_zero() { + keyspace.kv_del( + project_id, + scope_id, + &key_fifo( + instrument, + order.side, + order.price_ticks, + order.placed_seq, + order.order_id, + ), + commit_seq, + ); + keyspace.kv_del( + project_id, + scope_id, + &key_open_order(instrument, &order.owner, order.order_id), + commit_seq, + ); + order.status = OrderStatus::Cancelled; + } + order.remaining_qty_be = u256_to_be(next); + order.last_modified_seq = commit_seq; + store_order(keyspace, project_id, scope_id, &order, commit_seq) +} + +pub fn apply_order_book_match( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + fills: &[FillSpec], + commit_seq: u64, +) -> Result<(), AedbError> { + for fill in fills { + let qty = u256_from_be(fill.qty_be); + if qty.is_zero() { + return Err(AedbError::Validation("fill qty must be > 0".into())); + } + let mut aggressor = load_order_required( + keyspace, + project_id, + scope_id, + instrument, + fill.aggressor_order_id, + )?; + let mut passive = load_order_required( + keyspace, + project_id, + scope_id, + instrument, + fill.passive_order_id, + )?; + if aggressor.side == passive.side { + return Err(AedbError::Validation( + "match orders must be 
opposite side".into(), + )); + } + apply_fill_to_order( + keyspace, + project_id, + scope_id, + instrument, + &mut aggressor, + qty, + commit_seq, + )?; + apply_fill_to_order( + keyspace, + project_id, + scope_id, + instrument, + &mut passive, + qty, + commit_seq, + )?; + let _ = write_fill( + keyspace, + project_id, + scope_id, + instrument, + &OrderRequest { + instrument: instrument.to_string(), + client_order_id: String::new(), + side: aggressor.side, + order_type: aggressor.order_type, + time_in_force: aggressor.time_in_force, + exec_instructions: aggressor.exec_instructions, + self_trade_prevention: aggressor.self_trade_prevention, + price_ticks: fill.price_ticks, + qty_be: fill.qty_be, + owner: aggressor.owner.clone(), + account: aggressor.account.clone(), + nonce: aggressor.nonce, + price_limit_ticks: None, + }, + aggressor.order_id, + &passive, + fill.price_ticks, + qty, + commit_seq, + )?; + } + Ok(()) +} + +pub fn read_order_status( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, +) -> Result, AedbError> { + snapshot + .kv_get(project_id, scope_id, &key_order(instrument, order_id)) + .map(|entry| deserialize::(&entry.value)) + .transpose() +} + +pub fn read_open_orders( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + instrument: &str, + owner: &str, +) -> Result, AedbError> { + let open = snapshot_scan_prefix( + project_id, + scope_id, + &open_orders_prefix(instrument, owner), + usize::MAX, + snapshot, + ); + let mut out = Vec::with_capacity(open.len()); + for (k, _) in open { + if let Some(order_id) = parse_order_id_suffix(&k) + && let Some(order) = + read_order_status(snapshot, project_id, scope_id, instrument, order_id)? 
+ { + out.push(order); + } + } + Ok(out) +} + +pub fn read_recent_trades( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + instrument: &str, + limit: usize, +) -> Result, AedbError> { + let mut trades = snapshot_scan_prefix( + project_id, + scope_id, + &trade_prefix(instrument), + usize::MAX, + snapshot, + ); + if trades.len() > limit { + let start = trades.len() - limit; + trades = trades.split_off(start); + } + let mut out = Vec::with_capacity(trades.len()); + for (_, entry) in trades { + out.push(deserialize::(&entry.value)?); + } + Ok(out) +} + +pub fn read_last_execution_report( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + instrument: &str, +) -> Result, AedbError> { + let Some(entry) = snapshot.kv_get(project_id, scope_id, &key_execution_report_last(instrument)) + else { + return Ok(None); + }; + if let Some((commit_seq, order_id)) = decode_last_report_pointer(&entry.value) + && let Some(report_entry) = snapshot.kv_get( + project_id, + scope_id, + &key_execution_report(instrument, commit_seq, order_id), + ) + { + return deserialize::(&report_entry.value).map(Some); + } + deserialize::(&entry.value).map(Some) +} + +pub fn read_spread( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + instrument: &str, + seq: u64, +) -> Result { + let best_bid = + best_price_for_side_snapshot(snapshot, project_id, scope_id, instrument, OrderSide::Bid); + let best_ask = + best_price_for_side_snapshot(snapshot, project_id, scope_id, instrument, OrderSide::Ask); + let mid = match (best_bid, best_ask) { + (Some(b), Some(a)) => Some((b + a) / 2), + _ => None, + }; + Ok(Spread { + best_bid, + best_ask, + mid, + seq, + }) +} + +pub fn read_top_n( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + instrument: &str, + depth: usize, + seq: u64, +) -> Result { + let bids = top_side( + snapshot, + project_id, + scope_id, + instrument, + OrderSide::Bid, + depth, + )?; + let asks = top_side( + snapshot, 
+ project_id, + scope_id, + instrument, + OrderSide::Ask, + depth, + )?; + Ok(OrderBookDepth { + instrument: instrument.to_string(), + bids, + asks, + seq, + }) +} + +fn top_side( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + instrument: &str, + side: OrderSide, + depth: usize, +) -> Result, AedbError> { + let entries = snapshot_scan_prefix( + project_id, + scope_id, + &plqty_prefix(instrument, side), + usize::MAX, + snapshot, + ); + let mut out = Vec::new(); + for (k, v) in entries { + let qty = decode_u256_bytes(&v.value)?; + if qty.is_zero() { + continue; + } + let Some(price) = parse_plqty_price(side, &k) else { + continue; + }; + let order_count = snapshot_scan_prefix( + project_id, + scope_id, + &fifo_prefix(instrument, side, price), + usize::MAX, + snapshot, + ) + .len() as u32; + out.push(PriceLevel { + price_ticks: price, + total_qty_be: u256_to_be(qty), + order_count, + }); + if out.len() >= depth { + break; + } + } + Ok(out) +} + +fn load_u64( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + key: &[u8], +) -> Result, AedbError> { + let Some(entry) = keyspace.kv_get(project_id, scope_id, key) else { + return Ok(None); + }; + if entry.value.len() != 8 { + return Err(AedbError::Validation("invalid u64 value length".into())); + } + let mut b = [0u8; 8]; + b.copy_from_slice(&entry.value); + Ok(Some(u64::from_be_bytes(b))) +} + +fn load_instrument_config( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, +) -> Result, AedbError> { + keyspace + .kv_get(project_id, scope_id, &key_instrument_config(instrument)) + .map(|entry| deserialize::(&entry.value)) + .transpose() +} + +fn validate_instrument_against_table_mode( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, +) -> Result<(), AedbError> { + let Some((table_id, asset_id)) = parse_scoped_instrument(instrument) else { + return Ok(()); + }; + let spec_key = key_order_book_table_spec(table_id); + let Some(entry) 
= keyspace.kv_get(project_id, scope_id, &spec_key) else { + return Err(AedbError::Validation(format!( + "order book table not defined: {table_id}" + ))); + }; + let spec: OrderBookTableSpec = deserialize(&entry.value)?; + match spec.mode { + OrderBookTableMode::MultiAsset => Ok(()), + OrderBookTableMode::PerAsset => { + if asset_id == table_id { + Ok(()) + } else { + Err(AedbError::Validation(format!( + "table {table_id} is PerAsset and only supports asset_id={table_id}" + ))) + } + } + } +} + +fn load_client_order_id( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + owner: &str, + client_order_id: &str, +) -> Result, AedbError> { + let key = key_client_id(instrument, owner, client_order_id); + load_u64(keyspace, project_id, scope_id, &key) +} + +fn snapshot_scan_prefix( + project_id: &str, + scope_id: &str, + prefix: &[u8], + limit: usize, + snapshot: &KeyspaceSnapshot, +) -> Vec<(Vec, crate::storage::keyspace::KvEntry)> { + let ns = crate::storage::keyspace::NamespaceId::project_scope(project_id, scope_id); + let Some(namespace) = snapshot.namespaces.get(&ns) else { + return Vec::new(); + }; + namespace + .kv + .entries + .iter() + .filter(|(k, _)| k.starts_with(prefix)) + .take(limit) + .map(|(k, v)| (k.clone(), v.clone())) + .collect() +} + +fn allocate_next_id( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + key: &[u8], + commit_seq: u64, +) -> Result { + let current = load_u64(keyspace, project_id, scope_id, key)?.unwrap_or(1); + let next = current + .checked_add(1) + .ok_or_else(|| AedbError::Validation("id overflow".into()))?; + keyspace.kv_set( + project_id, + scope_id, + key.to_vec(), + next.to_be_bytes().to_vec(), + commit_seq, + ); + Ok(current) +} + +fn serialize(value: &T) -> Result, AedbError> { + rmp_serde::to_vec(value).map_err(|e| AedbError::Encode(e.to_string())) +} + +fn deserialize Deserialize<'de>>(bytes: &[u8]) -> Result { + rmp_serde::from_slice(bytes).map_err(|e| 
AedbError::Decode(e.to_string())) +} + +fn decode_u256_bytes(bytes: &[u8]) -> Result { + if bytes.len() != 32 { + return Err(AedbError::Validation("invalid u256 bytes length".into())); + } + Ok(U256::from_big_endian(bytes)) +} + +fn load_order( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, +) -> Result, AedbError> { + let Some(entry) = keyspace.kv_get(project_id, scope_id, &key_order(instrument, order_id)) + else { + return Ok(None); + }; + Ok(Some(deserialize::(&entry.value)?)) +} + +fn load_order_required( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, +) -> Result { + load_order(keyspace, project_id, scope_id, instrument, order_id)? + .ok_or_else(|| AedbError::Validation(format!("order not found: {order_id}"))) +} + +fn store_order( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + order: &OrderRecord, + commit_seq: u64, +) -> Result<(), AedbError> { + keyspace.kv_set( + project_id, + scope_id, + key_order(&order.instrument, order.order_id), + serialize(order)?, + commit_seq, + ); + Ok(()) +} + +fn best_price_for_side( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + side: OrderSide, +) -> Option { + let mut out = None; + keyspace.kv_visit_prefix_ref( + project_id, + scope_id, + &plqty_prefix(instrument, side), + usize::MAX, + |k, v| { + if decode_u256_bytes(&v.value) + .ok() + .is_some_and(|x| !x.is_zero()) + { + out = parse_plqty_price(side, k); + return false; + } + true + }, + ); + out +} + +fn best_price_for_side_snapshot( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + instrument: &str, + side: OrderSide, +) -> Option { + let entries = snapshot_scan_prefix( + project_id, + scope_id, + &plqty_prefix(instrument, side), + usize::MAX, + snapshot, + ); + for (k, v) in entries { + if decode_u256_bytes(&v.value) + .ok() + .is_some_and(|x| !x.is_zero()) + { + return parse_plqty_price(side, 
&k); + } + } + None +} + +fn crosses(aggressor_side: OrderSide, aggressor_price: i64, passive_price: i64) -> bool { + match aggressor_side { + OrderSide::Bid => passive_price <= aggressor_price, + OrderSide::Ask => passive_price >= aggressor_price, + } +} + +fn price_allows(request: &OrderRequest, passive_price: i64) -> bool { + match request.order_type { + OrderType::Limit => crosses(request.side, request.price_ticks, passive_price), + OrderType::Market => { + if let Some(limit) = request.price_limit_ticks { + crosses(request.side, limit, passive_price) + } else { + true + } + } + } +} + +fn best_passive_order( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + passive_side: OrderSide, + request: &OrderRequest, +) -> Result, AedbError> { + let mut out = None; + let mut error: Option = None; + keyspace.kv_visit_prefix_ref( + project_id, + scope_id, + &plqty_prefix(instrument, passive_side), + usize::MAX, + |k, v| { + let level_qty = match decode_u256_bytes(&v.value) { + Ok(q) => q, + Err(e) => { + error = Some(e); + return false; + } + }; + if level_qty.is_zero() { + return true; + } + let Some(price) = parse_plqty_price(passive_side, k) else { + return true; + }; + if !price_allows(request, price) { + return !matches!(request.order_type, OrderType::Limit); + } + let mut first_fifo_order_id = None; + keyspace.kv_visit_prefix_ref( + project_id, + scope_id, + &fifo_prefix(instrument, passive_side, price), + 1, + |fifo_key, _| { + first_fifo_order_id = parse_fifo_order_id(fifo_key); + false + }, + ); + let Some(order_id) = first_fifo_order_id else { + return true; + }; + match load_order(keyspace, project_id, scope_id, instrument, order_id) { + Ok(Some(order)) => { + out = Some((price, order)); + false + } + Ok(None) => true, + Err(e) => { + error = Some(e); + false + } + } + }, + ); + if let Some(e) = error { + return Err(e); + } + Ok(out) +} + +fn first_passive_order_at_price( + keyspace: &Keyspace, + project_id: &str, + scope_id: 
&str, + instrument: &str, + passive_side: OrderSide, + price: i64, + request: &OrderRequest, +) -> Result, AedbError> { + if !price_allows(request, price) { + return Ok(None); + } + let fifo = keyspace.kv_scan_prefix_ref( + project_id, + scope_id, + &fifo_prefix(instrument, passive_side, price), + 1, + ); + let Some((fifo_key, _)) = fifo.into_iter().next() else { + return Ok(None); + }; + let Some(order_id) = parse_fifo_order_id(fifo_key) else { + return Ok(None); + }; + load_order(keyspace, project_id, scope_id, instrument, order_id) +} + +fn can_fok_fill( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + request: &OrderRequest, +) -> Result { + let mut needed = u256_from_be(request.qty_be); + let mut error: Option = None; + keyspace.kv_visit_prefix_ref( + project_id, + scope_id, + &plqty_prefix(&request.instrument, request.side.opposite()), + usize::MAX, + |k, v| { + let Some(price) = parse_plqty_price(request.side.opposite(), k) else { + return true; + }; + if !price_allows(request, price) { + return !matches!(request.order_type, OrderType::Limit); + } + match decode_u256_bytes(&v.value) { + Ok(qty) => { + if qty >= needed { + needed = U256::zero(); + false + } else { + needed -= qty; + true + } + } + Err(e) => { + error = Some(e); + false + } + } + }, + ); + if let Some(e) = error { + return Err(e); + } + Ok(needed.is_zero()) +} + +fn apply_passive_fill( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + passive: &mut OrderRecord, + fill_qty: U256, + commit_seq: u64, +) -> Result<(), AedbError> { + apply_fill_to_order( + keyspace, project_id, scope_id, instrument, passive, fill_qty, commit_seq, + ) +} + +fn apply_fill_to_order( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + order: &mut OrderRecord, + fill_qty: U256, + commit_seq: u64, +) -> Result<(), AedbError> { + let remaining = u256_from_be(order.remaining_qty_be); + if remaining < fill_qty { + return 
Err(AedbError::Underflow); + } + let next_remaining = remaining - fill_qty; + let filled = u256_from_be(order.filled_qty_be) + .checked_add(fill_qty) + .ok_or(AedbError::Overflow)?; + dec_price_level_qty( + keyspace, + project_id, + scope_id, + instrument, + order.side, + order.price_ticks, + fill_qty, + commit_seq, + )?; + if next_remaining.is_zero() { + clear_open_order(keyspace, project_id, scope_id, order)?; + order.status = OrderStatus::Filled; + } else { + order.status = OrderStatus::PartiallyFilled; + } + order.remaining_qty_be = u256_to_be(next_remaining); + order.filled_qty_be = u256_to_be(filled); + order.last_modified_seq = commit_seq; + store_order(keyspace, project_id, scope_id, order, commit_seq) +} + +fn clear_open_order( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + order: &OrderRecord, +) -> Result<(), AedbError> { + keyspace.kv_del( + project_id, + scope_id, + &key_fifo( + &order.instrument, + order.side, + order.price_ticks, + order.placed_seq, + order.order_id, + ), + order.last_modified_seq, + ); + keyspace.kv_del( + project_id, + scope_id, + &key_open_order(&order.instrument, &order.owner, order.order_id), + order.last_modified_seq, + ); + Ok(()) +} + +fn dec_price_level_qty( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + side: OrderSide, + price_ticks: i64, + delta: U256, + commit_seq: u64, +) -> Result<(), AedbError> { + let key = key_plqty(instrument, side, price_ticks); + let next = keyspace.kv_dec_u256(project_id, scope_id, key.clone(), delta, commit_seq)?; + if next.is_zero() { + keyspace.kv_del(project_id, scope_id, &key, commit_seq); + } + Ok(()) +} + +fn write_fill( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + request: &OrderRequest, + aggressor_order_id: u64, + passive: &OrderRecord, + price_ticks: i64, + qty: U256, + commit_seq: u64, +) -> Result { + let fill_id = allocate_next_id( + keyspace, + project_id, + scope_id, + 
&key_next_fill_id(instrument), + commit_seq, + )?; + let fill = FillRecord { + fill_id, + instrument: instrument.to_string(), + price_ticks, + qty_be: u256_to_be(qty), + aggressor_order_id, + aggressor_owner: request.owner.clone(), + aggressor_side: request.side, + passive_order_id: passive.order_id, + passive_owner: passive.owner.clone(), + seq: commit_seq, + }; + keyspace.kv_set( + project_id, + scope_id, + key_trade(instrument, fill_id), + serialize(&fill)?, + commit_seq, + ); + Ok(fill) +} + +fn write_execution_report( + keyspace: &mut Keyspace, + project_id: &str, + scope_id: &str, + report: &ExecutionReport, + instrument: &str, + commit_seq: u64, +) -> Result<(), AedbError> { + let bytes = serialize(report)?; + keyspace.kv_set( + project_id, + scope_id, + key_execution_report(instrument, commit_seq, report.order_id), + bytes, + commit_seq, + ); + keyspace.kv_set( + project_id, + scope_id, + key_execution_report_last(instrument), + encode_last_report_pointer(commit_seq, report.order_id), + commit_seq, + ); + Ok(()) +} + +fn encode_last_report_pointer(commit_seq: u64, order_id: u64) -> Vec { + let mut out = Vec::with_capacity(16); + out.extend_from_slice(&commit_seq.to_be_bytes()); + out.extend_from_slice(&order_id.to_be_bytes()); + out +} + +fn decode_last_report_pointer(bytes: &[u8]) -> Option<(u64, u64)> { + let commit: [u8; 8] = bytes.get(0..8)?.try_into().ok()?; + let order: [u8; 8] = bytes.get(8..16)?.try_into().ok()?; + Some((u64::from_be_bytes(commit), u64::from_be_bytes(order))) +} + +fn would_cross_now( + keyspace: &Keyspace, + project_id: &str, + scope_id: &str, + instrument: &str, + side: OrderSide, + price_ticks: i64, +) -> Result { + let Some(best_opposite) = + best_price_for_side(keyspace, project_id, scope_id, instrument, side.opposite()) + else { + return Ok(false); + }; + Ok(crosses(side, price_ticks, best_opposite)) +} + +fn effective_request_for_config( + request: &OrderRequest, + config: &InstrumentConfig, + keyspace: &Keyspace, + 
project_id: &str, + scope_id: &str, +) -> Result { + if !matches!(request.order_type, OrderType::Market) || request.price_limit_ticks.is_some() { + return Ok(request.clone()); + } + let Some(band) = config.market_order_price_band else { + return Ok(request.clone()); + }; + let bid = best_price_for_side( + keyspace, + project_id, + scope_id, + &request.instrument, + OrderSide::Bid, + ); + let ask = best_price_for_side( + keyspace, + project_id, + scope_id, + &request.instrument, + OrderSide::Ask, + ); + let Some(mid) = (match (bid, ask) { + (Some(b), Some(a)) => Some((b + a) / 2), + _ => None, + }) else { + return Ok(request.clone()); + }; + let mut next = request.clone(); + next.price_limit_ticks = Some(match request.side { + OrderSide::Bid => mid.saturating_add(band), + OrderSide::Ask => mid.saturating_sub(band), + }); + Ok(next) +} diff --git a/src/preflight/mod.rs b/src/preflight/mod.rs index 6a7362f..55d9db8 100644 --- a/src/preflight/mod.rs +++ b/src/preflight/mod.rs @@ -444,6 +444,141 @@ pub fn preflight_plan( version_at_read: version, }); } + Mutation::OrderBookNew { + project_id, + scope_id, + request, + } => { + let version = snapshot + .kv_get(project_id, scope_id, request.instrument.as_bytes()) + .map(|e| e.version) + .unwrap_or(0); + read_set.points.push(ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.clone(), + scope_id: scope_id.clone(), + key: request.instrument.clone().into_bytes(), + }, + version_at_read: version, + }); + } + Mutation::OrderBookCancel { + project_id, + scope_id, + instrument, + order_id, + .. + } + | Mutation::OrderBookCancelReplace { + project_id, + scope_id, + instrument, + order_id, + .. + } + | Mutation::OrderBookReduce { + project_id, + scope_id, + instrument, + order_id, + .. 
+ } => { + let mut key = format!("ob:{instrument}:ord:").into_bytes(); + key.extend_from_slice(&order_id.to_be_bytes()); + let version = snapshot + .kv_get(project_id, scope_id, &key) + .map(|e| e.version) + .unwrap_or(0); + read_set.points.push(ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.clone(), + scope_id: scope_id.clone(), + key, + }, + version_at_read: version, + }); + } + Mutation::OrderBookMassCancel { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookMatch { + project_id, + scope_id, + instrument, + .. + } => { + let prefix = format!("ob:{instrument}:").into_bytes(); + let start = ReadBound::Included(prefix.clone()); + let mut end = prefix.clone(); + end.push(0xff); + let max_version = + snapshot_max_kv_version_for_prefix(snapshot, project_id, scope_id, &prefix); + let structural_version = snapshot_kv_structural_version(snapshot, project_id, scope_id); + read_set.ranges.push(ReadRangeEntry { + range: ReadRange::KvRange { + project_id: project_id.clone(), + scope_id: scope_id.clone(), + start, + end: ReadBound::Excluded(end), + }, + max_version_at_read: max_version, + structural_version_at_read: structural_version, + }); + } + Mutation::OrderBookDefineTable { + project_id, + scope_id, + table_id, + .. + } + | Mutation::OrderBookDropTable { + project_id, + scope_id, + table_id, + } => { + let key = crate::order_book::key_order_book_table_spec(table_id); + let version = snapshot + .kv_get(project_id, scope_id, &key) + .map(|e| e.version) + .unwrap_or(0); + read_set.points.push(ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.clone(), + scope_id: scope_id.clone(), + key, + }, + version_at_read: version, + }); + } + Mutation::OrderBookSetInstrumentConfig { + project_id, + scope_id, + instrument, + .. + } + | Mutation::OrderBookSetInstrumentHalted { + project_id, + scope_id, + instrument, + .. 
+ } => { + let key = crate::order_book::key_instrument_config(instrument); + let version = snapshot + .kv_get(project_id, scope_id, &key) + .map(|e| e.version) + .unwrap_or(0); + read_set.points.push(ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.clone(), + scope_id: scope_id.clone(), + key, + }, + version_at_read: version, + }); + } Mutation::Ddl(_) => {} } @@ -505,6 +640,39 @@ fn load_table_u256_field( } } +fn snapshot_max_kv_version_for_prefix( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, + prefix: &[u8], +) -> u64 { + let ns = crate::storage::keyspace::NamespaceId::project_scope(project_id, scope_id); + let Some(namespace) = snapshot.namespaces.get(&ns) else { + return 0; + }; + namespace + .kv + .entries + .iter() + .filter(|(k, _)| k.starts_with(prefix)) + .map(|(_, v)| v.version) + .max() + .unwrap_or(0) +} + +fn snapshot_kv_structural_version( + snapshot: &KeyspaceSnapshot, + project_id: &str, + scope_id: &str, +) -> u64 { + let ns = crate::storage::keyspace::NamespaceId::project_scope(project_id, scope_id); + snapshot + .namespaces + .get(&ns) + .map(|n| n.kv.structural_version) + .unwrap_or(0) +} + #[cfg(test)] mod tests { use super::{PreflightResult, preflight}; diff --git a/src/storage/keyspace.rs b/src/storage/keyspace.rs index cd4a36a..f7f8e01 100644 --- a/src/storage/keyspace.rs +++ b/src/storage/keyspace.rs @@ -547,14 +547,13 @@ impl Keyspace { commit_seq: u64, ) { let kv = self.kv_data_mut(project_id, scope_id); - if !kv.entries.contains_key(&key) { - kv.structural_version = commit_seq; - } - let created_at = kv - .entries - .get(&key) - .map(|e| e.created_at) - .unwrap_or(commit_seq); + let created_at = match kv.entries.get(&key) { + Some(entry) => entry.created_at, + None => { + kv.structural_version = commit_seq; + commit_seq + } + }; kv.entries.insert( key, KvEntry { @@ -593,14 +592,92 @@ impl Keyspace { else { return Vec::new(); }; + if prefix.is_empty() { + return kv + .entries + .iter() + .take(limit) 
+ .map(|(k, v)| (k.clone(), v.clone())) + .collect(); + } + let start = Bound::Included(prefix.to_vec()); + let end = prefix_range_end(prefix) + .map(Bound::Excluded) + .unwrap_or(Bound::Unbounded); kv.entries - .iter() - .filter(|(k, _)| k.starts_with(prefix)) + .range((start, end)) .take(limit) .map(|(k, v)| (k.clone(), v.clone())) .collect() } + pub fn kv_scan_prefix_ref<'a>( + &'a self, + project_id: &str, + scope_id: &str, + prefix: &[u8], + limit: usize, + ) -> Vec<(&'a [u8], &'a KvEntry)> { + let Some(kv) = self + .namespace(&NamespaceId::project_scope(project_id, scope_id)) + .map(|ns| &ns.kv) + else { + return Vec::new(); + }; + if prefix.is_empty() { + return kv + .entries + .iter() + .take(limit) + .map(|(k, v)| (k.as_slice(), v)) + .collect(); + } + let start = Bound::Included(prefix.to_vec()); + let end = prefix_range_end(prefix) + .map(Bound::Excluded) + .unwrap_or(Bound::Unbounded); + kv.entries + .range((start, end)) + .take(limit) + .map(|(k, v)| (k.as_slice(), v)) + .collect() + } + + pub fn kv_visit_prefix_ref( + &self, + project_id: &str, + scope_id: &str, + prefix: &[u8], + limit: usize, + mut visitor: F, + ) where + F: FnMut(&[u8], &KvEntry) -> bool, + { + let Some(kv) = self + .namespace(&NamespaceId::project_scope(project_id, scope_id)) + .map(|ns| &ns.kv) + else { + return; + }; + if prefix.is_empty() { + for (k, v) in kv.entries.iter().take(limit) { + if !visitor(k.as_slice(), v) { + break; + } + } + return; + } + let start = Bound::Included(prefix.to_vec()); + let end = prefix_range_end(prefix) + .map(Bound::Excluded) + .unwrap_or(Bound::Unbounded); + for (k, v) in kv.entries.range((start, end)).take(limit) { + if !visitor(k.as_slice(), v) { + break; + } + } + } + pub fn kv_scan_range( &self, project_id: &str, @@ -811,6 +888,18 @@ fn default_primary_index_backend() -> PrimaryIndexBackend { PrimaryIndexBackend::OrdMap } +fn prefix_range_end(prefix: &[u8]) -> Option> { + let mut end = prefix.to_vec(); + for idx in (0..end.len()).rev() { + 
if end[idx] != u8::MAX { + end[idx] = end[idx].saturating_add(1); + end.truncate(idx + 1); + return Some(end); + } + } + None +} + fn encode_u256(v: U256) -> Vec { let mut bytes = [0u8; 32]; v.to_big_endian(&mut bytes); @@ -999,4 +1088,22 @@ mod tests { 11 ); } + + #[test] + fn kv_prefix_scans_are_lexicographically_bounded() { + let mut ks = Keyspace::default(); + ks.kv_set("p", "app", b"ob:a:1".to_vec(), b"v1".to_vec(), 1); + ks.kv_set("p", "app", b"ob:a:2".to_vec(), b"v2".to_vec(), 2); + ks.kv_set("p", "app", b"ob:b:1".to_vec(), b"v3".to_vec(), 3); + ks.kv_set("p", "app", b"zz".to_vec(), b"v4".to_vec(), 4); + + let rows = ks.kv_scan_prefix("p", "app", b"ob:a:", 10); + assert_eq!(rows.len(), 2); + assert_eq!(rows[0].0, b"ob:a:1".to_vec()); + assert_eq!(rows[1].0, b"ob:a:2".to_vec()); + + let refs = ks.kv_scan_prefix_ref("p", "app", b"ob:", 10); + assert_eq!(refs.len(), 3); + assert!(refs.iter().all(|(k, _)| k.starts_with(b"ob:"))); + } } diff --git a/tests/order_book_native.rs b/tests/order_book_native.rs new file mode 100644 index 0000000..a56edf3 --- /dev/null +++ b/tests/order_book_native.rs @@ -0,0 +1,697 @@ +use aedb::AedbInstance; +use aedb::error::AedbError; +use aedb::order_book::{ + ExecInstruction, InstrumentConfig, OrderBookTableMode, OrderRecord, OrderRequest, OrderSide, + OrderStatus, OrderType, SelfTradePrevention, TimeInForce, key_client_id, key_execution_report, + key_order, scoped_instrument, +}; +use aedb::query::plan::ConsistencyMode; +use tempfile::tempdir; + +fn u256_be(v: u64) -> [u8; 32] { + let mut out = [0u8; 32]; + out[24..].copy_from_slice(&v.to_be_bytes()); + out +} + +fn decode_u64_u256(be: [u8; 32]) -> u64 { + let mut out = [0u8; 8]; + out.copy_from_slice(&be[24..]); + u64::from_be_bytes(out) +} + +fn order_req( + instrument: &str, + owner: &str, + client_id: &str, + side: OrderSide, + tif: TimeInForce, + post_only: bool, + price: i64, + qty: u64, +) -> OrderRequest { + OrderRequest { + instrument: instrument.to_string(), + 
client_order_id: client_id.to_string(), + side, + order_type: OrderType::Limit, + time_in_force: tif, + exec_instructions: ExecInstruction(if post_only { + ExecInstruction::POST_ONLY + } else { + 0 + }), + self_trade_prevention: aedb::order_book::SelfTradePrevention::None, + price_ticks: price, + qty_be: u256_be(qty), + owner: owner.to_string(), + account: None, + nonce: 1, + price_limit_ticks: None, + } +} + +fn order_req_with_stp( + instrument: &str, + owner: &str, + client_id: &str, + side: OrderSide, + price: i64, + qty: u64, + stp: SelfTradePrevention, +) -> OrderRequest { + let mut req = order_req( + instrument, + owner, + client_id, + side, + TimeInForce::Gtc, + false, + price, + qty, + ); + req.self_trade_prevention = stp; + req +} + +async fn load_order(db: &AedbInstance, instrument: &str, order_id: u64) -> OrderRecord { + let key = key_order(instrument, order_id); + let entry = db + .kv_get_no_auth("p", "app", &key, ConsistencyMode::AtLatest) + .await + .expect("kv_get") + .expect("order entry"); + rmp_serde::from_slice(&entry.value).expect("decode order") +} + +async fn load_order_id( + db: &AedbInstance, + instrument: &str, + owner: &str, + client_order_id: &str, +) -> u64 { + let key = key_client_id(instrument, owner, client_order_id); + let entry = db + .kv_get_no_auth("p", "app", &key, ConsistencyMode::AtLatest) + .await + .expect("kv_get") + .expect("client id mapping"); + let mut out = [0u8; 8]; + out.copy_from_slice(&entry.value); + u64::from_be_bytes(out) +} + +#[tokio::test] +async fn post_only_rejects_crossing() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "maker", + "maker-1", + OrderSide::Ask, + TimeInForce::Gtc, + false, + 100, + 5, + ), + ) + .await + .expect("seed ask"); + + let err = db + .order_book_new( + "p", + "app", + order_req( + "BTC-USD", + 
"taker", + "taker-post", + OrderSide::Bid, + TimeInForce::Gtc, + true, + 100, + 4, + ), + ) + .await + .expect_err("post only should reject"); + assert!( + matches!(err, AedbError::Validation(ref msg) if msg.contains("post_only")), + "unexpected error: {err:?}" + ); + + let trades = db + .kv_scan_prefix_no_auth( + "p", + "app", + b"ob:BTC-USD:trade:", + 100, + ConsistencyMode::AtLatest, + ) + .await + .expect("trade scan"); + assert!(trades.is_empty()); +} + +#[tokio::test] +async fn ioc_partial_fill_and_fok_rejection() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "maker", + "maker-a", + OrderSide::Ask, + TimeInForce::Gtc, + false, + 100, + 5, + ), + ) + .await + .expect("seed ask"); + + db.order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "ioc-user", + "ioc-1", + OrderSide::Bid, + TimeInForce::Ioc, + false, + 100, + 8, + ), + ) + .await + .expect("ioc commit"); + + let ioc_order_id = load_order_id(&db, "BTC-USD", "ioc-user", "ioc-1").await; + let ioc = load_order(&db, "BTC-USD", ioc_order_id).await; + assert_eq!(decode_u64_u256(ioc.remaining_qty_be), 0); + assert_eq!(decode_u64_u256(ioc.filled_qty_be), 5); + assert_eq!(ioc.status, OrderStatus::PartiallyFilled); + + db.order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "maker2", + "maker-b", + OrderSide::Ask, + TimeInForce::Gtc, + false, + 101, + 3, + ), + ) + .await + .expect("seed ask 2"); + + let err = db + .order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "fok-user", + "fok-1", + OrderSide::Bid, + TimeInForce::Fok, + false, + 101, + 5, + ), + ) + .await + .expect_err("fok should reject"); + assert!( + matches!(err, AedbError::Validation(ref msg) if msg.contains("fok")), + "unexpected error: {err:?}" + ); +} + +#[tokio::test] +async fn cancel_replace_reduce_and_mass_cancel() { + let dir = 
tempdir().expect("temp"); + let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "maker", + "maker-cr", + OrderSide::Bid, + TimeInForce::Gtc, + false, + 90, + 10, + ), + ) + .await + .expect("new"); + + let order_id = load_order_id(&db, "BTC-USD", "maker", "maker-cr").await; + + db.order_book_cancel_replace( + "p", + "app", + "BTC-USD", + order_id, + "maker", + Some(91), + Some(u256_be(12)), + None, + None, + ) + .await + .expect("cancel replace"); + + let updated = load_order(&db, "BTC-USD", order_id).await; + assert_eq!(updated.price_ticks, 91); + assert_eq!(decode_u64_u256(updated.remaining_qty_be), 12); + + db.order_book_reduce("p", "app", "BTC-USD", order_id, "maker", u256_be(3)) + .await + .expect("reduce"); + let reduced = load_order(&db, "BTC-USD", order_id).await; + assert_eq!(decode_u64_u256(reduced.remaining_qty_be), 9); + + db.order_book_mass_cancel( + "p", + "app", + "BTC-USD", + "maker", + None, + Some("maker".to_string()), + None, + ) + .await + .expect("mass cancel"); + let cancelled = load_order(&db, "BTC-USD", order_id).await; + assert_eq!(cancelled.status, OrderStatus::Cancelled); + assert_eq!(decode_u64_u256(cancelled.remaining_qty_be), 0); +} + +#[tokio::test] +async fn table_scoped_books_support_multi_asset_and_cancel_by_client_id() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + db.order_book_define_table("p", "app", "markets", OrderBookTableMode::MultiAsset) + .await + .expect("define table"); + + db.order_book_new_in_table( + "p", + "app", + "markets", + "BTC-USD", + order_req( + "placeholder", + "alice", + "btc-1", + OrderSide::Bid, + TimeInForce::Gtc, + false, + 100, + 2, + ), + ) + .await + .expect("new btc"); + db.order_book_new_in_table( + "p", + "app", + "markets", + "ETH-USD", 
+ order_req( + "placeholder", + "alice", + "eth-1", + OrderSide::Bid, + TimeInForce::Gtc, + false, + 50, + 3, + ), + ) + .await + .expect("new eth"); + + let btc_scoped = scoped_instrument("markets", "BTC-USD"); + let btc_id = load_order_id(&db, &btc_scoped, "alice", "btc-1").await; + db.order_book_cancel_by_client_id("p", "app", &btc_scoped, "btc-1", "alice") + .await + .expect("cancel by client id"); + let btc_order = load_order(&db, &btc_scoped, btc_id).await; + assert_eq!(btc_order.status, OrderStatus::Cancelled); + + db.order_book_cancel_by_client_id("p", "app", &btc_scoped, "btc-1", "alice") + .await + .expect("idempotent cancel by client id"); + + let eth_scoped = scoped_instrument("markets", "ETH-USD"); + let eth_id = load_order_id(&db, ð_scoped, "alice", "eth-1").await; + let eth_order = load_order(&db, ð_scoped, eth_id).await; + assert_eq!(eth_order.status, OrderStatus::Open); + assert_eq!(decode_u64_u256(eth_order.remaining_qty_be), 3); +} + +#[tokio::test] +async fn per_asset_table_mode_is_enforced() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + db.order_book_define_table("p", "app", "BTC-USD", OrderBookTableMode::PerAsset) + .await + .expect("define table"); + + let ok = db + .order_book_new_in_table( + "p", + "app", + "BTC-USD", + "BTC-USD", + order_req( + "placeholder", + "alice", + "btc-pa", + OrderSide::Bid, + TimeInForce::Gtc, + false, + 100, + 1, + ), + ) + .await; + assert!(ok.is_ok()); + + let err = db + .order_book_new_in_table( + "p", + "app", + "BTC-USD", + "ETH-USD", + order_req( + "placeholder", + "alice", + "eth-pa", + OrderSide::Bid, + TimeInForce::Gtc, + false, + 50, + 1, + ), + ) + .await + .expect_err("per asset should reject different asset id"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn market_order_without_liquidity_rejects() { + let dir = tempdir().expect("temp"); + 
let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let mut req = order_req( + "BTC-USD", + "alice", + "mkt-1", + OrderSide::Bid, + TimeInForce::Ioc, + false, + 0, + 1, + ); + req.order_type = OrderType::Market; + let err = db + .order_book_new("p", "app", req) + .await + .expect_err("market with no liquidity should reject"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn self_trade_prevention_modes_apply() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new("p", "app", { + let mut req = order_req( + "BTC-USD", + "alice", + "resting", + OrderSide::Ask, + TimeInForce::Gtc, + false, + 100, + 5, + ); + req.nonce = 1; + req + }) + .await + .expect("seed"); + let resting_id = load_order_id(&db, "BTC-USD", "alice", "resting").await; + + db.order_book_new("p", "app", { + let mut req = order_req_with_stp( + "BTC-USD", + "alice", + "aggr-cancel-resting", + OrderSide::Bid, + 100, + 2, + SelfTradePrevention::CancelResting, + ); + req.nonce = 2; + req + }) + .await + .expect("stp cancel resting"); + let resting = load_order(&db, "BTC-USD", resting_id).await; + assert_eq!(resting.status, OrderStatus::Cancelled); + + db.order_book_new("p", "app", { + let mut req = order_req( + "BTC-USD", + "alice", + "resting-2", + OrderSide::Ask, + TimeInForce::Gtc, + false, + 101, + 5, + ); + req.nonce = 3; + req + }) + .await + .expect("seed2"); + let resting2_id = load_order_id(&db, "BTC-USD", "alice", "resting-2").await; + + db.order_book_new("p", "app", { + let mut req = order_req_with_stp( + "BTC-USD", + "alice", + "aggr-cancel-both", + OrderSide::Bid, + 101, + 2, + SelfTradePrevention::CancelBoth, + ); + req.nonce = 4; + req + }) + .await + .expect("stp cancel both"); + let resting2 = load_order(&db, "BTC-USD", resting2_id).await; + 
assert_eq!(resting2.status, OrderStatus::Cancelled); + + db.order_book_new("p", "app", { + let mut req = order_req( + "BTC-USD", + "alice", + "resting-3", + OrderSide::Ask, + TimeInForce::Gtc, + false, + 102, + 5, + ); + req.nonce = 5; + req + }) + .await + .expect("seed3"); + let resting3_id = load_order_id(&db, "BTC-USD", "alice", "resting-3").await; + + db.order_book_new("p", "app", { + let mut req = order_req_with_stp( + "BTC-USD", + "alice", + "aggr-cancel-aggressor", + OrderSide::Bid, + 102, + 2, + SelfTradePrevention::CancelAggressor, + ); + req.nonce = 6; + req + }) + .await + .expect("stp cancel aggressor"); + let resting3 = load_order(&db, "BTC-USD", resting3_id).await; + assert_eq!(resting3.status, OrderStatus::Open); +} + +#[tokio::test] +async fn instrument_config_and_halt_are_enforced() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_set_instrument_config( + "p", + "app", + "BTC-USD", + InstrumentConfig { + instrument: "BTC-USD".to_string(), + tick_size: 1, + lot_size_be: u256_be(2), + min_price_ticks: 10, + max_price_ticks: 200, + market_order_price_band: Some(20), + halted: false, + balance_config: None, + }, + ) + .await + .expect("set config"); + + let err = db + .order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "alice", + "bad-lot", + OrderSide::Bid, + TimeInForce::Gtc, + false, + 100, + 3, + ), + ) + .await + .expect_err("lot size should reject"); + assert!(matches!(err, AedbError::Validation(_))); + + db.order_book_set_instrument_halted("p", "app", "BTC-USD", true) + .await + .expect("halt"); + let err = db + .order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "alice", + "halted", + OrderSide::Bid, + TimeInForce::Gtc, + false, + 100, + 2, + ), + ) + .await + .expect_err("halted should reject"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn 
execution_report_is_persisted() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(Default::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + db.order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "maker", + "mk-rpt", + OrderSide::Ask, + TimeInForce::Gtc, + false, + 100, + 2, + ), + ) + .await + .expect("seed"); + db.order_book_new( + "p", + "app", + order_req( + "BTC-USD", + "taker", + "tk-rpt", + OrderSide::Bid, + TimeInForce::Ioc, + false, + 100, + 2, + ), + ) + .await + .expect("cross"); + + let report_key = aedb::order_book::key_execution_report_last("BTC-USD"); + let report = db + .kv_get_no_auth("p", "app", &report_key, ConsistencyMode::AtLatest) + .await + .expect("report query") + .expect("report exists"); + let decoded: aedb::order_book::ExecutionReport = if report.value.len() == 16 { + let commit_seq = u64::from_be_bytes(report.value[0..8].try_into().expect("seq bytes")); + let order_id = u64::from_be_bytes(report.value[8..16].try_into().expect("id bytes")); + let full = db + .kv_get_no_auth( + "p", + "app", + &key_execution_report("BTC-USD", commit_seq, order_id), + ConsistencyMode::AtLatest, + ) + .await + .expect("report by pointer query") + .expect("report by pointer exists"); + rmp_serde::from_slice(&full.value).expect("decode pointed report") + } else { + rmp_serde::from_slice(&report.value).expect("decode inline report") + }; + assert_eq!(decoded.client_order_id, "tk-rpt"); + assert!(decoded.seq > 0); +} diff --git a/tests/order_book_simulation.rs b/tests/order_book_simulation.rs new file mode 100644 index 0000000..aa76558 --- /dev/null +++ b/tests/order_book_simulation.rs @@ -0,0 +1,322 @@ +use aedb::AedbInstance; +use aedb::error::AedbError; +use aedb::order_book::{ + ExecInstruction, InstrumentConfig, OrderRequest, OrderSide, OrderStatus, OrderType, + TimeInForce, parse_plqty_price, +}; +use aedb::query::plan::ConsistencyMode; +use rand::rngs::StdRng; +use rand::{Rng, 
SeedableRng}; +use std::collections::BTreeMap; +use std::sync::Arc; +use tempfile::tempdir; + +fn u256_be(v: u64) -> [u8; 32] { + let mut out = [0u8; 32]; + out[24..].copy_from_slice(&v.to_be_bytes()); + out +} + +fn decode_u256_u64(bytes: [u8; 32]) -> u64 { + let mut out = [0u8; 8]; + out.copy_from_slice(&bytes[24..]); + u64::from_be_bytes(out) +} + +fn decode_u256_bytes_to_u64(bytes: &[u8]) -> u64 { + assert_eq!(bytes.len(), 32); + let mut out = [0u8; 8]; + out.copy_from_slice(&bytes[24..]); + u64::from_be_bytes(out) +} + +fn request( + instrument: &str, + owner: &str, + client_order_id: String, + side: OrderSide, + order_type: OrderType, + tif: TimeInForce, + post_only: bool, + price_ticks: i64, + qty: u64, + nonce: u64, +) -> OrderRequest { + OrderRequest { + instrument: instrument.to_string(), + client_order_id, + side, + order_type, + time_in_force: tif, + exec_instructions: ExecInstruction(if post_only { + ExecInstruction::POST_ONLY + } else { + 0 + }), + self_trade_prevention: aedb::order_book::SelfTradePrevention::None, + price_ticks, + qty_be: u256_be(qty), + owner: owner.to_string(), + account: None, + nonce, + price_limit_ticks: None, + } +} + +async fn setup_books(db: &AedbInstance, assets: &[String]) { + db.create_project("p").await.expect("project"); + for asset in assets { + db.order_book_set_instrument_config( + "p", + "app", + asset, + InstrumentConfig { + instrument: asset.clone(), + tick_size: 1, + lot_size_be: u256_be(1), + min_price_ticks: 1, + max_price_ticks: 1_000_000, + market_order_price_band: Some(50), + halted: false, + balance_config: None, + }, + ) + .await + .expect("config"); + + // Seed symmetric depth around 1_000 ticks. 
+ for i in 0..20_u64 { + let ask_owner = format!("seed_ask_{}_{}", asset, i); + db.order_book_new( + "p", + "app", + request( + asset, + &ask_owner, + format!("seed-a-{i}"), + OrderSide::Ask, + OrderType::Limit, + TimeInForce::Gtc, + false, + 1_000 + i as i64, + 10, + 1, + ), + ) + .await + .expect("seed ask"); + + let bid_owner = format!("seed_bid_{}_{}", asset, i); + db.order_book_new( + "p", + "app", + request( + asset, + &bid_owner, + format!("seed-b-{i}"), + OrderSide::Bid, + OrderType::Limit, + TimeInForce::Gtc, + false, + 999 - i as i64, + 10, + 1, + ), + ) + .await + .expect("seed bid"); + } + } +} + +async fn run_simulation( + assets: Vec, + traders: usize, + ops_per_trader: usize, +) -> Arc { + let dir = tempdir().expect("temp"); + let db = Arc::new(AedbInstance::open(Default::default(), dir.path()).expect("open")); + setup_books(&db, &assets).await; + + let mut tasks = Vec::with_capacity(traders); + for t in 0..traders { + let db_clone = Arc::clone(&db); + let assets_clone = assets.clone(); + tasks.push(tokio::spawn(async move { + let owner = format!("trader_{t}"); + let mut nonces: BTreeMap = BTreeMap::new(); + let mut rng = StdRng::seed_from_u64(42 + t as u64); + + for op in 0..ops_per_trader { + let asset = &assets_clone[rng.gen_range(0..assets_clone.len())]; + let nonce = nonces.entry(asset.clone()).or_insert(0); + *nonce += 1; + + let side = if rng.gen_bool(0.5) { + OrderSide::Bid + } else { + OrderSide::Ask + }; + let price = 995 + rng.gen_range(0..12) as i64; + let qty = 1 + rng.gen_range(0..5) as u64; + let tif = if rng.gen_bool(0.7) { + TimeInForce::Ioc + } else { + TimeInForce::Fok + }; + let order_type = if rng.gen_bool(0.1) { + OrderType::Market + } else { + OrderType::Limit + }; + let post_only = order_type == OrderType::Limit && rng.gen_bool(0.05); + + let res = db_clone + .order_book_new( + "p", + "app", + request( + asset, + &owner, + format!("{owner}-{op}"), + side, + order_type, + tif, + post_only, + price, + qty, + *nonce, + ), + ) + 
.await; + + if let Err(err) = res { + // Expected rejects under stress: FOK, market no liquidity, post-only crossing. + match err { + AedbError::Validation(_) => {} + other => panic!("unexpected simulation error: {other:?}"), + } + } + + // Periodically exercise lifecycle primitives. + if op % 100 == 0 { + *nonce += 1; + let cid = format!("gtc-{owner}-{op}"); + let _ = db_clone + .order_book_new( + "p", + "app", + request( + asset, + &owner, + cid.clone(), + side, + OrderType::Limit, + TimeInForce::Gtc, + false, + price, + qty, + *nonce, + ), + ) + .await; + let _ = db_clone + .order_book_cancel_by_client_id("p", "app", asset, &cid, &owner) + .await; + } + } + })); + } + + for task in tasks { + task.await.expect("task join"); + } + + db +} + +async fn assert_book_invariants(db: &AedbInstance, assets: &[String]) { + for asset in assets { + let mut from_orders: BTreeMap<(u8, i64), u64> = BTreeMap::new(); + + let rows = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{asset}:ord:").as_bytes(), + 1_000_000, + ConsistencyMode::AtLatest, + ) + .await + .expect("scan orders"); + + for (_, entry) in rows { + let order: aedb::order_book::OrderRecord = + rmp_serde::from_slice(&entry.value).expect("decode order"); + let original = decode_u256_u64(order.original_qty_be); + let remaining = decode_u256_u64(order.remaining_qty_be); + let filled = decode_u256_u64(order.filled_qty_be); + assert!( + remaining + filled <= original, + "quantity accounting invariant" + ); + if remaining > 0 + && matches!( + order.status, + OrderStatus::Open | OrderStatus::PartiallyFilled + ) + { + *from_orders + .entry((order.side as u8, order.price_ticks)) + .or_insert(0) += remaining; + } + } + + let mut from_levels: BTreeMap<(u8, i64), u64> = BTreeMap::new(); + for side in [OrderSide::Bid, OrderSide::Ask] { + let levels = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{asset}:plqty:{}:", side as u8).as_bytes(), + 1_000_000, + ConsistencyMode::AtLatest, + ) + .await + 
.expect("scan levels"); + for (k, v) in levels { + let qty = decode_u256_bytes_to_u64(&v.value); + if qty == 0 { + continue; + } + let price = parse_plqty_price(side, &k).expect("parse level price"); + from_levels.insert((side as u8, price), qty); + } + } + + assert_eq!( + from_orders, from_levels, + "price-level aggregates must match open orders for {asset}" + ); + } +} + +#[tokio::test] +async fn order_book_simulation_smoke() { + let assets = vec!["BTC-USD".to_string(), "ETH-USD".to_string()]; + let db = run_simulation(assets.clone(), 6, 250).await; + assert_book_invariants(&db, &assets).await; +} + +#[tokio::test] +#[ignore = "long-running high-frequency simulation"] +async fn order_book_simulation_hft_soak() { + let assets = vec![ + "BTC-USD".to_string(), + "ETH-USD".to_string(), + "SOL-USD".to_string(), + "DOGE-USD".to_string(), + ]; + let db = run_simulation(assets.clone(), 24, 2_000).await; + assert_book_invariants(&db, &assets).await; +} From eb9699a5289fc7d4a9a6109b598b60845c5111db Mon Sep 17 00:00:00 2001 From: johnny Date: Fri, 20 Feb 2026 05:27:51 -0500 Subject: [PATCH 2/4] setup orderbook testing --- .github/workflows/aedb-tests.yml | 50 +- Cargo.lock | 1 + README.md | 12 + crates/aedb-orderbook/Cargo.toml | 1 + .../aedb-orderbook/src/bin/orderbook_perf.rs | 52 +- .../src/bin/orderbook_perf_guard.rs | 54 +- crates/aedb-orderbook/src/lib.rs | 354 +++++- .../tests/adversarial_slo_sla.rs | 172 +++ .../aedb-orderbook/tests/chaos_ci_profile.rs | 147 +++ .../tests/property_randomized_matrix.rs | 89 ++ docs/SECURITY_ACCEPTANCE_CRITERIA.md | 60 + docs/SECURITY_OPERATIONS_RUNBOOK.md | 54 + scripts/run_orderbook_realism_docker.sh | 56 + scripts/security_gate.sh | 33 + src/commit/executor/internals.rs | 50 +- src/commit/executor/mod.rs | 21 + src/lib.rs | 916 ++++++++++++++- src/lib_helpers.rs | 26 +- src/lib_tests.rs | 1022 ++++++++++++++++- src/order_book.rs | 156 ++- tests/order_book_simulation.rs | 261 ++++- tests/security_properties.rs | 155 +++ 
tests/security_properties_proptest.rs | 196 ++++ 23 files changed, 3755 insertions(+), 183 deletions(-) create mode 100644 crates/aedb-orderbook/tests/adversarial_slo_sla.rs create mode 100644 crates/aedb-orderbook/tests/chaos_ci_profile.rs create mode 100644 crates/aedb-orderbook/tests/property_randomized_matrix.rs create mode 100644 docs/SECURITY_ACCEPTANCE_CRITERIA.md create mode 100644 docs/SECURITY_OPERATIONS_RUNBOOK.md create mode 100755 scripts/run_orderbook_realism_docker.sh create mode 100755 scripts/security_gate.sh create mode 100644 tests/security_properties.rs create mode 100644 tests/security_properties_proptest.rs diff --git a/.github/workflows/aedb-tests.yml b/.github/workflows/aedb-tests.yml index 73d3058..03708b7 100644 --- a/.github/workflows/aedb-tests.yml +++ b/.github/workflows/aedb-tests.yml @@ -38,10 +38,35 @@ jobs: - name: L1 balance conservation (must-pass smoke) run: cargo test --test stress arcana_l1_balance_conservation_under_load - crash: + security_strict: needs: integration runs-on: ubuntu-latest timeout-minutes: 45 + steps: + - uses: actions/checkout@v4 + - uses: dtolnay/rust-toolchain@stable + - uses: Swatinem/rust-cache@v2 + - name: Security boundaries + run: | + cargo test --test security_boundaries -- --test-threads=1 + cargo test --test security_properties -- --test-threads=1 + - name: Security property tests (fuzz-style randomized) + env: + PROPTEST_CASES: "16" + run: cargo test --test security_properties_proptest -- --test-threads=1 + - name: Strict backup hash-chain acceptance + run: | + cargo test --test backup_restore strict_backup_chain_restore_succeeds_with_hash_chain_enforcement -- --test-threads=1 + cargo test --test backup_restore strict_backup_chain_restore_rejects_tampered_incremental_segment -- --test-threads=1 + - name: Idempotency and audit trail behavior + run: | + cargo test --test read_assertions integration_idempotent_retry_skips_assertion_re_evaluation -- --test-threads=1 + cargo test --test read_assertions 
integration_failed_assertion_is_logged_to_system_audit_table -- --test-threads=1 + + crash: + needs: security_strict + runs-on: ubuntu-latest + timeout-minutes: 45 steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable @@ -50,7 +75,6 @@ jobs: run: cargo test --test crash_matrix -- --test-threads=1 crash_longrun: - if: github.event_name == 'schedule' || startsWith(github.ref, 'refs/heads/release/') needs: crash runs-on: ubuntu-latest timeout-minutes: 90 @@ -63,8 +87,7 @@ jobs: - name: A17b durability crash loop run: cargo test --test crash_matrix crash_matrix_a17b_thousand_crash_cycles_preserve_state -- --ignored --test-threads=1 - stress: - if: github.event_name == 'schedule' || startsWith(github.ref, 'refs/heads/release/') + chaos_longrun: needs: crash_longrun runs-on: ubuntu-latest timeout-minutes: 90 @@ -72,8 +95,23 @@ jobs: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable - uses: Swatinem/rust-cache@v2 - - name: Full ignored stress suite - run: cargo test --test stress -- --ignored --test-threads=1 + - name: Orderbook adversarial randomized matrix + run: cargo test -p aedb-orderbook --test property_randomized_matrix -- --test-threads=1 + - name: Orderbook adversarial SLO/SLA gate + env: + AEDB_ORDERBOOK_SLA_MIN_ATTEMPTED_TPS: "600" + AEDB_ORDERBOOK_SLA_MAX_P99_US: "1000000" + AEDB_ORDERBOOK_SLA_MAX_FINALITY_GAP: "10000" + AEDB_ORDERBOOK_SLA_MAX_PRIMARY_REJECT_RATIO_PPM: "900000" + run: cargo test -p aedb-orderbook --test adversarial_slo_sla -- --test-threads=1 + - name: Orderbook CI chaos profile + run: cargo test -p aedb-orderbook --test chaos_ci_profile -- --test-threads=1 + - name: Orderbook soak profiles (mandatory) + run: | + cargo test -p aedb-orderbook --test simulation_smoke simulation_soak_multi_asset_mixed -- --ignored --test-threads=1 + cargo test -p aedb-orderbook --test simulation_smoke simulation_soak_single_asset_contention_limit -- --ignored --test-threads=1 + - name: Core orderbook chaos read/write accuracy + 
run: cargo test --test order_book_simulation order_book_chaos_read_write_accuracy -- --test-threads=1 benchmark: if: github.event_name == 'schedule' || startsWith(github.ref, 'refs/heads/release/') diff --git a/Cargo.lock b/Cargo.lock index 756838b..c1e433e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -54,6 +54,7 @@ version = "0.1.0" dependencies = [ "aedb", "primitive-types", + "proptest", "rand 0.8.5", "rmp-serde", "serde", diff --git a/README.md b/README.md index 3bb747f..0aea2d1 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,12 @@ AEDB supports permission-aware APIs via `CallerContext` and `Permission`. - `open_production` and `open_secure` require authenticated `*_as` calls - `open_secure` enforces hardened durability/recovery settings - table/KV/query access can be scoped per project/scope/resource +- `authz_audit` and `assertion_audit` system tables provide built-in audit trails + +Security/operations docs: + +- `docs/SECURITY_ACCEPTANCE_CRITERIA.md` +- `docs/SECURITY_OPERATIONS_RUNBOOK.md` ## Operational APIs @@ -186,6 +192,12 @@ cargo test --test crash_matrix cargo test --test stress ``` +Security acceptance gate (mandatory profile): + +```bash +./scripts/security_gate.sh +``` + ## License Dual-licensed under: diff --git a/crates/aedb-orderbook/Cargo.toml b/crates/aedb-orderbook/Cargo.toml index a926c96..65d6e68 100644 --- a/crates/aedb-orderbook/Cargo.toml +++ b/crates/aedb-orderbook/Cargo.toml @@ -16,3 +16,4 @@ rmp-serde = "1" [dev-dependencies] tokio = { version = "1", features = ["macros", "rt-multi-thread"] } +proptest = "1" diff --git a/crates/aedb-orderbook/src/bin/orderbook_perf.rs b/crates/aedb-orderbook/src/bin/orderbook_perf.rs index b4c96b5..859ddb5 100644 --- a/crates/aedb-orderbook/src/bin/orderbook_perf.rs +++ b/crates/aedb-orderbook/src/bin/orderbook_perf.rs @@ -211,15 +211,19 @@ async fn main() { } println!( - 
"scenario,attempted,accepted,rejected,elapsed_ms,attempted_ops_s,accepted_ops_s,rejected_ops_s,lat_avg_us,lat_p50_us,lat_p95_us,lat_p99_us,lat_max_us,max_finality_gap,visible_head,durable_head,zero_dropped,durability" + "scenario,attempted,accepted,rejected,elapsed_ms,attempted_ops_s,accepted_ops_s,rejected_ops_s,lat_avg_us,lat_p50_us,lat_p95_us,lat_p99_us,lat_max_us,max_finality_gap,visible_head,durable_head,zero_dropped,durability,lifecycle_attempted,lifecycle_accepted,lifecycle_rejected,primary_reject_conflict,primary_reject_post_only,primary_reject_fok,primary_reject_no_liquidity,primary_reject_nonce,primary_reject_duplicate_cid,primary_reject_other_validation,lifecycle_reject_conflict,lifecycle_reject_post_only,lifecycle_reject_fok,lifecycle_reject_no_liquidity,lifecycle_reject_nonce,lifecycle_reject_duplicate_cid,lifecycle_reject_other_validation" ); for scenario in scenario_matrix(&scale) { let (sim_cfg, db_cfg) = cfg_for_scenario(&scenario); - let report = run_hft_simulation_with_config(sim_cfg, db_cfg) - .await - .unwrap_or_else(|e| panic!("scenario {} failed: {e}", scenario.name)); + let report = match run_hft_simulation_with_config(sim_cfg, db_cfg).await { + Ok(report) => report, + Err(e) => { + eprintln!("scenario {} failed: {e}", scenario.name); + std::process::exit(1); + } + }; println!( - "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}", + "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}", scenario.name, report.simulation.attempted_orders, report.simulation.accepted_orders, @@ -238,6 +242,44 @@ async fn main() { report.simulation.durable_head_seq, report.zero_dropped_orders, report.durability_mode, + report.simulation.lifecycle_attempted_ops, + report.simulation.lifecycle_accepted_ops, + report.simulation.lifecycle_rejected_ops, + report.simulation.rejection_breakdown.conflict, + report.simulation.rejection_breakdown.post_only_would_cross, + 
report.simulation.rejection_breakdown.fok_cannot_fill, + report.simulation.rejection_breakdown.market_no_liquidity, + report.simulation.rejection_breakdown.nonce_too_low, + report + .simulation + .rejection_breakdown + .duplicate_client_order_id, + report.simulation.rejection_breakdown.other_validation, + report.simulation.lifecycle_rejection_breakdown.conflict, + report + .simulation + .lifecycle_rejection_breakdown + .post_only_would_cross, + report + .simulation + .lifecycle_rejection_breakdown + .fok_cannot_fill, + report + .simulation + .lifecycle_rejection_breakdown + .market_no_liquidity, + report + .simulation + .lifecycle_rejection_breakdown + .nonce_too_low, + report + .simulation + .lifecycle_rejection_breakdown + .duplicate_client_order_id, + report + .simulation + .lifecycle_rejection_breakdown + .other_validation, ); } } diff --git a/crates/aedb-orderbook/src/bin/orderbook_perf_guard.rs b/crates/aedb-orderbook/src/bin/orderbook_perf_guard.rs index 5230837..324a053 100644 --- a/crates/aedb-orderbook/src/bin/orderbook_perf_guard.rs +++ b/crates/aedb-orderbook/src/bin/orderbook_perf_guard.rs @@ -77,21 +77,63 @@ async fn main() { let mut failures = Vec::new(); println!( - "scenario,attempted_tps,accepted_tps,rejected_tps,min_attempted_tps,zero_dropped,max_finality_gap" + "scenario,attempted_tps,accepted_tps,rejected_tps,min_attempted_tps,zero_dropped,max_finality_gap,lifecycle_attempted,lifecycle_accepted,lifecycle_rejected,primary_reject_conflict,primary_reject_post_only,primary_reject_fok,primary_reject_no_liquidity,primary_reject_nonce,primary_reject_duplicate_cid,primary_reject_other_validation,lifecycle_reject_conflict,lifecycle_reject_post_only,lifecycle_reject_fok,lifecycle_reject_no_liquidity,lifecycle_reject_nonce,lifecycle_reject_duplicate_cid,lifecycle_reject_other_validation" ); for s in scenarios() { - let report = run_hft_simulation_with_config(s.cfg.clone(), db_cfg.clone()) - .await - .unwrap_or_else(|e| panic!("scenario {} failed to run: 
{e}", s.name)); + let report = match run_hft_simulation_with_config(s.cfg.clone(), db_cfg.clone()).await { + Ok(report) => report, + Err(e) => { + eprintln!("scenario {} failed to run: {e}", s.name); + std::process::exit(1); + } + }; println!( - "{},{},{},{},{},{},{}", + "{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{},{}", s.name, report.attempted_ops_per_sec, report.accepted_ops_per_sec, report.rejected_ops_per_sec, s.min_attempted_tps, report.zero_dropped_orders, - report.max_commit_finality_gap + report.max_commit_finality_gap, + report.simulation.lifecycle_attempted_ops, + report.simulation.lifecycle_accepted_ops, + report.simulation.lifecycle_rejected_ops, + report.simulation.rejection_breakdown.conflict, + report.simulation.rejection_breakdown.post_only_would_cross, + report.simulation.rejection_breakdown.fok_cannot_fill, + report.simulation.rejection_breakdown.market_no_liquidity, + report.simulation.rejection_breakdown.nonce_too_low, + report + .simulation + .rejection_breakdown + .duplicate_client_order_id, + report.simulation.rejection_breakdown.other_validation, + report.simulation.lifecycle_rejection_breakdown.conflict, + report + .simulation + .lifecycle_rejection_breakdown + .post_only_would_cross, + report + .simulation + .lifecycle_rejection_breakdown + .fok_cannot_fill, + report + .simulation + .lifecycle_rejection_breakdown + .market_no_liquidity, + report + .simulation + .lifecycle_rejection_breakdown + .nonce_too_low, + report + .simulation + .lifecycle_rejection_breakdown + .duplicate_client_order_id, + report + .simulation + .lifecycle_rejection_breakdown + .other_validation ); if report.attempted_ops_per_sec < s.min_attempted_tps || !report.zero_dropped_orders { failures.push(format!( diff --git a/crates/aedb-orderbook/src/lib.rs b/crates/aedb-orderbook/src/lib.rs index 4ea1701..18e8dfc 100644 --- a/crates/aedb-orderbook/src/lib.rs +++ b/crates/aedb-orderbook/src/lib.rs @@ -57,6 +57,11 @@ pub struct SimulationReport { 
pub attempted_orders: usize, pub accepted_orders: usize, pub rejected_orders: usize, + pub lifecycle_attempted_ops: usize, + pub lifecycle_accepted_ops: usize, + pub lifecycle_rejected_ops: usize, + pub lifecycle_rejection_breakdown: RejectionBreakdown, + pub rejection_breakdown: RejectionBreakdown, pub max_commit_seq: u64, pub visible_head_seq: u64, pub durable_head_seq: u64, @@ -64,6 +69,21 @@ pub struct SimulationReport { pub table_profile: TableProfile, } +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)] +pub struct RejectionBreakdown { + pub conflict: usize, + pub post_only_would_cross: usize, + pub fok_cannot_fill: usize, + pub market_no_liquidity: usize, + pub instrument_halted: usize, + pub nonce_too_low: usize, + pub duplicate_client_order_id: usize, + pub lot_size_violation: usize, + pub qty_non_positive: usize, + pub price_out_of_bounds: usize, + pub other_validation: usize, +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct LatencyStats { pub samples: usize, @@ -126,11 +146,16 @@ fn decode_u256_u64(bytes: [u8; 32]) -> u64 { u64::from_be_bytes(out) } -fn decode_u256_bytes_to_u64(bytes: &[u8]) -> u64 { - assert_eq!(bytes.len(), 32); +fn decode_u256_bytes_to_u64(bytes: &[u8]) -> Result { + if bytes.len() != 32 { + return Err(AedbError::Validation(format!( + "invalid u256 byte length: {}", + bytes.len() + ))); + } let mut out = [0u8; 8]; out.copy_from_slice(&bytes[24..]); - u64::from_be_bytes(out) + Ok(u64::from_be_bytes(out)) } fn request( @@ -171,7 +196,7 @@ async fn setup_books( assets: &[String], table_profile: &TableProfile, ) -> Result, AedbError> { - db.create_project("p").await.expect("project"); + db.create_project("p").await?; let mut instruments = Vec::with_capacity(assets.len()); let multi_table_id = match table_profile { TableProfile::MultiAssetTable { table_id } => Some(table_id.clone()), @@ -252,7 +277,10 @@ async fn setup_books( Ok(instruments) } -async fn assert_book_invariants(db: 
&AedbInstance, instruments: &[String]) { +async fn assert_book_invariants( + db: &AedbInstance, + instruments: &[String], +) -> Result<(), AedbError> { for instrument in instruments { let mut from_orders: BTreeMap<(u8, i64), u64> = BTreeMap::new(); @@ -265,18 +293,19 @@ async fn assert_book_invariants(db: &AedbInstance, instruments: &[String]) { ConsistencyMode::AtLatest, ) .await - .expect("scan orders"); + .map_err(|e| AedbError::Validation(e.to_string()))?; for (_, entry) in rows { - let order: aedb::order_book::OrderRecord = - rmp_serde::from_slice(&entry.value).expect("decode order"); + let order: aedb::order_book::OrderRecord = rmp_serde::from_slice(&entry.value) + .map_err(|e| AedbError::Decode(e.to_string()))?; let original = decode_u256_u64(order.original_qty_be); let remaining = decode_u256_u64(order.remaining_qty_be); let filled = decode_u256_u64(order.filled_qty_be); - assert!( - remaining + filled <= original, - "quantity accounting invariant" - ); + if remaining + filled > original { + return Err(AedbError::Validation(format!( + "quantity accounting invariant violated for {instrument}" + ))); + } if remaining > 0 && matches!( order.status, @@ -300,22 +329,28 @@ async fn assert_book_invariants(db: &AedbInstance, instruments: &[String]) { ConsistencyMode::AtLatest, ) .await - .expect("scan levels"); + .map_err(|e| AedbError::Validation(e.to_string()))?; for (k, v) in levels { - let qty = decode_u256_bytes_to_u64(&v.value); + let qty = decode_u256_bytes_to_u64(&v.value)?; if qty == 0 { continue; } - let price = parse_plqty_price(side, &k).expect("parse level price"); + let price = parse_plqty_price(side, &k).ok_or_else(|| { + AedbError::Validation(format!( + "failed to parse plqty price for instrument {instrument}" + )) + })?; from_levels.insert((side as u8, price), qty); } } - assert_eq!( - from_orders, from_levels, - "price-level aggregates must match open orders for {instrument}" - ); + if from_orders != from_levels { + return 
Err(AedbError::Validation(format!( + "price-level aggregates mismatch for {instrument}" + ))); + } } + Ok(()) } pub async fn run_hft_simulation(cfg: SimulationConfig) -> Result { @@ -334,8 +369,8 @@ pub async fn run_hft_simulation_with_config( DurabilityMode::OsBuffered => "os_buffered", } .to_string(); - let dir = tempdir().expect("temp"); - let db = Arc::new(AedbInstance::open(db_cfg, dir.path()).expect("open")); + let dir = tempdir().map_err(AedbError::Io)?; + let db = Arc::new(AedbInstance::open(db_cfg, dir.path())?); let instruments = setup_books(&db, &cfg.assets, &cfg.table_profile).await?; let run_started = Instant::now(); @@ -354,6 +389,11 @@ pub async fn run_hft_simulation_with_config( let mut rng = StdRng::seed_from_u64(seed + t as u64); let mut accepted = 0usize; let mut rejected = 0usize; + let mut lifecycle_attempted = 0usize; + let mut lifecycle_accepted = 0usize; + let mut lifecycle_rejected = 0usize; + let mut rejection_breakdown = RejectionBreakdown::default(); + let mut lifecycle_rejection_breakdown = RejectionBreakdown::default(); let mut max_commit_seq = 0u64; let mut max_finality_gap = 0u64; let mut latencies_us = if cfg.collect_latency { @@ -438,19 +478,21 @@ pub async fn run_hft_simulation_with_config( &mut latencies_us, &mut accepted, &mut rejected, + &mut rejection_breakdown, &mut max_commit_seq, &mut max_finality_gap, ) .await { - panic!("unexpected simulation error: {other:?}"); + return Err(other); } } if cfg.lifecycle_every_ops > 0 && op % cfg.lifecycle_every_ops == 0 { *nonce += 1; let cid = format!("gtc-{owner}-{op}"); - let _ = db_clone + lifecycle_attempted += 1; + match db_clone .order_book_new( "p", "app", @@ -467,10 +509,25 @@ pub async fn run_hft_simulation_with_config( *nonce, ), ) - .await; - let _ = db_clone + .await + { + Ok(_) => lifecycle_accepted += 1, + Err(err) => { + lifecycle_rejected += 1; + record_rejection_error(&mut lifecycle_rejection_breakdown, err, 1)?; + } + } + lifecycle_attempted += 1; + match db_clone 
.order_book_cancel_by_client_id("p", "app", instrument, &cid, &owner) - .await; + .await + { + Ok(_) => lifecycle_accepted += 1, + Err(err) => { + lifecycle_rejected += 1; + record_rejection_error(&mut lifecycle_rejection_breakdown, err, 1)?; + } + } } } if let Err(other) = flush_pending_orders( @@ -482,38 +539,58 @@ pub async fn run_hft_simulation_with_config( &mut latencies_us, &mut accepted, &mut rejected, + &mut rejection_breakdown, &mut max_commit_seq, &mut max_finality_gap, ) .await { - panic!("unexpected simulation error: {other:?}"); + return Err(other); } - ( + Ok(( accepted, rejected, + lifecycle_attempted, + lifecycle_accepted, + lifecycle_rejected, + lifecycle_rejection_breakdown, + rejection_breakdown, max_commit_seq, max_finality_gap, latencies_us, - ) + )) })); } let mut accepted_orders = 0usize; let mut rejected_orders = 0usize; + let mut lifecycle_attempted_ops = 0usize; + let mut lifecycle_accepted_ops = 0usize; + let mut lifecycle_rejected_ops = 0usize; + let mut lifecycle_rejection_breakdown = RejectionBreakdown::default(); + let mut rejection_breakdown = RejectionBreakdown::default(); let mut max_commit_seq = 0u64; let mut max_commit_finality_gap = 0u64; let mut all_latencies_us = Vec::new(); for task in tasks { - let (a, r, max_seq, max_gap, mut latencies) = task.await.expect("task join"); + let task_output = task + .await + .map_err(|e| AedbError::Validation(format!("simulation task join failure: {e}")))?; + let (a, r, la, lac, lr, lbreakdown, breakdown, max_seq, max_gap, mut latencies) = + task_output?; accepted_orders += a; rejected_orders += r; + lifecycle_attempted_ops += la; + lifecycle_accepted_ops += lac; + lifecycle_rejected_ops += lr; + merge_rejection_breakdown(&mut lifecycle_rejection_breakdown, &lbreakdown); + merge_rejection_breakdown(&mut rejection_breakdown, &breakdown); max_commit_seq = max_commit_seq.max(max_seq); max_commit_finality_gap = max_commit_finality_gap.max(max_gap); all_latencies_us.append(&mut latencies); } - 
assert_book_invariants(&db, &instruments).await; + assert_book_invariants(&db, &instruments).await?; let elapsed_ms = run_started.elapsed().as_millis().max(1) as u64; db.force_fsync().await?; let heads = db.head_state().await; @@ -532,6 +609,26 @@ pub async fn run_hft_simulation_with_config( let attempted_ops_per_sec = (attempted_orders as u64).saturating_mul(1000) / elapsed_ms; let accepted_ops_per_sec = (accepted_orders as u64).saturating_mul(1000) / elapsed_ms; let rejected_ops_per_sec = (rejected_orders as u64).saturating_mul(1000) / elapsed_ms; + assert_eq!( + total_rejections(&rejection_breakdown), + rejected_orders, + "primary rejection accounting mismatch" + ); + assert_eq!( + total_rejections(&lifecycle_rejection_breakdown), + lifecycle_rejected_ops, + "lifecycle rejection accounting mismatch" + ); + assert_eq!( + accepted_orders + rejected_orders, + attempted_orders, + "primary flow accounting mismatch" + ); + assert_eq!( + lifecycle_accepted_ops + lifecycle_rejected_ops, + lifecycle_attempted_ops, + "lifecycle flow accounting mismatch" + ); let simulation = SimulationReport { assets: cfg.assets, @@ -541,6 +638,11 @@ pub async fn run_hft_simulation_with_config( attempted_orders, accepted_orders, rejected_orders, + lifecycle_attempted_ops, + lifecycle_accepted_ops, + lifecycle_rejected_ops, + lifecycle_rejection_breakdown, + rejection_breakdown, max_commit_seq, visible_head_seq: heads.visible_head_seq, durable_head_seq: heads.durable_head_seq, @@ -597,6 +699,7 @@ async fn flush_pending_orders( latencies_us: &mut Vec, accepted: &mut usize, rejected: &mut usize, + rejection_breakdown: &mut RejectionBreakdown, max_commit_seq: &mut u64, max_finality_gap: &mut u64, ) -> Result<(), AedbError> { @@ -621,10 +724,191 @@ async fn flush_pending_orders( *max_commit_seq = (*max_commit_seq).max(commit.commit_seq); *accepted += batch_len; } - Err(err) => match err { - AedbError::Validation(_) => *rejected += batch_len, - other => return Err(other), - }, + Err(err) => { + 
*rejected += batch_len; + record_rejection_error(rejection_breakdown, err, batch_len)?; + } } Ok(()) } + +fn record_rejection_error( + rejection_breakdown: &mut RejectionBreakdown, + err: AedbError, + count: usize, +) -> Result<(), AedbError> { + match err { + AedbError::Validation(msg) => { + record_validation_rejection(rejection_breakdown, &msg, count); + Ok(()) + } + AedbError::Conflict(_) => { + rejection_breakdown.conflict += count; + Ok(()) + } + other => Err(other), + } +} + +fn record_validation_rejection( + rejection_breakdown: &mut RejectionBreakdown, + msg: &str, + count: usize, +) { + if msg.contains("conflict") { + rejection_breakdown.conflict += count; + } else if msg.contains("post_only would cross") { + rejection_breakdown.post_only_would_cross += count; + } else if msg.contains("fok cannot fill") { + rejection_breakdown.fok_cannot_fill += count; + } else if msg.contains("market order has no liquidity") { + rejection_breakdown.market_no_liquidity += count; + } else if msg.contains("instrument halted") { + rejection_breakdown.instrument_halted += count; + } else if msg.contains("nonce too low") { + rejection_breakdown.nonce_too_low += count; + } else if msg.contains("duplicate client_order_id") { + rejection_breakdown.duplicate_client_order_id += count; + } else if msg.contains("quantity violates lot size") { + rejection_breakdown.lot_size_violation += count; + } else if msg.contains("qty must be > 0") { + rejection_breakdown.qty_non_positive += count; + } else if msg.contains("price outside instrument bounds") { + rejection_breakdown.price_out_of_bounds += count; + } else { + rejection_breakdown.other_validation += count; + } +} + +fn merge_rejection_breakdown(dst: &mut RejectionBreakdown, src: &RejectionBreakdown) { + dst.conflict += src.conflict; + dst.post_only_would_cross += src.post_only_would_cross; + dst.fok_cannot_fill += src.fok_cannot_fill; + dst.market_no_liquidity += src.market_no_liquidity; + dst.instrument_halted += 
src.instrument_halted; + dst.nonce_too_low += src.nonce_too_low; + dst.duplicate_client_order_id += src.duplicate_client_order_id; + dst.lot_size_violation += src.lot_size_violation; + dst.qty_non_positive += src.qty_non_positive; + dst.price_out_of_bounds += src.price_out_of_bounds; + dst.other_validation += src.other_validation; +} + +fn total_rejections(b: &RejectionBreakdown) -> usize { + b.conflict + + b.post_only_would_cross + + b.fok_cannot_fill + + b.market_no_liquidity + + b.instrument_halted + + b.nonce_too_low + + b.duplicate_client_order_id + + b.lot_size_violation + + b.qty_non_positive + + b.price_out_of_bounds + + b.other_validation +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn classify_validation_rejection_reasons() { + let mut breakdown = RejectionBreakdown::default(); + record_validation_rejection(&mut breakdown, "post_only would cross", 2); + record_validation_rejection(&mut breakdown, "fok cannot fill", 3); + record_validation_rejection(&mut breakdown, "market order has no liquidity", 4); + record_validation_rejection(&mut breakdown, "instrument halted", 5); + record_validation_rejection(&mut breakdown, "nonce too low", 6); + record_validation_rejection(&mut breakdown, "duplicate client_order_id", 7); + record_validation_rejection(&mut breakdown, "quantity violates lot size", 8); + record_validation_rejection(&mut breakdown, "qty must be > 0", 9); + record_validation_rejection(&mut breakdown, "price outside instrument bounds", 10); + record_validation_rejection(&mut breakdown, "some other validation", 11); + + assert_eq!(breakdown.post_only_would_cross, 2); + assert_eq!(breakdown.fok_cannot_fill, 3); + assert_eq!(breakdown.market_no_liquidity, 4); + assert_eq!(breakdown.instrument_halted, 5); + assert_eq!(breakdown.nonce_too_low, 6); + assert_eq!(breakdown.duplicate_client_order_id, 7); + assert_eq!(breakdown.lot_size_violation, 8); + assert_eq!(breakdown.qty_non_positive, 9); + assert_eq!(breakdown.price_out_of_bounds, 10); + 
assert_eq!(breakdown.other_validation, 11); + } + + #[test] + fn merge_rejection_breakdowns() { + let mut left = RejectionBreakdown { + conflict: 1, + post_only_would_cross: 2, + fok_cannot_fill: 3, + market_no_liquidity: 4, + instrument_halted: 5, + nonce_too_low: 6, + duplicate_client_order_id: 7, + lot_size_violation: 8, + qty_non_positive: 9, + price_out_of_bounds: 10, + other_validation: 11, + }; + let right = RejectionBreakdown { + conflict: 10, + post_only_would_cross: 20, + fok_cannot_fill: 30, + market_no_liquidity: 40, + instrument_halted: 50, + nonce_too_low: 60, + duplicate_client_order_id: 70, + lot_size_violation: 80, + qty_non_positive: 90, + price_out_of_bounds: 100, + other_validation: 110, + }; + merge_rejection_breakdown(&mut left, &right); + assert_eq!( + left, + RejectionBreakdown { + conflict: 11, + post_only_would_cross: 22, + fok_cannot_fill: 33, + market_no_liquidity: 44, + instrument_halted: 55, + nonce_too_low: 66, + duplicate_client_order_id: 77, + lot_size_violation: 88, + qty_non_positive: 99, + price_out_of_bounds: 110, + other_validation: 121, + } + ); + assert_eq!(total_rejections(&left), 726); + } + + #[test] + fn classify_conflict_wrapped_as_validation() { + let mut breakdown = RejectionBreakdown::default(); + record_validation_rejection( + &mut breakdown, + "read set conflict: key changed under snapshot", + 5, + ); + assert_eq!(breakdown.conflict, 5); + assert_eq!(breakdown.other_validation, 0); + } + + #[test] + fn classify_conflict_error_variant() { + let mut breakdown = RejectionBreakdown::default(); + record_rejection_error(&mut breakdown, AedbError::Conflict("rw-conflict".into()), 3) + .expect("classification must succeed"); + assert_eq!(breakdown.conflict, 3); + } + + #[test] + fn decode_u256_bytes_rejects_invalid_length() { + let err = decode_u256_bytes_to_u64(&[1, 2, 3]).expect_err("must reject short u256"); + assert!(matches!(err, AedbError::Validation(_))); + } +} diff --git 
a/crates/aedb-orderbook/tests/adversarial_slo_sla.rs b/crates/aedb-orderbook/tests/adversarial_slo_sla.rs new file mode 100644 index 0000000..df092e8 --- /dev/null +++ b/crates/aedb-orderbook/tests/adversarial_slo_sla.rs @@ -0,0 +1,172 @@ +use aedb_orderbook::{ + MatchWorkload, OrderFlowProfile, ProfiledSimulationReport, SimulationConfig, TableProfile, + high_throughput_simulation_config, run_hft_simulation_with_config, +}; + +#[derive(Clone)] +struct Scenario { + name: &'static str, + cfg: SimulationConfig, +} + +#[derive(Clone)] +struct SloThresholds { + min_attempted_tps: u64, + max_p99_latency_us: u64, + max_finality_gap: u64, + max_primary_reject_ratio_ppm: u64, +} + +fn env_or_u64(var: &str, default: u64) -> u64 { + std::env::var(var) + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(default) +} + +fn thresholds() -> SloThresholds { + SloThresholds { + min_attempted_tps: env_or_u64("AEDB_ORDERBOOK_SLA_MIN_ATTEMPTED_TPS", 600), + max_p99_latency_us: env_or_u64("AEDB_ORDERBOOK_SLA_MAX_P99_US", 1_000_000), + max_finality_gap: env_or_u64("AEDB_ORDERBOOK_SLA_MAX_FINALITY_GAP", 10_000), + max_primary_reject_ratio_ppm: env_or_u64( + "AEDB_ORDERBOOK_SLA_MAX_PRIMARY_REJECT_RATIO_PPM", + 900_000, + ), + } +} + +fn scenarios() -> Vec { + vec![ + Scenario { + name: "sla_per_asset_crossing_mixed", + cfg: SimulationConfig { + assets: vec!["BTC-USD".into(), "ETH-USD".into(), "SOL-USD".into()], + traders: 12, + ops_per_trader: 600, + seed: 1_001, + flow_profile: OrderFlowProfile::MixedMarketAndLimit, + table_profile: TableProfile::PerAssetTable, + collect_latency: true, + lifecycle_every_ops: 80, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + }, + Scenario { + name: "sla_multi_asset_crossing_mixed", + cfg: SimulationConfig { + assets: vec![ + "BTC-USD".into(), + "ETH-USD".into(), + "SOL-USD".into(), + "DOGE-USD".into(), + "AVAX-USD".into(), + ], + traders: 12, + ops_per_trader: 600, + seed: 2_002, + flow_profile: 
OrderFlowProfile::MixedMarketAndLimit, + table_profile: TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + collect_latency: true, + lifecycle_every_ops: 80, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + }, + Scenario { + name: "sla_multi_asset_no_cross_limit_only", + cfg: SimulationConfig { + assets: vec![ + "BTC-USD".into(), + "ETH-USD".into(), + "SOL-USD".into(), + "DOGE-USD".into(), + ], + traders: 10, + ops_per_trader: 500, + seed: 3_003, + flow_profile: OrderFlowProfile::LimitOnlyIoc, + table_profile: TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::NoCrossIoc, + }, + }, + ] +} + +fn assert_common_invariants(name: &str, report: &ProfiledSimulationReport) { + let sim = &report.simulation; + assert_eq!( + sim.accepted_orders + sim.rejected_orders, + sim.attempted_orders, + "{name}: primary accounting mismatch" + ); + assert_eq!( + sim.lifecycle_accepted_ops + sim.lifecycle_rejected_ops, + sim.lifecycle_attempted_ops, + "{name}: lifecycle accounting mismatch" + ); + assert!( + report.zero_dropped_orders, + "{name}: dropped orders detected" + ); + assert!( + sim.visible_head_seq >= sim.max_commit_seq, + "{name}: visible head below max commit sequence" + ); + assert!( + sim.durable_head_seq >= sim.max_commit_seq, + "{name}: durable head below max commit sequence" + ); +} + +fn assert_slo(name: &str, report: &ProfiledSimulationReport, slo: &SloThresholds) { + let sim = &report.simulation; + let reject_ratio_ppm = if sim.attempted_orders == 0 { + 0 + } else { + (sim.rejected_orders as u64) + .saturating_mul(1_000_000) + .saturating_div(sim.attempted_orders as u64) + }; + assert!( + report.attempted_ops_per_sec >= slo.min_attempted_tps, + "{name}: attempted TPS below SLO ({})", + report.attempted_ops_per_sec + ); + assert!( + report.latency.p99_us <= slo.max_p99_latency_us, + 
"{name}: p99 latency above SLO ({}us)", + report.latency.p99_us + ); + assert!( + report.max_commit_finality_gap <= slo.max_finality_gap, + "{name}: finality gap above SLO ({})", + report.max_commit_finality_gap + ); + assert!( + reject_ratio_ppm <= slo.max_primary_reject_ratio_ppm, + "{name}: reject ratio above SLO ({} ppm)", + reject_ratio_ppm + ); +} + +#[tokio::test] +async fn adversarial_slo_sla_gates() { + let db_cfg = high_throughput_simulation_config(); + let slo = thresholds(); + for scenario in scenarios() { + let report = run_hft_simulation_with_config(scenario.cfg.clone(), db_cfg.clone()) + .await + .unwrap_or_else(|e| panic!("{} failed to run: {e}", scenario.name)); + assert_common_invariants(scenario.name, &report); + assert_slo(scenario.name, &report, &slo); + } +} diff --git a/crates/aedb-orderbook/tests/chaos_ci_profile.rs b/crates/aedb-orderbook/tests/chaos_ci_profile.rs new file mode 100644 index 0000000..1afd0f0 --- /dev/null +++ b/crates/aedb-orderbook/tests/chaos_ci_profile.rs @@ -0,0 +1,147 @@ +use aedb_orderbook::{ + MatchWorkload, OrderFlowProfile, RejectionBreakdown, SimulationConfig, TableProfile, + high_throughput_simulation_config, run_hft_simulation_with_config, +}; + +fn total_rejections(b: &RejectionBreakdown) -> usize { + b.conflict + + b.post_only_would_cross + + b.fok_cannot_fill + + b.market_no_liquidity + + b.instrument_halted + + b.nonce_too_low + + b.duplicate_client_order_id + + b.lot_size_violation + + b.qty_non_positive + + b.price_out_of_bounds + + b.other_validation +} + +#[derive(Clone)] +struct ChaosScenario { + name: &'static str, + cfg: SimulationConfig, + min_attempted_tps: u64, + max_finality_gap: u64, +} + +fn scenarios() -> Vec { + vec![ + ChaosScenario { + name: "ci_per_asset_mixed", + cfg: SimulationConfig { + assets: vec!["BTC-USD".into(), "ETH-USD".into(), "SOL-USD".into()], + traders: 8, + ops_per_trader: 300, + seed: 101, + flow_profile: OrderFlowProfile::MixedMarketAndLimit, + table_profile: 
TableProfile::PerAssetTable, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + min_attempted_tps: 500, + max_finality_gap: 5_000, + }, + ChaosScenario { + name: "ci_multi_asset_mixed", + cfg: SimulationConfig { + assets: vec![ + "BTC-USD".into(), + "ETH-USD".into(), + "SOL-USD".into(), + "DOGE-USD".into(), + ], + traders: 8, + ops_per_trader: 300, + seed: 202, + flow_profile: OrderFlowProfile::MixedMarketAndLimit, + table_profile: TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + }, + collect_latency: true, + lifecycle_every_ops: 100, + orders_per_commit: 1, + match_workload: MatchWorkload::CrossingNearTouch, + }, + min_attempted_tps: 500, + max_finality_gap: 5_000, + }, + ] +} + +#[tokio::test] +async fn chaos_ci_profile_invariants_and_bounds() { + let db_cfg = high_throughput_simulation_config(); + for s in scenarios() { + let report = run_hft_simulation_with_config(s.cfg.clone(), db_cfg.clone()) + .await + .unwrap_or_else(|e| panic!("{} failed to run: {e}", s.name)); + + let sim = &report.simulation; + assert_eq!( + sim.attempted_orders, + s.cfg.traders * s.cfg.ops_per_trader, + "{} attempted mismatch", + s.name + ); + assert_eq!( + sim.accepted_orders + sim.rejected_orders, + sim.attempted_orders, + "{} primary accounting mismatch", + s.name + ); + assert_eq!( + sim.lifecycle_accepted_ops + sim.lifecycle_rejected_ops, + sim.lifecycle_attempted_ops, + "{} lifecycle accounting mismatch", + s.name + ); + assert_eq!( + total_rejections(&sim.rejection_breakdown), + sim.rejected_orders, + "{} primary rejection breakdown mismatch", + s.name + ); + assert_eq!( + total_rejections(&sim.lifecycle_rejection_breakdown), + sim.lifecycle_rejected_ops, + "{} lifecycle rejection breakdown mismatch", + s.name + ); + assert!( + report.zero_dropped_orders, + "{} dropped orders detected", + s.name + ); + assert!( + sim.visible_head_seq >= sim.max_commit_seq, + "{} visible head 
below commit seq", + s.name + ); + assert!( + sim.durable_head_seq >= sim.max_commit_seq, + "{} durable head below commit seq", + s.name + ); + assert!( + report.max_commit_finality_gap <= s.max_finality_gap, + "{} finality gap too large: {}", + s.name, + report.max_commit_finality_gap + ); + assert!( + report.attempted_ops_per_sec >= s.min_attempted_tps, + "{} attempted TPS too low: {}", + s.name, + report.attempted_ops_per_sec + ); + assert!( + report.latency.p99_us > 0 && report.latency.p99_us < 1_000_000, + "{} p99 latency out of bounds: {}", + s.name, + report.latency.p99_us + ); + } +} diff --git a/crates/aedb-orderbook/tests/property_randomized_matrix.rs b/crates/aedb-orderbook/tests/property_randomized_matrix.rs new file mode 100644 index 0000000..eb1d2a4 --- /dev/null +++ b/crates/aedb-orderbook/tests/property_randomized_matrix.rs @@ -0,0 +1,89 @@ +use aedb_orderbook::{ + MatchWorkload, OrderFlowProfile, SimulationConfig, TableProfile, + high_throughput_simulation_config, run_hft_simulation_with_config, +}; +use proptest::prelude::*; + +fn pick_assets(asset_count: usize) -> Vec { + let universe = ["BTC-USD", "ETH-USD", "SOL-USD", "DOGE-USD", "AVAX-USD"]; + universe + .iter() + .take(asset_count.max(1).min(universe.len())) + .map(|s| s.to_string()) + .collect() +} + +fn table_profile(multi_asset: bool) -> TableProfile { + if multi_asset { + TableProfile::MultiAssetTable { + table_id: "markets".to_string(), + } + } else { + TableProfile::PerAssetTable + } +} + +proptest! { + #![proptest_config(ProptestConfig { + cases: 8, + max_local_rejects: 0, + .. 
ProptestConfig::default() + })] + #[test] + fn randomized_invariants_hold_under_contention( + seed in any::(), + traders in 4usize..10, + ops_per_trader in 120usize..320, + asset_count in 1usize..5, + mixed_flow in any::(), + multi_asset in any::(), + crossing_workload in any::(), + lifecycle_every_ops in 30usize..140, + orders_per_commit in 1usize..4 + ) { + let flow_profile = if mixed_flow { + OrderFlowProfile::MixedMarketAndLimit + } else { + OrderFlowProfile::LimitOnlyIoc + }; + let workload = if crossing_workload { + MatchWorkload::CrossingNearTouch + } else { + MatchWorkload::NoCrossIoc + }; + let cfg = SimulationConfig { + assets: pick_assets(asset_count), + traders, + ops_per_trader, + seed, + flow_profile, + table_profile: table_profile(multi_asset), + collect_latency: false, + lifecycle_every_ops, + orders_per_commit, + match_workload: workload, + }; + + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("runtime"); + + let report = rt + .block_on(async { run_hft_simulation_with_config(cfg, high_throughput_simulation_config()).await }) + .expect("simulation should succeed"); + + let sim = report.simulation; + prop_assert_eq!( + sim.accepted_orders + sim.rejected_orders, + sim.attempted_orders + ); + prop_assert_eq!( + sim.lifecycle_accepted_ops + sim.lifecycle_rejected_ops, + sim.lifecycle_attempted_ops + ); + prop_assert!(report.zero_dropped_orders); + prop_assert!(sim.visible_head_seq >= sim.max_commit_seq); + prop_assert!(sim.durable_head_seq >= sim.max_commit_seq); + } +} diff --git a/docs/SECURITY_ACCEPTANCE_CRITERIA.md b/docs/SECURITY_ACCEPTANCE_CRITERIA.md new file mode 100644 index 0000000..3a4867f --- /dev/null +++ b/docs/SECURITY_ACCEPTANCE_CRITERIA.md @@ -0,0 +1,60 @@ +# Security Acceptance Criteria + +This document defines the minimum acceptance gates before claiming production readiness for high-integrity financial workloads. 
+ +## Mandatory CI Gates + +The CI pipeline must pass all of the following on pull requests and protected branches: + +- Crash/recovery: + - `cargo test --test crash_matrix -- --test-threads=1` + - `cargo test --test crash_matrix crash_matrix_a17a_strict_restarts_fail_closed -- --ignored --test-threads=1` + - `cargo test --test crash_matrix crash_matrix_a17b_thousand_crash_cycles_preserve_state -- --ignored --test-threads=1` +- Strict security/authorization: + - `cargo test --test security_boundaries -- --test-threads=1` + - `cargo test --test security_properties -- --test-threads=1` + - `cargo test --test security_properties_proptest -- --test-threads=1` + - `cargo test --test read_assertions integration_idempotent_retry_skips_assertion_re_evaluation -- --test-threads=1` + - `cargo test --test read_assertions integration_failed_assertion_is_logged_to_system_audit_table -- --test-threads=1` +- Backup/restore integrity: + - `cargo test --test backup_restore strict_backup_chain_restore_succeeds_with_hash_chain_enforcement -- --test-threads=1` + - `cargo test --test backup_restore strict_backup_chain_restore_rejects_tampered_incremental_segment -- --test-threads=1` +- Long chaos/adversarial orderbook profiles: + - `cargo test -p aedb-orderbook --test property_randomized_matrix -- --test-threads=1` + - `cargo test -p aedb-orderbook --test adversarial_slo_sla -- --test-threads=1` + - `cargo test -p aedb-orderbook --test chaos_ci_profile -- --test-threads=1` + - `cargo test -p aedb-orderbook --test simulation_smoke simulation_soak_multi_asset_mixed -- --ignored --test-threads=1` + - `cargo test -p aedb-orderbook --test simulation_smoke simulation_soak_single_asset_contention_limit -- --ignored --test-threads=1` + - `cargo test --test order_book_simulation order_book_chaos_read_write_accuracy -- --test-threads=1` + +Use `scripts/security_gate.sh` to run this locally. 
+ +## SLO/SLA Thresholds + +The adversarial SLO gate (`adversarial_slo_sla`) enforces the following defaults: + +- `AEDB_ORDERBOOK_SLA_MIN_ATTEMPTED_TPS=600` +- `AEDB_ORDERBOOK_SLA_MAX_P99_US=1000000` +- `AEDB_ORDERBOOK_SLA_MAX_FINALITY_GAP=10000` +- `AEDB_ORDERBOOK_SLA_MAX_PRIMARY_REJECT_RATIO_PPM=900000` + +These environment variables may be tightened in production CI. + +## Invariant Requirements + +All mandatory scenarios must satisfy: + +- Zero dropped primary orders (`accepted + rejected == attempted`). +- Lifecycle accounting exactness (`lifecycle_accepted + lifecycle_rejected == lifecycle_attempted`). +- Durable and visible heads not behind accepted commit head. +- Deterministic replay/integrity checks pass in strict crash and strict restore suites. +- No authorization boundary bypass in secure mode test suites. + +## External Validation (Required Outside This Repo) + +The following are required before financial-grade claims: + +- Independent code audit with focus on commit atomicity, authorization checks, and recovery path. +- Penetration testing of the embedding/API boundary in the host application. +- Key management and secret distribution review (HMAC/checkpoint keys, rotation, revocation). +- Incident response tabletop and restore drills with explicit RTO/RPO evidence. diff --git a/docs/SECURITY_OPERATIONS_RUNBOOK.md b/docs/SECURITY_OPERATIONS_RUNBOOK.md new file mode 100644 index 0000000..9b3ea75 --- /dev/null +++ b/docs/SECURITY_OPERATIONS_RUNBOOK.md @@ -0,0 +1,54 @@ +# Security Operations Runbook + +## 1. Key Management + +- Use `AedbConfig::production([u8; 32])` or `AedbConfig::low_latency([u8; 32])` for manifest HMAC key baseline. +- Set checkpoint encryption key with `with_checkpoint_key([u8; 32])` for encrypted checkpoint at-rest protection. +- Keep keys outside source control; inject via secret manager/environment at process boot. +- Rotate keys by controlled deployment with backup snapshots before and after rotation. + +## 2. 
Authenticated Caller Model + +- Use `AedbInstance::open_secure(...)` (or `open_production(...)`) in production paths. +- Require all caller-facing operations to use authenticated `*_as` APIs. +- Do not expose anonymous commit/query APIs in host application routes. +- Grant minimum permissions only (project/scope/table/KV-prefix scoped). + +## 3. Audit Logging + +- Keep system audit tables enabled: + - `authz_audit` for grant/revoke/ownership events. + - `assertion_audit` for assertion failures and policy enforcement failures. +- Periodically export audit table snapshots to immutable storage. +- Alert on: + - repeated failed assertions, + - unexpected GlobalAdmin usage, + - unusual permission churn. + +## 4. Backup/Restore Drills + +- Daily full backup, periodic incremental backups (policy defined by RPO target). +- Weekly restore drill: + - restore into clean directory, + - verify strict hash-chain integrity, + - run parity and invariant checks. +- Suggested commands: + - `cargo test --test backup_restore strict_backup_chain_restore_succeeds_with_hash_chain_enforcement -- --test-threads=1` + - `cargo test --test backup_restore strict_backup_chain_restore_rejects_tampered_incremental_segment -- --test-threads=1` + - `cargo run --bin aedb -- check invariants --data-dir ` + +## 5. Incident Response + +- On integrity alert: + - freeze writes at host application level, + - capture WAL segments + manifest + checkpoint files, + - run offline parity/invariant checks, + - restore from latest validated backup chain if needed. +- On authorization alert: + - revoke impacted permissions, + - rotate application credentials, + - inspect `authz_audit` and host API logs for blast radius. +- On repeated commit timeout/conflict spikes: + - collect `operational_metrics()` snapshots, + - reduce admission rate or shard workload by asset/project, + - maintain durable-finality path for high-value transactions. 
diff --git a/scripts/run_orderbook_realism_docker.sh b/scripts/run_orderbook_realism_docker.sh new file mode 100755 index 0000000..12c58e3 --- /dev/null +++ b/scripts/run_orderbook_realism_docker.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +IMAGE="${IMAGE:-rust:latest}" +CPUS="${CPUS:-6}" +MEMORY="${MEMORY:-12g}" +RETRIES="${RETRIES:-3}" + +if ! command -v docker >/dev/null 2>&1; then + echo "docker is not installed or not on PATH" >&2 + exit 1 +fi + +if ! docker version >/dev/null 2>&1; then + echo "docker daemon is not running; start Docker and retry" >&2 + exit 1 +fi + +run_in_container() { + local cmd="$1" + docker run --rm \ + --cpus="${CPUS}" \ + --memory="${MEMORY}" \ + -e PATH=/usr/local/cargo/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin \ + -v "${ROOT_DIR}:/work" \ + -v "${HOME}/.cargo/registry:/usr/local/cargo/registry" \ + -v "${HOME}/.cargo/git:/usr/local/cargo/git" \ + -w /work \ + "${IMAGE}" \ + bash -c "${cmd}" +} + +retry() { + local label="$1" + local cmd="$2" + local n=1 + while true; do + echo "[${label}] attempt ${n}/${RETRIES}" + if run_in_container "${cmd}"; then + return 0 + fi + if (( n >= RETRIES )); then + echo "[${label}] failed after ${RETRIES} attempts" >&2 + return 1 + fi + n=$((n + 1)) + sleep 2 + done +} + +retry "chaos_ci_profile" "cargo test -q -p aedb-orderbook --test chaos_ci_profile" +retry "order_book_chaos_read_write_accuracy" "cargo test -q --test order_book_simulation order_book_chaos_read_write_accuracy" +retry "orderbook_perf" "cargo run --release -p aedb-orderbook --bin orderbook_perf" + +echo "docker realism suite completed" diff --git a/scripts/security_gate.sh b/scripts/security_gate.sh new file mode 100755 index 0000000..a391c06 --- /dev/null +++ b/scripts/security_gate.sh @@ -0,0 +1,33 @@ +#!/usr/bin/env bash +set -euo pipefail + +echo "[security-gate] unit/library" +cargo test --lib + +echo "[security-gate] security 
boundaries" +cargo test --test security_boundaries -- --test-threads=1 +cargo test --test security_properties -- --test-threads=1 +cargo test --test security_properties_proptest -- --test-threads=1 + +echo "[security-gate] strict backup/restore integrity" +cargo test --test backup_restore strict_backup_chain_restore_succeeds_with_hash_chain_enforcement -- --test-threads=1 +cargo test --test backup_restore strict_backup_chain_restore_rejects_tampered_incremental_segment -- --test-threads=1 + +echo "[security-gate] idempotency + assertion behavior" +cargo test --test read_assertions integration_idempotent_retry_skips_assertion_re_evaluation -- --test-threads=1 +cargo test --test read_assertions integration_failed_assertion_is_logged_to_system_audit_table -- --test-threads=1 + +echo "[security-gate] crash/recovery matrix" +cargo test --test crash_matrix -- --test-threads=1 +cargo test --test crash_matrix crash_matrix_a17a_strict_restarts_fail_closed -- --ignored --test-threads=1 +cargo test --test crash_matrix crash_matrix_a17b_thousand_crash_cycles_preserve_state -- --ignored --test-threads=1 + +echo "[security-gate] orderbook adversarial and chaos" +cargo test -p aedb-orderbook --test property_randomized_matrix -- --test-threads=1 +cargo test -p aedb-orderbook --test adversarial_slo_sla -- --test-threads=1 +cargo test -p aedb-orderbook --test chaos_ci_profile -- --test-threads=1 +cargo test -p aedb-orderbook --test simulation_smoke simulation_soak_multi_asset_mixed -- --ignored --test-threads=1 +cargo test -p aedb-orderbook --test simulation_smoke simulation_soak_single_asset_contention_limit -- --ignored --test-threads=1 +cargo test --test order_book_simulation order_book_chaos_read_write_accuracy -- --test-threads=1 + +echo "[security-gate] complete" diff --git a/src/commit/executor/internals.rs b/src/commit/executor/internals.rs index efbd18b..9810da9 100644 --- a/src/commit/executor/internals.rs +++ b/src/commit/executor/internals.rs @@ -336,7 +336,7 @@ 
pub(super) fn process_commit_epoch( let mut read_set_conflicts = 0u64; let mut working_keyspace = state.keyspace.clone(); let mut working_catalog = state.catalog.clone(); - let mut working_idempotency = state.idempotency.clone(); + let mut working_idempotency: Option> = None; let mut working_global_unique_index = state.global_unique_index.clone(); let mut sequenced = Vec::new(); let mut internal_sequenced = Vec::new(); @@ -357,18 +357,22 @@ pub(super) fn process_commit_epoch( continue; } - if let Some(key) = request.envelope.idempotency_key.clone() - && let Some(record) = working_idempotency.get(&key) - { - outcomes.push(EpochOutcome { - request, - result: Ok(CommitResult { - commit_seq: record.commit_seq, - durable_head_seq: state.durable_head_seq.max(record.commit_seq), - }), - post_apply_delta: None, - }); - continue; + if let Some(key) = request.envelope.idempotency_key.clone() { + let record = working_idempotency + .as_ref() + .and_then(|map| map.get(&key)) + .or_else(|| state.idempotency.get(&key)); + if let Some(record) = record { + outcomes.push(EpochOutcome { + request, + result: Ok(CommitResult { + commit_seq: record.commit_seq, + durable_head_seq: state.durable_head_seq.max(record.commit_seq), + }), + post_apply_delta: None, + }); + continue; + } } if let Err(err) = revalidate_read_set_for_keyspace(&working_keyspace, &request.envelope) { @@ -550,13 +554,15 @@ pub(super) fn process_commit_epoch( let commit_ts_micros = now_micros(); if let Some(key) = request.envelope.idempotency_key.clone() { - working_idempotency.insert( - key, - IdempotencyRecord { - commit_seq, - recorded_at_micros: commit_ts_micros, - }, - ); + working_idempotency + .get_or_insert_with(|| state.idempotency.clone()) + .insert( + key, + IdempotencyRecord { + commit_seq, + recorded_at_micros: commit_ts_micros, + }, + ); } let delta = CommitDelta { @@ -714,7 +720,9 @@ pub(super) fn process_commit_epoch( state.keyspace = working_keyspace; state.catalog = working_catalog; 
state.global_unique_index = working_global_unique_index; - state.idempotency = working_idempotency; + if let Some(updated) = working_idempotency { + state.idempotency = updated; + } state.current_seq = last_seq; state.visible_head_seq = last_seq; match state.config.durability_mode { diff --git a/src/commit/executor/mod.rs b/src/commit/executor/mod.rs index e10f4ce..ca257e4 100644 --- a/src/commit/executor/mod.rs +++ b/src/commit/executor/mod.rs @@ -133,6 +133,7 @@ pub struct CommitExecutor { #[derive(Debug, Default)] struct ExecutorTelemetry { inflight_commits: AtomicUsize, + queued_commits: AtomicUsize, commits_total: AtomicU64, commit_errors: AtomicU64, queue_full_rejections: AtomicU64, @@ -155,6 +156,7 @@ struct ExecutorTelemetry { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct ExecutorMetrics { pub inflight_commits: usize, + pub queued_commits: usize, pub queued_bytes: usize, pub commits_total: u64, pub commit_errors: u64, @@ -403,6 +405,9 @@ impl CommitExecutor { } } queue_counter.fetch_sub(outcome.request.encoded_len, Ordering::Relaxed); + loop_telemetry + .queued_commits + .fetch_sub(1, Ordering::Relaxed); loop_telemetry .inflight_commits .fetch_sub(1, Ordering::Relaxed); @@ -412,6 +417,9 @@ impl CommitExecutor { for req in pending { queue_counter.fetch_sub(req.encoded_len, Ordering::Relaxed); + loop_telemetry + .queued_commits + .fetch_sub(1, Ordering::Relaxed); loop_telemetry .inflight_commits .fetch_sub(1, Ordering::Relaxed); @@ -461,6 +469,7 @@ impl CommitExecutor { Err(e) => { pre_telemetry.commit_errors.fetch_add(1, Ordering::Relaxed); pre_queue_counter.fetch_sub(req.encoded_len, Ordering::Relaxed); + pre_telemetry.queued_commits.fetch_sub(1, Ordering::Relaxed); let _ = req.result_tx.send(Err(e)); continue; } @@ -469,6 +478,7 @@ impl CommitExecutor { if write_partitions.is_empty() { pre_telemetry.commit_errors.fetch_add(1, Ordering::Relaxed); pre_queue_counter.fetch_sub(req.encoded_len, Ordering::Relaxed); + 
pre_telemetry.queued_commits.fetch_sub(1, Ordering::Relaxed); let _ = req.result_tx.send(Err(AedbError::Validation( "transaction envelope has no mutations".into(), ))); @@ -485,6 +495,7 @@ impl CommitExecutor { .inflight_commits .fetch_sub(1, Ordering::Relaxed); pre_queue_counter.fetch_sub(req.encoded_len, Ordering::Relaxed); + pre_telemetry.queued_commits.fetch_sub(1, Ordering::Relaxed); let _ = req.result_tx.send(Err(AedbError::Validation( "commit apply queue closed".into(), ))); @@ -666,6 +677,9 @@ impl CommitExecutor { Err(observed) => current = observed, } } + self.telemetry + .queued_commits + .fetch_add(1, Ordering::Relaxed); let (result_tx, result_rx) = oneshot::channel(); let shard = shard_for_envelope(&envelope, self.ingress_txs.len()); let ingress_tx = self @@ -692,10 +706,16 @@ impl CommitExecutor { Ok(Ok(())) => {} Ok(Err(e)) => { self.queued_bytes.fetch_sub(len, Ordering::Relaxed); + self.telemetry + .queued_commits + .fetch_sub(1, Ordering::Relaxed); return Err(AedbError::Validation(format!("commit queue closed: {e}"))); } Err(_) => { self.queued_bytes.fetch_sub(len, Ordering::Relaxed); + self.telemetry + .queued_commits + .fetch_sub(1, Ordering::Relaxed); self.telemetry .timeout_rejections .fetch_add(1, Ordering::Relaxed); @@ -816,6 +836,7 @@ impl CommitExecutor { }; ExecutorMetrics { inflight_commits: self.telemetry.inflight_commits.load(Ordering::Relaxed), + queued_commits: self.telemetry.queued_commits.load(Ordering::Relaxed), queued_bytes: self.queued_bytes.load(Ordering::Relaxed), commits_total, commit_errors: self.telemetry.commit_errors.load(Ordering::Relaxed), diff --git a/src/lib.rs b/src/lib.rs index 7cfec7f..1ea5fbc 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -34,7 +34,9 @@ use crate::catalog::{DdlOperation, ResourceType}; use crate::checkpoint::loader::load_checkpoint_with_key; use crate::checkpoint::writer::write_checkpoint_with_key; use crate::commit::executor::{CommitExecutor, CommitResult, ExecutorMetrics}; -use 
crate::commit::tx::{ReadSet, TransactionEnvelope, WriteClass, WriteIntent}; +use crate::commit::tx::{ + ReadKey, ReadSet, ReadSetEntry, TransactionEnvelope, WriteClass, WriteIntent, +}; use crate::commit::validation::{Mutation, TableUpdateExpr, validate_permissions}; use crate::config::{AedbConfig, DurabilityMode, RecoveryMode}; use crate::error::AedbError; @@ -47,8 +49,9 @@ use crate::migration::{ }; use crate::order_book::{ ExecInstruction, FillSpec, InstrumentConfig, OrderBookDepth, OrderBookTableMode, OrderRecord, - OrderRequest, OrderSide, Spread, TimeInForce, read_last_execution_report, read_open_orders, - read_order_status, read_recent_trades, read_spread, read_top_n, scoped_instrument, + OrderRequest, OrderSide, Spread, TimeInForce, key_client_id, key_order, + read_last_execution_report, read_open_orders, read_order_status, read_recent_trades, + read_spread, read_top_n, scoped_instrument, u256_from_be, }; use crate::permission::{CallerContext, Permission}; use crate::preflight::{PreflightResult, preflight, preflight_plan}; @@ -726,6 +729,17 @@ impl AedbInstance { result } + async fn commit_prevalidated_internal_with_finality( + &self, + op_name: &'static str, + mutation: Mutation, + finality: CommitFinality, + ) -> Result { + let mut result = self.commit_prevalidated_internal(op_name, mutation).await?; + self.enforce_finality(&mut result, finality).await?; + Ok(result) + } + pub async fn commit_with_finality( &self, mutation: Mutation, @@ -1644,7 +1658,9 @@ impl AedbInstance { assertions: Vec::new(), read_set: ReadSet::default(), write_intent: WriteIntent { mutations }, - base_seq: self.snapshot_probe(ConsistencyMode::AtLatest).await?, + // No read set/assertions in this helper path. + // Keep hot-path parity with submit/submit_as and avoid snapshot acquisition. 
+ base_seq: 0, }) .await } @@ -1666,7 +1682,9 @@ impl AedbInstance { assertions: Vec::new(), read_set: ReadSet::default(), write_intent: WriteIntent { mutations }, - base_seq: self.snapshot_probe(ConsistencyMode::AtLatest).await?, + // No read set/assertions in this helper path. + // Keep hot-path parity with submit/submit_as and avoid snapshot acquisition. + base_seq: 0, }) .await } @@ -2784,6 +2802,25 @@ impl AedbInstance { .await } + pub async fn order_book_new_with_finality( + &self, + project_id: &str, + scope_id: &str, + request: OrderRequest, + finality: CommitFinality, + ) -> Result { + self.commit_prevalidated_internal_with_finality( + "order_book_new", + Mutation::OrderBookNew { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + request, + }, + finality, + ) + .await + } + pub async fn order_book_define_table( &self, project_id: &str, @@ -2803,6 +2840,26 @@ impl AedbInstance { .await } + pub async fn order_book_define_table_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + table_id: &str, + mode: OrderBookTableMode, + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookDefineTable { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + table_id: table_id.to_string(), + mode, + }, + ) + .await + } + pub async fn order_book_drop_table( &self, project_id: &str, @@ -2820,6 +2877,24 @@ impl AedbInstance { .await } + pub async fn order_book_drop_table_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + table_id: &str, + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookDropTable { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + table_id: table_id.to_string(), + }, + ) + .await + } + pub async fn order_book_set_instrument_config( &self, project_id: &str, @@ -2839,6 +2914,26 @@ impl AedbInstance { .await } + pub async fn order_book_set_instrument_config_as( + &self, + caller: CallerContext, + project_id: &str, + 
scope_id: &str, + instrument: &str, + config: InstrumentConfig, + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookSetInstrumentConfig { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + config, + }, + ) + .await + } + pub async fn order_book_set_instrument_halted( &self, project_id: &str, @@ -2858,6 +2953,26 @@ impl AedbInstance { .await } + pub async fn order_book_set_instrument_halted_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + halted: bool, + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookSetInstrumentHalted { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + halted, + }, + ) + .await + } + pub async fn order_book_new_in_table( &self, project_id: &str, @@ -2888,6 +3003,28 @@ impl AedbInstance { .await } + pub async fn order_book_new_as_with_finality( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + request: OrderRequest, + finality: CommitFinality, + ) -> Result { + let mut result = self + .commit_as( + caller, + Mutation::OrderBookNew { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + request, + }, + ) + .await?; + self.enforce_finality(&mut result, finality).await?; + Ok(result) + } + pub async fn order_book_cancel( &self, project_id: &str, @@ -2910,114 +3047,795 @@ impl AedbInstance { .await } - pub async fn order_book_cancel_by_client_id( + pub async fn order_book_cancel_with_finality( &self, project_id: &str, scope_id: &str, instrument: &str, - client_order_id: &str, + order_id: u64, owner: &str, + finality: CommitFinality, ) -> Result { - self.commit_prevalidated_internal( - "order_book_cancel_by_client_id", + self.commit_prevalidated_internal_with_finality( + "order_book_cancel", Mutation::OrderBookCancel { project_id: project_id.to_string(), scope_id: scope_id.to_string(), instrument: 
instrument.to_string(), - order_id: 0, - client_order_id: Some(client_order_id.to_string()), + order_id, + client_order_id: None, owner: owner.to_string(), }, + finality, ) .await } - pub async fn order_book_cancel_replace( + pub async fn order_book_cancel_strict( &self, project_id: &str, scope_id: &str, instrument: &str, order_id: u64, owner: &str, - new_price_ticks: Option, - new_qty_be: Option<[u8; 32]>, - new_time_in_force: Option, - new_exec_instructions: Option, + finality: CommitFinality, ) -> Result { - self.commit_prevalidated_internal( - "order_book_cancel_replace", - Mutation::OrderBookCancelReplace { + self.order_book_cancel_strict_as_internal( + None, project_id, scope_id, instrument, order_id, owner, finality, + ) + .await + } + + pub async fn order_book_cancel_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookCancel { project_id: project_id.to_string(), scope_id: scope_id.to_string(), instrument: instrument.to_string(), order_id, + client_order_id: None, owner: owner.to_string(), - new_price_ticks, - new_qty_be, - new_time_in_force, - new_exec_instructions, }, ) .await } - pub async fn order_book_mass_cancel( + pub async fn order_book_cancel_as_with_finality( &self, + caller: CallerContext, project_id: &str, scope_id: &str, instrument: &str, + order_id: u64, owner: &str, - side: Option, - owner_filter: Option, - price_range_ticks: Option<(i64, i64)>, + finality: CommitFinality, ) -> Result { - self.commit_prevalidated_internal( - "order_book_mass_cancel", - Mutation::OrderBookMassCancel { - project_id: project_id.to_string(), - scope_id: scope_id.to_string(), - instrument: instrument.to_string(), - owner: owner.to_string(), - side, - owner_filter, - price_range_ticks, - }, + let mut result = self + .order_book_cancel_as(caller, project_id, scope_id, instrument, order_id, owner) + .await?; + 
self.enforce_finality(&mut result, finality).await?; + Ok(result) + } + + pub async fn order_book_cancel_strict_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + finality: CommitFinality, + ) -> Result { + self.order_book_cancel_strict_as_internal( + Some(caller), + project_id, + scope_id, + instrument, + order_id, + owner, + finality, ) .await } - pub async fn order_book_reduce( + async fn order_book_cancel_strict_as_internal( &self, + caller: Option, project_id: &str, scope_id: &str, instrument: &str, order_id: u64, owner: &str, - reduce_by_be: [u8; 32], + finality: CommitFinality, + ) -> Result { + let lease = self + .acquire_snapshot(ConsistencyMode::AtLatest) + .await + .map_err(AedbError::from)?; + let order_key = key_order(instrument, order_id); + let Some(entry) = lease.view.keyspace.kv_get(project_id, scope_id, &order_key) else { + return Err(AedbError::Validation(format!( + "strict cancel target not found: order_id={order_id}" + ))); + }; + let order: OrderRecord = + rmp_serde::from_slice(&entry.value).map_err(|e| AedbError::Decode(e.to_string()))?; + if order.owner != owner { + return Err(AedbError::PermissionDenied( + "order ownership mismatch".into(), + )); + } + if !matches!( + order.status, + crate::order_book::OrderStatus::Open | crate::order_book::OrderStatus::PartiallyFilled + ) || u256_from_be(order.remaining_qty_be).is_zero() + { + return Err(AedbError::Validation(format!( + "order not cancellable in current status: {:?}", + order.status + ))); + } + let envelope = TransactionEnvelope { + caller, + idempotency_key: None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: ReadSet { + points: vec![ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + key: order_key, + }, + version_at_read: entry.version, + }], + ranges: Vec::new(), + }, + write_intent: WriteIntent { + mutations: 
vec![Mutation::OrderBookCancel { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + client_order_id: None, + owner: owner.to_string(), + }], + }, + base_seq: lease.view.seq, + }; + self.commit_envelope_with_finality(envelope, finality).await + } + + pub async fn order_book_cancel_by_client_id( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + client_order_id: &str, + owner: &str, ) -> Result { self.commit_prevalidated_internal( - "order_book_reduce", - Mutation::OrderBookReduce { + "order_book_cancel_by_client_id", + Mutation::OrderBookCancel { project_id: project_id.to_string(), scope_id: scope_id.to_string(), instrument: instrument.to_string(), - order_id, + order_id: 0, + client_order_id: Some(client_order_id.to_string()), owner: owner.to_string(), - reduce_by_be, }, ) .await } - pub async fn order_book_match_internal( + pub async fn order_book_cancel_by_client_id_as( &self, + caller: CallerContext, project_id: &str, scope_id: &str, instrument: &str, - fills: Vec, + client_order_id: &str, + owner: &str, ) -> Result { - self.commit_prevalidated_internal( - "order_book_match_internal", + self.commit_as( + caller, + Mutation::OrderBookCancel { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id: 0, + client_order_id: Some(client_order_id.to_string()), + owner: owner.to_string(), + }, + ) + .await + } + + pub async fn order_book_cancel_by_client_id_strict( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + client_order_id: &str, + owner: &str, + finality: CommitFinality, + ) -> Result { + self.order_book_cancel_by_client_id_strict_as_internal( + None, + project_id, + scope_id, + instrument, + client_order_id, + owner, + finality, + ) + .await + } + + pub async fn order_book_cancel_by_client_id_strict_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: 
&str, + client_order_id: &str, + owner: &str, + finality: CommitFinality, + ) -> Result { + self.order_book_cancel_by_client_id_strict_as_internal( + Some(caller), + project_id, + scope_id, + instrument, + client_order_id, + owner, + finality, + ) + .await + } + + async fn order_book_cancel_by_client_id_strict_as_internal( + &self, + caller: Option, + project_id: &str, + scope_id: &str, + instrument: &str, + client_order_id: &str, + owner: &str, + finality: CommitFinality, + ) -> Result { + let lease = self + .acquire_snapshot(ConsistencyMode::AtLatest) + .await + .map_err(AedbError::from)?; + let cid_key = key_client_id(instrument, owner, client_order_id); + let Some(cid_entry) = lease.view.keyspace.kv_get(project_id, scope_id, &cid_key) else { + return Err(AedbError::Validation(format!( + "strict cancel target not found: client_order_id={client_order_id}" + ))); + }; + if cid_entry.value.len() != 8 { + return Err(AedbError::Validation( + "invalid client-order mapping encoding".into(), + )); + } + let mut id_bytes = [0u8; 8]; + id_bytes.copy_from_slice(&cid_entry.value); + let order_id = u64::from_be_bytes(id_bytes); + let order_key = key_order(instrument, order_id); + let Some(order_entry) = lease.view.keyspace.kv_get(project_id, scope_id, &order_key) else { + return Err(AedbError::Validation(format!( + "strict cancel target not found: order_id={order_id}" + ))); + }; + let order: OrderRecord = rmp_serde::from_slice(&order_entry.value) + .map_err(|e| AedbError::Decode(e.to_string()))?; + if order.owner != owner { + return Err(AedbError::PermissionDenied( + "order ownership mismatch".into(), + )); + } + if !matches!( + order.status, + crate::order_book::OrderStatus::Open | crate::order_book::OrderStatus::PartiallyFilled + ) || u256_from_be(order.remaining_qty_be).is_zero() + { + return Err(AedbError::Validation(format!( + "order not cancellable in current status: {:?}", + order.status + ))); + } + let envelope = TransactionEnvelope { + caller, + idempotency_key: 
None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: ReadSet { + points: vec![ + ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + key: cid_key, + }, + version_at_read: cid_entry.version, + }, + ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + key: order_key, + }, + version_at_read: order_entry.version, + }, + ], + ranges: Vec::new(), + }, + write_intent: WriteIntent { + mutations: vec![Mutation::OrderBookCancel { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id: 0, + client_order_id: Some(client_order_id.to_string()), + owner: owner.to_string(), + }], + }, + base_seq: lease.view.seq, + }; + self.commit_envelope_with_finality(envelope, finality).await + } + + #[allow(clippy::too_many_arguments)] + pub async fn order_book_cancel_replace_strict( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + new_price_ticks: Option, + new_qty_be: Option<[u8; 32]>, + new_time_in_force: Option, + new_exec_instructions: Option, + finality: CommitFinality, + ) -> Result { + self.order_book_cancel_replace_strict_as_internal( + None, + project_id, + scope_id, + instrument, + order_id, + owner, + new_price_ticks, + new_qty_be, + new_time_in_force, + new_exec_instructions, + finality, + ) + .await + } + + #[allow(clippy::too_many_arguments)] + pub async fn order_book_cancel_replace_strict_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + new_price_ticks: Option, + new_qty_be: Option<[u8; 32]>, + new_time_in_force: Option, + new_exec_instructions: Option, + finality: CommitFinality, + ) -> Result { + self.order_book_cancel_replace_strict_as_internal( + Some(caller), + project_id, + scope_id, + instrument, + order_id, + owner, + 
new_price_ticks, + new_qty_be, + new_time_in_force, + new_exec_instructions, + finality, + ) + .await + } + + #[allow(clippy::too_many_arguments)] + async fn order_book_cancel_replace_strict_as_internal( + &self, + caller: Option, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + new_price_ticks: Option, + new_qty_be: Option<[u8; 32]>, + new_time_in_force: Option, + new_exec_instructions: Option, + finality: CommitFinality, + ) -> Result { + let (order_key, version, base_seq) = self + .order_book_strict_cancellable_version( + project_id, scope_id, instrument, order_id, owner, + ) + .await?; + let envelope = TransactionEnvelope { + caller, + idempotency_key: None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: ReadSet { + points: vec![ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + key: order_key, + }, + version_at_read: version, + }], + ranges: Vec::new(), + }, + write_intent: WriteIntent { + mutations: vec![Mutation::OrderBookCancelReplace { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + owner: owner.to_string(), + new_price_ticks, + new_qty_be, + new_time_in_force, + new_exec_instructions, + }], + }, + base_seq, + }; + self.commit_envelope_with_finality(envelope, finality).await + } + + pub async fn order_book_reduce_strict( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + reduce_by_be: [u8; 32], + finality: CommitFinality, + ) -> Result { + self.order_book_reduce_strict_as_internal( + None, + project_id, + scope_id, + instrument, + order_id, + owner, + reduce_by_be, + finality, + ) + .await + } + + pub async fn order_book_reduce_strict_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + reduce_by_be: [u8; 32], + finality: 
CommitFinality, + ) -> Result { + self.order_book_reduce_strict_as_internal( + Some(caller), + project_id, + scope_id, + instrument, + order_id, + owner, + reduce_by_be, + finality, + ) + .await + } + + async fn order_book_reduce_strict_as_internal( + &self, + caller: Option, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + reduce_by_be: [u8; 32], + finality: CommitFinality, + ) -> Result { + let reduce_by = u256_from_be(reduce_by_be); + if reduce_by.is_zero() { + return Err(AedbError::Validation( + "strict reduce requires reduce_by > 0".into(), + )); + } + let (order_key, version, base_seq) = self + .order_book_strict_cancellable_version( + project_id, scope_id, instrument, order_id, owner, + ) + .await?; + let envelope = TransactionEnvelope { + caller, + idempotency_key: None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: ReadSet { + points: vec![ReadSetEntry { + key: ReadKey::KvKey { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + key: order_key, + }, + version_at_read: version, + }], + ranges: Vec::new(), + }, + write_intent: WriteIntent { + mutations: vec![Mutation::OrderBookReduce { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + owner: owner.to_string(), + reduce_by_be, + }], + }, + base_seq, + }; + self.commit_envelope_with_finality(envelope, finality).await + } + + async fn order_book_strict_cancellable_version( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + ) -> Result<(Vec, u64, u64), AedbError> { + let lease = self + .acquire_snapshot(ConsistencyMode::AtLatest) + .await + .map_err(AedbError::from)?; + let order_key = key_order(instrument, order_id); + let Some(entry) = lease.view.keyspace.kv_get(project_id, scope_id, &order_key) else { + return Err(AedbError::Validation(format!( + "strict target not found: order_id={order_id}" + 
))); + }; + let order: OrderRecord = + rmp_serde::from_slice(&entry.value).map_err(|e| AedbError::Decode(e.to_string()))?; + if order.owner != owner { + return Err(AedbError::PermissionDenied( + "order ownership mismatch".into(), + )); + } + if !matches!( + order.status, + crate::order_book::OrderStatus::Open | crate::order_book::OrderStatus::PartiallyFilled + ) || u256_from_be(order.remaining_qty_be).is_zero() + { + return Err(AedbError::Validation(format!( + "order not mutable in current status: {:?}", + order.status + ))); + } + Ok((order_key, entry.version, lease.view.seq)) + } + + pub async fn order_book_cancel_replace( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + new_price_ticks: Option, + new_qty_be: Option<[u8; 32]>, + new_time_in_force: Option, + new_exec_instructions: Option, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_cancel_replace", + Mutation::OrderBookCancelReplace { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + owner: owner.to_string(), + new_price_ticks, + new_qty_be, + new_time_in_force, + new_exec_instructions, + }, + ) + .await + } + + #[allow(clippy::too_many_arguments)] + pub async fn order_book_cancel_replace_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + new_price_ticks: Option, + new_qty_be: Option<[u8; 32]>, + new_time_in_force: Option, + new_exec_instructions: Option, + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookCancelReplace { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + owner: owner.to_string(), + new_price_ticks, + new_qty_be, + new_time_in_force, + new_exec_instructions, + }, + ) + .await + } + + pub async fn order_book_mass_cancel( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + 
owner: &str, + side: Option, + owner_filter: Option, + price_range_ticks: Option<(i64, i64)>, + ) -> Result { + self.commit_prevalidated_internal( + "order_book_mass_cancel", + Mutation::OrderBookMassCancel { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + owner: owner.to_string(), + side, + owner_filter, + price_range_ticks, + }, + ) + .await + } + + pub async fn order_book_mass_cancel_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + owner: &str, + side: Option, + owner_filter: Option, + price_range_ticks: Option<(i64, i64)>, + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookMassCancel { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + owner: owner.to_string(), + side, + owner_filter, + price_range_ticks, + }, + ) + .await + } + + pub async fn order_book_reduce( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + reduce_by_be: [u8; 32], + ) -> Result { + self.commit_prevalidated_internal( + "order_book_reduce", + Mutation::OrderBookReduce { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + owner: owner.to_string(), + reduce_by_be, + }, + ) + .await + } + + pub async fn order_book_reduce_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + order_id: u64, + owner: &str, + reduce_by_be: [u8; 32], + ) -> Result { + self.commit_as( + caller, + Mutation::OrderBookReduce { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + order_id, + owner: owner.to_string(), + reduce_by_be, + }, + ) + .await + } + + pub async fn order_book_match_internal( + &self, + project_id: &str, + scope_id: &str, + instrument: &str, + fills: Vec, + ) -> Result { + 
self.commit_prevalidated_internal( + "order_book_match_internal", + Mutation::OrderBookMatch { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + instrument: instrument.to_string(), + fills, + }, + ) + .await + } + + pub async fn order_book_match_internal_as( + &self, + caller: CallerContext, + project_id: &str, + scope_id: &str, + instrument: &str, + fills: Vec, + ) -> Result { + self.commit_as( + caller, Mutation::OrderBookMatch { project_id: project_id.to_string(), scope_id: scope_id.to_string(), @@ -4685,7 +5503,7 @@ impl AedbInstance { coordinator_apply_attempts: core.coordinator_apply_attempts, avg_coordinator_apply_micros: core.avg_coordinator_apply_micros, inflight_commits: core.inflight_commits, - queue_depth: core.inflight_commits, + queue_depth: core.queued_commits, durable_head_lag: runtime .visible_head_seq .saturating_sub(runtime.durable_head_seq), diff --git a/src/lib_helpers.rs b/src/lib_helpers.rs index 3a909c7..65287ee 100644 --- a/src/lib_helpers.rs +++ b/src/lib_helpers.rs @@ -451,10 +451,15 @@ pub(crate) fn validate_config(config: &AedbConfig) -> Result<(), AedbError> { pub(crate) fn validate_secure_config(config: &AedbConfig) -> Result<(), AedbError> { validate_config(config)?; - if config.manifest_hmac_key.is_none() { + let Some(hmac_key) = &config.manifest_hmac_key else { return Err(AedbError::InvalidConfig { message: "secure mode requires manifest_hmac_key".into(), }); + }; + if hmac_key.len() < 32 { + return Err(AedbError::InvalidConfig { + message: "secure mode requires manifest_hmac_key length >= 32 bytes".into(), + }); } if !matches!(config.recovery_mode, RecoveryMode::Strict) { return Err(AedbError::InvalidConfig { @@ -476,10 +481,16 @@ pub(crate) fn validate_secure_config(config: &AedbConfig) -> Result<(), AedbErro pub fn validate_arcana_config(config: &AedbConfig) -> Result<(), AedbError> { validate_config(config)?; - if config.manifest_hmac_key.is_none() { + let Some(hmac_key) = &config.manifest_hmac_key else 
{ return Err(AedbError::InvalidConfig { message: "manifest_hmac_key is required for Arcana production profile".into(), }); + }; + if hmac_key.len() < 32 { + return Err(AedbError::InvalidConfig { + message: "Arcana production profile requires manifest_hmac_key length >= 32 bytes" + .into(), + }); } if !matches!(config.recovery_mode, RecoveryMode::Strict) { return Err(AedbError::InvalidConfig { @@ -629,6 +640,11 @@ pub(crate) fn qualify_policy_columns(expr: &Expr, alias: &str) -> Expr { } pub(crate) fn ensure_external_caller_allowed(caller: &CallerContext) -> Result<(), AedbError> { + if caller.caller_id.trim().is_empty() { + return Err(AedbError::PermissionDenied( + "caller_id must be non-empty".into(), + )); + } if caller.caller_id == SYSTEM_CALLER_ID && !caller.is_internal_system() { return Err(AedbError::PermissionDenied( "caller_id 'system' is reserved for internal use".into(), @@ -638,6 +654,12 @@ pub(crate) fn ensure_external_caller_allowed(caller: &CallerContext) -> Result<( } pub(crate) fn ensure_query_caller_allowed(caller: &CallerContext) -> Result<(), QueryError> { + if caller.caller_id.trim().is_empty() { + return Err(QueryError::PermissionDenied { + permission: "caller_id must be non-empty".into(), + scope: caller.caller_id.clone(), + }); + } if caller.caller_id == SYSTEM_CALLER_ID && !caller.is_internal_system() { return Err(QueryError::PermissionDenied { permission: "caller_id 'system' is reserved for internal use".into(), diff --git a/src/lib_tests.rs b/src/lib_tests.rs index 9141666..94a261a 100644 --- a/src/lib_tests.rs +++ b/src/lib_tests.rs @@ -7,7 +7,7 @@ use crate::PredicateEvaluationPath; use crate::catalog::schema::{ColumnDef, IndexType}; use crate::catalog::types::{ColumnType, Row, Value}; use crate::catalog::{DdlOperation, ResourceType}; -use crate::commit::tx::{TransactionEnvelope, WriteClass, WriteIntent}; +use crate::commit::tx::{IdempotencyKey, TransactionEnvelope, WriteClass, WriteIntent}; use crate::commit::validation::Mutation; use 
crate::config::{AedbConfig, DurabilityMode, RecoveryMode}; use crate::error::{AedbError, AedbErrorCode, ResourceType as ErrorResourceType}; @@ -2110,6 +2110,27 @@ async fn secure_profile_requires_hardened_storage_settings() { AedbInstance::open_secure(hardened, dir.path()).expect("open secure"); } +#[tokio::test] +async fn secure_profile_rejects_short_hmac_key() { + let dir = tempdir().expect("temp"); + let weak = AedbConfig::default() + .with_hmac_key(vec![1, 2, 3, 4, 5, 6, 7, 8]) + .with_checkpoint_key([3u8; 32]); + let err = AedbInstance::open_secure(weak, dir.path()) + .err() + .expect("short hmac key must be rejected"); + assert!(matches!(err, AedbError::InvalidConfig { .. })); +} + +#[test] +fn arcana_profile_rejects_short_hmac_key() { + let weak = AedbConfig::default().with_hmac_key(vec![9u8; 16]); + let err = crate::lib_helpers::validate_arcana_config(&weak) + .err() + .expect("short hmac key must be rejected"); + assert!(matches!(err, AedbError::InvalidConfig { .. })); +} + #[test] fn low_latency_profile_uses_batch_durability_with_strict_recovery() { let cfg = AedbConfig::low_latency([5u8; 32]); @@ -2192,6 +2213,43 @@ async fn secure_mode_requires_authenticated_apis() { assert!(matches!(explain_err, QueryError::PermissionDenied { .. 
})); } +#[tokio::test] +async fn commit_as_rejects_empty_caller_id() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + let err = db + .commit_as( + CallerContext::new(" "), + Mutation::Ddl(DdlOperation::CreateProject { + owner_id: None, + if_not_exists: true, + project_id: "p".into(), + }), + ) + .await + .expect_err("empty caller id should be rejected"); + assert!(matches!(err, AedbError::PermissionDenied(_))); +} + +#[tokio::test] +async fn query_as_rejects_empty_caller_id() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + let caller = CallerContext::new(""); + let err = db + .query_with_options_as( + Some(&caller), + "p", + "app", + Query::select(&["*"]).from("authz_audit").limit(1), + QueryOptions::default(), + ) + .await + .expect_err("empty caller id should be rejected"); + assert!(matches!(err, QueryError::PermissionDenied { .. 
})); +} + #[tokio::test] async fn query_with_options_as_rejects_reserved_system_caller() { let dir = tempdir().expect("temp"); @@ -3000,6 +3058,7 @@ async fn metrics_surface_reflects_commits() { let op = db.operational_metrics().await; assert!(op.commits_total >= 1); assert!(op.read_set_conflicts <= op.conflict_rejections); + assert!(op.queue_depth >= op.inflight_commits); assert!(op.snapshot_age_micros <= u64::MAX / 2); } @@ -3170,6 +3229,967 @@ async fn commit_with_durable_finality_waits_until_durable_head_catches_up() { ); } +#[tokio::test] +async fn order_book_new_with_durable_finality_waits_until_durable_head_catches_up() { + let dir = tempdir().expect("temp"); + let config = AedbConfig { + durability_mode: DurabilityMode::Batch, + batch_interval_ms: 60_000, + batch_max_bytes: usize::MAX, + ..AedbConfig::default() + }; + let db = Arc::new(AedbInstance::open(config, dir.path()).expect("open")); + db.create_project("p").await.expect("project"); + + let fsync_db = Arc::clone(&db); + let fsync_task = tokio::spawn(async move { + tokio::time::sleep(Duration::from_millis(20)).await; + fsync_db.force_fsync().await.expect("force fsync"); + }); + + let started = Instant::now(); + let result = db + .order_book_new_with_finality( + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "oid-1".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Gtc, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + CommitFinality::Durable, + ) + .await + .expect("order"); + fsync_task.await.expect("join fsync"); + + assert!( + started.elapsed() >= Duration::from_millis(15), + "durable finality 
should wait for WAL durability in batch mode" + ); + assert!( + result.durable_head_seq >= result.commit_seq, + "durable finality must report durable head at or beyond commit sequence" + ); +} + +#[tokio::test] +async fn order_book_write_requires_authenticated_caller_in_secure_mode() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open_secure(AedbConfig::production([9u8; 32]), dir.path()) + .expect("open secure"); + let err = db + .order_book_new( + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "oid-secure".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Gtc, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect_err("secure mode should require authenticated caller"); + assert!(matches!(err, AedbError::PermissionDenied(_))); +} + +#[tokio::test] +async fn secure_mode_supports_order_book_writes_via_authenticated_as_apis() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open_secure(AedbConfig::production([5u8; 32]), dir.path()) + .expect("open secure"); + db.commit_as( + CallerContext::system_internal(), + Mutation::Ddl(DdlOperation::CreateProject { + owner_id: None, + if_not_exists: true, + project_id: "p".into(), + }), + ) + .await + .expect("create project"); + db.commit_as( + CallerContext::system_internal(), + Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: "alice".into(), + permission: Permission::ProjectAdmin { + project_id: "p".into(), + }, + actor_id: Some("system".into()), + delegable: false, + }), + ) + .await + .expect("grant project admin"); + + let alice = CallerContext::new("alice"); + 
db.order_book_new_as( + alice.clone(), + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "oid-secure-as".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Gtc, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 2; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect("place order"); + db.order_book_cancel_as(alice.clone(), "p", "app", "BTC-USD", 1, "alice") + .await + .expect("cancel order"); + let status = db + .order_status("p", "app", "BTC-USD", 1, ConsistencyMode::AtLatest, &alice) + .await + .expect("status query") + .expect("order exists"); + assert_eq!(status.status, crate::order_book::OrderStatus::Cancelled); +} + +#[tokio::test] +async fn secure_multi_agent_user_perspective_invariants_hold() { + fn u256_be(v: u64) -> [u8; 32] { + let mut out = [0u8; 32]; + out[24..].copy_from_slice(&v.to_be_bytes()); + out + } + + fn req( + instrument: &str, + owner: &str, + cid: String, + side: crate::order_book::OrderSide, + tif: crate::order_book::TimeInForce, + price: i64, + qty: u64, + nonce: u64, + ) -> crate::order_book::OrderRequest { + crate::order_book::OrderRequest { + instrument: instrument.to_string(), + client_order_id: cid, + side, + order_type: crate::order_book::OrderType::Limit, + time_in_force: tif, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: price, + qty_be: u256_be(qty), + owner: owner.to_string(), + account: None, + nonce, + price_limit_ticks: None, + } + } + + #[derive(Debug, Default)] + struct AgentMetrics { + primary_attempted: usize, + primary_accepted: usize, + primary_rejected: 
usize, + lifecycle_attempted: usize, + lifecycle_accepted: usize, + lifecycle_rejected: usize, + own_read_checks: usize, + } + + let dir = tempdir().expect("temp"); + let db = Arc::new( + AedbInstance::open_secure(AedbConfig::production([6u8; 32]), dir.path()) + .expect("open secure"), + ); + let system = CallerContext::system_internal(); + db.commit_as( + system.clone(), + Mutation::Ddl(DdlOperation::CreateProject { + owner_id: None, + if_not_exists: true, + project_id: "p".into(), + }), + ) + .await + .expect("create project"); + + let agents: Vec = (0..8).map(|i| format!("agent_{i}")).collect(); + for a in &agents { + db.commit_as( + system.clone(), + Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: a.clone(), + permission: Permission::ProjectAdmin { + project_id: "p".into(), + }, + actor_id: Some("system".into()), + delegable: false, + }), + ) + .await + .expect("grant project admin"); + } + + db.order_book_set_instrument_config_as( + system.clone(), + "p", + "app", + "BTC-USD", + crate::order_book::InstrumentConfig { + instrument: "BTC-USD".into(), + tick_size: 1, + lot_size_be: u256_be(1), + min_price_ticks: 1, + max_price_ticks: 1_000_000, + market_order_price_band: Some(50), + halted: false, + balance_config: None, + }, + ) + .await + .expect("instrument config"); + + for i in 0..16u64 { + db.order_book_new_as( + system.clone(), + "p", + "app", + req( + "BTC-USD", + &format!("seed_ask_{i}"), + format!("seed-ask-{i}"), + crate::order_book::OrderSide::Ask, + crate::order_book::TimeInForce::Gtc, + 1_000 + i as i64, + 10, + 1, + ), + ) + .await + .expect("seed ask"); + db.order_book_new_as( + system.clone(), + "p", + "app", + req( + "BTC-USD", + &format!("seed_bid_{i}"), + format!("seed-bid-{i}"), + crate::order_book::OrderSide::Bid, + crate::order_book::TimeInForce::Gtc, + 999 - i as i64, + 10, + 1, + ), + ) + .await + .expect("seed bid"); + } + + let mut anchors: Vec<(String, u64)> = Vec::with_capacity(agents.len()); + for (idx, agent_id) in 
agents.iter().enumerate() { + let caller = CallerContext::new(agent_id.clone()); + let anchor_cid = format!("anchor-{agent_id}"); + db.order_book_new_as( + caller.clone(), + "p", + "app", + req( + "BTC-USD", + agent_id, + anchor_cid.clone(), + if idx % 2 == 0 { + crate::order_book::OrderSide::Bid + } else { + crate::order_book::OrderSide::Ask + }, + crate::order_book::TimeInForce::Gtc, + if idx % 2 == 0 { 980 } else { 1_020 }, + 2, + 1, + ), + ) + .await + .expect("anchor order"); + let own_open = db + .open_orders( + "p", + "app", + "BTC-USD", + agent_id, + ConsistencyMode::AtLatest, + &caller, + ) + .await + .expect("own open orders"); + let anchor = own_open + .into_iter() + .find(|o| o.client_order_id == anchor_cid) + .expect("anchor order discoverable"); + anchors.push((agent_id.clone(), anchor.order_id)); + } + + let mut tasks = Vec::with_capacity(agents.len()); + for (idx, agent_id) in agents.iter().enumerate() { + let db_clone = Arc::clone(&db); + let caller = CallerContext::new(agent_id.clone()); + let owner = agent_id.clone(); + tasks.push(tokio::spawn(async move { + let mut m = AgentMetrics::default(); + let mut nonce = 10u64; + for op in 0..180usize { + let side = if (op + idx) % 2 == 0 { + crate::order_book::OrderSide::Bid + } else { + crate::order_book::OrderSide::Ask + }; + let price = if matches!(side, crate::order_book::OrderSide::Bid) { + 1_001 + } else { + 998 + }; + m.primary_attempted += 1; + let res = db_clone + .order_book_new_as( + caller.clone(), + "p", + "app", + req( + "BTC-USD", + &owner, + format!("{owner}-p-{op}"), + side, + crate::order_book::TimeInForce::Ioc, + price, + 1 + (op % 4) as u64, + nonce, + ), + ) + .await; + nonce += 1; + match res { + Ok(_) => m.primary_accepted += 1, + Err(AedbError::Validation(_)) | Err(AedbError::Conflict(_)) => { + m.primary_rejected += 1 + } + Err(other) => return Err(other), + } + + if op % 30 == 0 { + m.lifecycle_attempted += 1; + let cid = format!("{owner}-l-{op}"); + let opened = db_clone + 
.order_book_new_as( + caller.clone(), + "p", + "app", + req( + "BTC-USD", + &owner, + cid.clone(), + crate::order_book::OrderSide::Bid, + crate::order_book::TimeInForce::Gtc, + 970, + 1, + nonce, + ), + ) + .await; + nonce += 1; + match opened { + Ok(_) => m.lifecycle_accepted += 1, + Err(AedbError::Validation(_)) | Err(AedbError::Conflict(_)) => { + m.lifecycle_rejected += 1 + } + Err(other) => return Err(other), + } + + m.lifecycle_attempted += 1; + match db_clone + .order_book_cancel_by_client_id_as( + caller.clone(), + "p", + "app", + "BTC-USD", + &cid, + &owner, + ) + .await + { + Ok(_) => m.lifecycle_accepted += 1, + Err(AedbError::Validation(_)) | Err(AedbError::Conflict(_)) => { + m.lifecycle_rejected += 1 + } + Err(other) => return Err(other), + } + } + + if op % 40 == 0 { + let own = db_clone + .open_orders( + "p", + "app", + "BTC-USD", + &owner, + ConsistencyMode::AtLatest, + &caller, + ) + .await + .map_err(|e| { + AedbError::Validation(format!("own open_orders failed: {e}")) + })?; + assert!( + own.iter().all(|o| o.owner == owner), + "open_orders must only return owner rows" + ); + m.own_read_checks += 1; + } + } + Ok::<_, AedbError>(m) + })); + } + + let mut metrics = Vec::with_capacity(agents.len()); + for task in tasks { + metrics.push(task.await.expect("join agent task").expect("agent run")); + } + + for (idx, agent_id) in agents.iter().enumerate() { + let caller = CallerContext::new(agent_id.clone()); + let (target_owner, target_order_id) = &anchors[(idx + 1) % anchors.len()]; + + let err = db + .order_status( + "p", + "app", + "BTC-USD", + *target_order_id, + ConsistencyMode::AtLatest, + &caller, + ) + .await + .expect_err("cross-owner order_status must be denied"); + assert!( + matches!(err, QueryError::PermissionDenied { .. 
}), + "expected permission denied, got {err:?}" + ); + + let err = db + .open_orders( + "p", + "app", + "BTC-USD", + target_owner, + ConsistencyMode::AtLatest, + &caller, + ) + .await + .expect_err("cross-owner open_orders must be denied"); + assert!( + matches!(err, QueryError::PermissionDenied { .. }), + "expected permission denied, got {err:?}" + ); + } + + for m in &metrics { + assert_eq!( + m.primary_accepted + m.primary_rejected, + m.primary_attempted, + "primary accounting mismatch" + ); + assert_eq!( + m.lifecycle_accepted + m.lifecycle_rejected, + m.lifecycle_attempted, + "lifecycle accounting mismatch" + ); + assert!( + m.own_read_checks > 0, + "agent should perform own-read checks" + ); + } +} + +#[tokio::test] +async fn commit_success_is_observable_at_its_commit_seq() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let result = db + .commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"inclusion-proof".to_vec(), + value: b"ok".to_vec(), + }) + .await + .expect("commit"); + + let at_seq = db + .kv_get_no_auth( + "p", + "app", + b"inclusion-proof", + ConsistencyMode::AtSeq(result.commit_seq), + ) + .await + .expect("kv_get at seq") + .expect("value present at commit seq"); + assert_eq!(at_seq.value, b"ok".to_vec()); +} + +#[tokio::test] +async fn failed_multi_mutation_envelope_is_atomic_and_has_no_partial_effects() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let before = db.head_state().await.visible_head_seq; + let err = db + .commit_envelope(TransactionEnvelope { + caller: None, + idempotency_key: None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: Default::default(), + write_intent: WriteIntent { + mutations: vec![ + Mutation::KvSet { + 
project_id: "p".into(), + scope_id: "app".into(), + key: b"atomic-a".to_vec(), + value: b"1".to_vec(), + }, + Mutation::KvDecU256 { + project_id: "p".into(), + scope_id: "app".into(), + key: b"missing-counter".to_vec(), + amount_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + }, + ], + }, + base_seq: before, + }) + .await + .expect_err("envelope should fail"); + assert!( + matches!( + err, + AedbError::Underflow | AedbError::Validation(_) | AedbError::Conflict(_) + ), + "expected semantic failure, got: {err:?}" + ); + + let after = db.head_state().await.visible_head_seq; + assert_eq!(after, before, "failed envelope must not advance head"); + let leaked = db + .kv_get_no_auth("p", "app", b"atomic-a", ConsistencyMode::AtLatest) + .await + .expect("kv_get") + .is_some(); + assert!( + !leaked, + "failed envelope must not partially apply mutations" + ); +} + +#[tokio::test] +async fn idempotent_retry_does_not_double_apply_non_idempotent_mutation() { + let dir = tempdir().expect("temp"); + let db = Arc::new(AedbInstance::open(AedbConfig::default(), dir.path()).expect("open")); + db.create_project("p").await.expect("project"); + + let key = IdempotencyKey([42u8; 16]); + let mut tasks = Vec::new(); + for _ in 0..8 { + let db = Arc::clone(&db); + let key = key.clone(); + tasks.push(tokio::spawn(async move { + db.commit_envelope(TransactionEnvelope { + caller: None, + idempotency_key: Some(key), + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: Default::default(), + write_intent: WriteIntent { + mutations: vec![Mutation::KvIncU256 { + project_id: "p".into(), + scope_id: "app".into(), + key: b"idem-counter".to_vec(), + amount_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + }], + }, + base_seq: 0, + }) + .await + .expect("idempotent commit") + })); + } + + let mut seqs = std::collections::BTreeSet::new(); + for t in tasks { + let res = t.await.expect("join"); + seqs.insert(res.commit_seq); + } + assert_eq!(seqs.len(), 1, "all 
retries must resolve to one commit_seq"); + + let entry = db + .kv_get_no_auth("p", "app", b"idem-counter", ConsistencyMode::AtLatest) + .await + .expect("kv_get") + .expect("counter exists"); + assert_eq!( + primitive_types::U256::from_big_endian(&entry.value), + primitive_types::U256::one(), + "idempotent retries must apply mutation exactly once" + ); +} + +#[tokio::test] +async fn strict_cancel_rejects_missing_order() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let err = db + .order_book_cancel_strict( + "p", + "app", + "BTC-USD", + 999_999, + "alice", + CommitFinality::Visible, + ) + .await + .expect_err("strict cancel should fail when target is missing"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn strict_cancel_rejects_already_final_order() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new( + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "strict-final".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Gtc, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect("place order"); + + db.order_book_cancel_strict("p", "app", "BTC-USD", 1, "alice", CommitFinality::Visible) + .await + .expect("first strict cancel"); + + let err = db + .order_book_cancel_strict("p", "app", "BTC-USD", 1, "alice", CommitFinality::Visible) + .await + 
.expect_err("second strict cancel should fail on already-cancelled order"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn strict_cancel_by_client_id_rejects_missing_mapping() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let err = db + .order_book_cancel_by_client_id_strict( + "p", + "app", + "BTC-USD", + "missing-client-order-id", + "alice", + CommitFinality::Visible, + ) + .await + .expect_err("strict cancel by client id should fail when mapping is missing"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn strict_cancel_by_client_id_rejects_already_final_order() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new( + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "strict-client-final".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Gtc, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect("place order"); + + db.order_book_cancel_by_client_id_strict( + "p", + "app", + "BTC-USD", + "strict-client-final", + "alice", + CommitFinality::Visible, + ) + .await + .expect("first strict cancel by client id"); + + let err = db + .order_book_cancel_by_client_id_strict( + "p", + "app", + "BTC-USD", + "strict-client-final", + "alice", + CommitFinality::Visible, + ) + .await + .expect_err("second strict cancel by client id 
should fail on finalized order"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn strict_cancel_by_client_id_rejects_invalid_mapping_encoding() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: crate::order_book::key_client_id("BTC-USD", "alice", "cid-corrupt"), + value: vec![1, 2, 3, 4], + }) + .await + .expect("inject corrupted mapping"); + + let err = db + .order_book_cancel_by_client_id_strict( + "p", + "app", + "BTC-USD", + "cid-corrupt", + "alice", + CommitFinality::Visible, + ) + .await + .expect_err("strict cancel by client id should reject malformed mapping"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn strict_cancel_by_client_id_detects_owner_mismatch_under_tampered_mapping() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new( + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "cid-owner-a".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Gtc, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect("place order"); + + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: crate::order_book::key_client_id("BTC-USD", "bob", "cid-owner-b"), + value: 1u64.to_be_bytes().to_vec(), + }) + .await 
+ .expect("inject tampered mapping"); + + let err = db + .order_book_cancel_by_client_id_strict( + "p", + "app", + "BTC-USD", + "cid-owner-b", + "bob", + CommitFinality::Visible, + ) + .await + .expect_err("strict cancel by client id should reject owner mismatch"); + assert!(matches!(err, AedbError::PermissionDenied(_))); +} + +#[tokio::test] +async fn strict_reduce_rejects_missing_order() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + let mut one = [0u8; 32]; + one[31] = 1; + let err = db + .order_book_reduce_strict( + "p", + "app", + "BTC-USD", + 777, + "alice", + one, + CommitFinality::Visible, + ) + .await + .expect_err("strict reduce should fail on missing order"); + assert!(matches!(err, AedbError::Validation(_))); +} + +#[tokio::test] +async fn strict_cancel_replace_rejects_already_final_order() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new( + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "strict-cr-final".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Gtc, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect("place order"); + db.order_book_cancel_strict("p", "app", "BTC-USD", 1, "alice", CommitFinality::Visible) + .await + .expect("cancel"); + + let err = db + .order_book_cancel_replace_strict( + "p", + "app", + "BTC-USD", + 1, + "alice", + Some(101), + None, + None, 
+ None, + CommitFinality::Visible, + ) + .await + .expect_err("strict cancel-replace should fail on finalized order"); + assert!(matches!(err, AedbError::Validation(_))); +} + #[tokio::test] async fn multi_update_transaction_envelope_updates_table_and_kv() { let dir = tempdir().expect("temp"); diff --git a/src/order_book.rs b/src/order_book.rs index 9c0c24d..3921167 100644 --- a/src/order_book.rs +++ b/src/order_book.rs @@ -2,6 +2,7 @@ use crate::error::AedbError; use crate::storage::keyspace::{Keyspace, KeyspaceSnapshot}; use primitive_types::U256; use serde::{Deserialize, Serialize}; +use std::ops::Bound; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)] #[repr(u8)] @@ -1412,21 +1413,23 @@ pub fn read_recent_trades( instrument: &str, limit: usize, ) -> Result, AedbError> { - let mut trades = snapshot_scan_prefix( - project_id, - scope_id, - &trade_prefix(instrument), - usize::MAX, - snapshot, - ); - if trades.len() > limit { - let start = trades.len() - limit; - trades = trades.split_off(start); + if limit == 0 { + return Ok(Vec::new()); } - let mut out = Vec::with_capacity(trades.len()); - for (_, entry) in trades { + let ns = crate::storage::keyspace::NamespaceId::project_scope(project_id, scope_id); + let Some(namespace) = snapshot.namespaces.get(&ns) else { + return Ok(Vec::new()); + }; + let prefix = trade_prefix(instrument); + let start = Bound::Included(prefix.clone()); + let end = prefix_range_end(&prefix) + .map(Bound::Excluded) + .unwrap_or(Bound::Unbounded); + let mut out = Vec::with_capacity(limit); + for (_, entry) in namespace.kv.entries.range((start, end)).rev().take(limit) { out.push(deserialize::(&entry.value)?); } + out.reverse(); Ok(out) } @@ -1633,16 +1636,31 @@ fn snapshot_scan_prefix( let Some(namespace) = snapshot.namespaces.get(&ns) else { return Vec::new(); }; + let start = Bound::Included(prefix.to_vec()); + let end = prefix_range_end(prefix) + .map(Bound::Excluded) + .unwrap_or(Bound::Unbounded); namespace .kv 
.entries - .iter() - .filter(|(k, _)| k.starts_with(prefix)) + .range((start, end)) .take(limit) .map(|(k, v)| (k.clone(), v.clone())) .collect() } +fn prefix_range_end(prefix: &[u8]) -> Option> { + let mut end = prefix.to_vec(); + for idx in (0..end.len()).rev() { + if end[idx] != u8::MAX { + end[idx] = end[idx].saturating_add(1); + end.truncate(idx + 1); + return Some(end); + } + } + None +} + fn allocate_next_id( keyspace: &mut Keyspace, project_id: &str, @@ -2162,3 +2180,113 @@ fn effective_request_for_config( }); Ok(next) } + +#[cfg(test)] +mod tests { + use super::*; + use crate::storage::keyspace::Keyspace; + + fn encode_fill(fill_id: u64, seq: u64) -> Vec { + rmp_serde::to_vec(&FillRecord { + fill_id, + instrument: "BTC-USD".to_string(), + price_ticks: 100 + fill_id as i64, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + aggressor_order_id: fill_id, + aggressor_owner: "agg".to_string(), + aggressor_side: OrderSide::Bid, + passive_order_id: fill_id + 10, + passive_owner: "pas".to_string(), + seq, + }) + .expect("encode fill") + } + + #[test] + fn read_recent_trades_returns_latest_n_in_order() { + let mut ks = Keyspace::default(); + for i in 1..=5u64 { + ks.kv_set( + "p", + "app", + key_trade("BTC-USD", i), + encode_fill(i, 100 + i), + 100 + i, + ); + } + let snapshot = ks.snapshot(); + let recent = + read_recent_trades(&snapshot, "p", "app", "BTC-USD", 3).expect("read recent trades"); + let ids: Vec = recent.into_iter().map(|f| f.fill_id).collect(); + assert_eq!(ids, vec![3, 4, 5]); + } + + #[test] + fn read_recent_trades_limit_zero_is_empty() { + let mut ks = Keyspace::default(); + ks.kv_set( + "p", + "app", + key_trade("BTC-USD", 1), + encode_fill(1, 101), + 101, + ); + let snapshot = ks.snapshot(); + let recent = + read_recent_trades(&snapshot, "p", "app", "BTC-USD", 0).expect("read recent trades"); + assert!(recent.is_empty()); + } + + #[test] + fn read_recent_trades_isolated_by_instrument_prefix() { + let mut ks = Keyspace::default(); 
+ for i in 1..=3u64 { + ks.kv_set( + "p", + "app", + key_trade("BTC-USD", i), + encode_fill(i, 100 + i), + 100 + i, + ); + ks.kv_set( + "p", + "app", + key_trade("ETH-USD", i), + encode_fill(100 + i, 200 + i), + 200 + i, + ); + } + let snapshot = ks.snapshot(); + let recent = + read_recent_trades(&snapshot, "p", "app", "BTC-USD", 10).expect("read recent trades"); + let ids: Vec = recent.into_iter().map(|f| f.fill_id).collect(); + assert_eq!(ids, vec![1, 2, 3]); + } + + #[test] + fn read_recent_trades_rejects_malformed_trade_payload() { + let mut ks = Keyspace::default(); + ks.kv_set("p", "app", key_trade("BTC-USD", 1), vec![0, 1, 2, 3], 101); + let snapshot = ks.snapshot(); + let err = read_recent_trades(&snapshot, "p", "app", "BTC-USD", 1) + .expect_err("malformed payload should fail decode"); + assert!(matches!(err, AedbError::Decode(_))); + } + + #[test] + fn prefix_range_end_handles_regular_and_all_ff_prefixes() { + let regular = vec![0x6f, 0x62, 0x3a, 0x00, 0x7f]; + let end = prefix_range_end(®ular).expect("regular prefix end"); + assert!(end > regular); + + let all_ff = vec![0xff, 0xff, 0xff]; + assert!( + prefix_range_end(&all_ff).is_none(), + "all-0xff prefix has no finite upper bound" + ); + } +} diff --git a/tests/order_book_simulation.rs b/tests/order_book_simulation.rs index aa76558..8e6387e 100644 --- a/tests/order_book_simulation.rs +++ b/tests/order_book_simulation.rs @@ -23,11 +23,16 @@ fn decode_u256_u64(bytes: [u8; 32]) -> u64 { u64::from_be_bytes(out) } -fn decode_u256_bytes_to_u64(bytes: &[u8]) -> u64 { - assert_eq!(bytes.len(), 32); +fn decode_u256_bytes_to_u64(bytes: &[u8]) -> Result { + if bytes.len() != 32 { + return Err(AedbError::Validation(format!( + "invalid u256 byte length: {}", + bytes.len() + ))); + } let mut out = [0u8; 8]; out.copy_from_slice(&bytes[24..]); - u64::from_be_bytes(out) + Ok(u64::from_be_bytes(out)) } fn request( @@ -63,8 +68,8 @@ fn request( } } -async fn setup_books(db: &AedbInstance, assets: &[String]) { - 
db.create_project("p").await.expect("project"); +async fn setup_books(db: &AedbInstance, assets: &[String]) -> Result<(), AedbError> { + db.create_project("p").await?; for asset in assets { db.order_book_set_instrument_config( "p", @@ -81,8 +86,7 @@ async fn setup_books(db: &AedbInstance, assets: &[String]) { balance_config: None, }, ) - .await - .expect("config"); + .await?; // Seed symmetric depth around 1_000 ticks. for i in 0..20_u64 { @@ -103,8 +107,7 @@ async fn setup_books(db: &AedbInstance, assets: &[String]) { 1, ), ) - .await - .expect("seed ask"); + .await?; let bid_owner = format!("seed_bid_{}_{}", asset, i); db.order_book_new( @@ -123,20 +126,79 @@ async fn setup_books(db: &AedbInstance, assets: &[String]) { 1, ), ) + .await?; + } + } + Ok(()) +} + +#[derive(Debug, Default)] +struct ChaosMetrics { + primary_attempted: usize, + primary_accepted: usize, + primary_rejected: usize, + lifecycle_attempted: usize, + lifecycle_accepted: usize, + lifecycle_rejected: usize, + reader_checks: usize, +} + +async fn validate_asset_read_consistency(db: &AedbInstance, asset: &str) -> Result<(), AedbError> { + let rows = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{asset}:ord:").as_bytes(), + 2_000_000, + ConsistencyMode::AtLatest, + ) + .await + .map_err(|e| AedbError::Validation(e.to_string()))?; + for (_, entry) in rows { + let order: aedb::order_book::OrderRecord = + rmp_serde::from_slice(&entry.value).map_err(|e| AedbError::Decode(e.to_string()))?; + let original = decode_u256_u64(order.original_qty_be); + let remaining = decode_u256_u64(order.remaining_qty_be); + let filled = decode_u256_u64(order.filled_qty_be); + if remaining + filled > original { + return Err(AedbError::Validation(format!( + "quantity accounting violated in live read for {asset}" + ))); + } + } + + for side in [OrderSide::Bid, OrderSide::Ask] { + let levels = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{asset}:plqty:{}:", side as u8).as_bytes(), + 2_000_000, + 
ConsistencyMode::AtLatest, + ) .await - .expect("seed bid"); + .map_err(|e| AedbError::Validation(e.to_string()))?; + for (k, v) in levels { + let qty = decode_u256_bytes_to_u64(&v.value)?; + if qty == 0 { + continue; + } + parse_plqty_price(side, &k).ok_or_else(|| { + AedbError::Validation(format!("failed to parse level price for {asset}")) + })?; } } + Ok(()) } async fn run_simulation( assets: Vec, traders: usize, ops_per_trader: usize, -) -> Arc { - let dir = tempdir().expect("temp"); - let db = Arc::new(AedbInstance::open(Default::default(), dir.path()).expect("open")); - setup_books(&db, &assets).await; +) -> Result<(Arc, ChaosMetrics), AedbError> { + let dir = tempdir().map_err(AedbError::Io)?; + let db = Arc::new(AedbInstance::open(Default::default(), dir.path())?); + setup_books(&db, &assets).await?; let mut tasks = Vec::with_capacity(traders); for t in 0..traders { @@ -146,6 +208,12 @@ async fn run_simulation( let owner = format!("trader_{t}"); let mut nonces: BTreeMap = BTreeMap::new(); let mut rng = StdRng::seed_from_u64(42 + t as u64); + let mut primary_attempted = 0usize; + let mut primary_accepted = 0usize; + let mut primary_rejected = 0usize; + let mut lifecycle_attempted = 0usize; + let mut lifecycle_accepted = 0usize; + let mut lifecycle_rejected = 0usize; for op in 0..ops_per_trader { let asset = &assets_clone[rng.gen_range(0..assets_clone.len())]; @@ -171,6 +239,7 @@ async fn run_simulation( }; let post_only = order_type == OrderType::Limit && rng.gen_bool(0.05); + primary_attempted += 1; let res = db_clone .order_book_new( "p", @@ -190,11 +259,14 @@ async fn run_simulation( ) .await; - if let Err(err) = res { - // Expected rejects under stress: FOK, market no liquidity, post-only crossing. - match err { - AedbError::Validation(_) => {} - other => panic!("unexpected simulation error: {other:?}"), + match res { + Ok(_) => primary_accepted += 1, + Err(err) => { + // Expected rejects under stress: FOK, market no liquidity, post-only crossing. 
+ match err { + AedbError::Validation(_) => primary_rejected += 1, + other => return Err(other), + } } } @@ -202,7 +274,8 @@ async fn run_simulation( if op % 100 == 0 { *nonce += 1; let cid = format!("gtc-{owner}-{op}"); - let _ = db_clone + lifecycle_attempted += 1; + match db_clone .order_book_new( "p", "app", @@ -219,23 +292,86 @@ async fn run_simulation( *nonce, ), ) - .await; - let _ = db_clone + .await + { + Ok(_) => lifecycle_accepted += 1, + Err(AedbError::Validation(_)) => lifecycle_rejected += 1, + Err(other) => return Err(other), + } + + lifecycle_attempted += 1; + match db_clone .order_book_cancel_by_client_id("p", "app", asset, &cid, &owner) - .await; + .await + { + Ok(_) => lifecycle_accepted += 1, + Err(AedbError::Validation(_)) => lifecycle_rejected += 1, + Err(other) => return Err(other), + } } } + Ok(ChaosMetrics { + primary_attempted, + primary_accepted, + primary_rejected, + lifecycle_attempted, + lifecycle_accepted, + lifecycle_rejected, + reader_checks: 0, + }) })); } + let reader_workers = (traders / 2).max(4); + let reader_loops = (ops_per_trader / 2).max(200); + for r in 0..reader_workers { + let db_clone = Arc::clone(&db); + let assets_clone = assets.clone(); + tasks.push(tokio::spawn(async move { + let mut rng = StdRng::seed_from_u64(9_000 + r as u64); + let mut checks = 0usize; + for _ in 0..reader_loops { + let asset = &assets_clone[rng.gen_range(0..assets_clone.len())]; + validate_asset_read_consistency(db_clone.as_ref(), asset).await?; + checks += 1; + } + Ok(ChaosMetrics { + reader_checks: checks, + ..Default::default() + }) + })); + } + + let mut metrics = ChaosMetrics::default(); for task in tasks { - task.await.expect("task join"); + let worker = task + .await + .map_err(|e| AedbError::Validation(format!("simulation task join failure: {e}")))?; + let worker = worker?; + metrics.primary_attempted += worker.primary_attempted; + metrics.primary_accepted += worker.primary_accepted; + metrics.primary_rejected += 
worker.primary_rejected; + metrics.lifecycle_attempted += worker.lifecycle_attempted; + metrics.lifecycle_accepted += worker.lifecycle_accepted; + metrics.lifecycle_rejected += worker.lifecycle_rejected; + metrics.reader_checks += worker.reader_checks; } - db + if metrics.primary_accepted + metrics.primary_rejected != metrics.primary_attempted { + return Err(AedbError::Validation( + "primary flow accounting mismatch".into(), + )); + } + if metrics.lifecycle_accepted + metrics.lifecycle_rejected != metrics.lifecycle_attempted { + return Err(AedbError::Validation( + "lifecycle flow accounting mismatch".into(), + )); + } + + Ok((db, metrics)) } -async fn assert_book_invariants(db: &AedbInstance, assets: &[String]) { +async fn assert_book_invariants(db: &AedbInstance, assets: &[String]) -> Result<(), AedbError> { for asset in assets { let mut from_orders: BTreeMap<(u8, i64), u64> = BTreeMap::new(); @@ -248,18 +384,19 @@ async fn assert_book_invariants(db: &AedbInstance, assets: &[String]) { ConsistencyMode::AtLatest, ) .await - .expect("scan orders"); + .map_err(|e| AedbError::Validation(e.to_string()))?; for (_, entry) in rows { - let order: aedb::order_book::OrderRecord = - rmp_serde::from_slice(&entry.value).expect("decode order"); + let order: aedb::order_book::OrderRecord = rmp_serde::from_slice(&entry.value) + .map_err(|e| AedbError::Decode(e.to_string()))?; let original = decode_u256_u64(order.original_qty_be); let remaining = decode_u256_u64(order.remaining_qty_be); let filled = decode_u256_u64(order.filled_qty_be); - assert!( - remaining + filled <= original, - "quantity accounting invariant" - ); + if remaining + filled > original { + return Err(AedbError::Validation( + "quantity accounting invariant violated".into(), + )); + } if remaining > 0 && matches!( order.status, @@ -283,29 +420,61 @@ async fn assert_book_invariants(db: &AedbInstance, assets: &[String]) { ConsistencyMode::AtLatest, ) .await - .expect("scan levels"); + .map_err(|e| 
AedbError::Validation(e.to_string()))?; for (k, v) in levels { - let qty = decode_u256_bytes_to_u64(&v.value); + let qty = decode_u256_bytes_to_u64(&v.value)?; if qty == 0 { continue; } - let price = parse_plqty_price(side, &k).expect("parse level price"); + let price = parse_plqty_price(side, &k) + .ok_or_else(|| AedbError::Validation("failed to parse level price".into()))?; from_levels.insert((side as u8, price), qty); } } - assert_eq!( - from_orders, from_levels, - "price-level aggregates must match open orders for {asset}" - ); + if from_orders != from_levels { + return Err(AedbError::Validation(format!( + "price-level aggregates mismatch for {asset}" + ))); + } } + Ok(()) } #[tokio::test] async fn order_book_simulation_smoke() { let assets = vec!["BTC-USD".to_string(), "ETH-USD".to_string()]; - let db = run_simulation(assets.clone(), 6, 250).await; - assert_book_invariants(&db, &assets).await; + let (db, metrics) = run_simulation(assets.clone(), 6, 250) + .await + .expect("run simulation"); + assert!(metrics.reader_checks > 0, "reader workers should execute"); + assert_book_invariants(&db, &assets) + .await + .expect("final invariants"); +} + +#[tokio::test] +async fn order_book_chaos_read_write_accuracy() { + let assets = vec![ + "BTC-USD".to_string(), + "ETH-USD".to_string(), + "SOL-USD".to_string(), + "DOGE-USD".to_string(), + ]; + let (db, metrics) = run_simulation(assets.clone(), 16, 800) + .await + .expect("chaos run"); + assert!( + metrics.primary_attempted >= 16 * 800, + "writers should execute full primary load" + ); + assert!( + metrics.reader_checks >= 1_000, + "read-side chaos checks should be substantial" + ); + assert_book_invariants(&db, &assets) + .await + .expect("final invariants"); } #[tokio::test] @@ -317,6 +486,10 @@ async fn order_book_simulation_hft_soak() { "SOL-USD".to_string(), "DOGE-USD".to_string(), ]; - let db = run_simulation(assets.clone(), 24, 2_000).await; - assert_book_invariants(&db, &assets).await; + let (db, _metrics) = 
run_simulation(assets.clone(), 24, 2_000) + .await + .expect("hft soak"); + assert_book_invariants(&db, &assets) + .await + .expect("final invariants"); } diff --git a/tests/security_properties.rs b/tests/security_properties.rs new file mode 100644 index 0000000..64bf521 --- /dev/null +++ b/tests/security_properties.rs @@ -0,0 +1,155 @@ +use aedb::AedbInstance; +use aedb::commit::tx::{IdempotencyKey, ReadSet, TransactionEnvelope, WriteClass, WriteIntent}; +use aedb::commit::validation::Mutation; +use aedb::config::AedbConfig; +use aedb::error::AedbError; +use aedb::offline; +use aedb::query::plan::ConsistencyMode; +use tempfile::tempdir; + +fn one_u256() -> [u8; 32] { + let mut out = [0u8; 32]; + out[31] = 1; + out +} + +#[tokio::test] +async fn security_atomicity_no_partial_apply_on_envelope_failure() { + let dir = tempdir().expect("temp dir"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let err = db + .commit_envelope(TransactionEnvelope { + caller: None, + idempotency_key: None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: ReadSet::default(), + write_intent: WriteIntent { + mutations: vec![ + Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"must_not_persist".to_vec(), + value: b"x".to_vec(), + }, + Mutation::KvDecU256 { + project_id: "p".into(), + scope_id: "app".into(), + key: b"missing-counter".to_vec(), + amount_be: one_u256(), + }, + ], + }, + base_seq: 0, + }) + .await + .expect_err("envelope should fail atomically"); + assert!(matches!( + err, + AedbError::Underflow | AedbError::Validation(_) + )); + + let entry = db + .kv_get_no_auth("p", "app", b"must_not_persist", ConsistencyMode::AtLatest) + .await + .expect("kv read"); + assert!(entry.is_none(), "failing envelope must not partially apply"); +} + +#[tokio::test] +async fn security_idempotency_survives_restart_exactly_once() { + let dir = 
tempdir().expect("temp dir"); + let config = AedbConfig::production([7u8; 32]); + let db = AedbInstance::open(config.clone(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let key = IdempotencyKey([4u8; 16]); + let envelope = TransactionEnvelope { + caller: None, + idempotency_key: Some(key), + write_class: WriteClass::Economic, + assertions: Vec::new(), + read_set: ReadSet::default(), + write_intent: WriteIntent { + mutations: vec![Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"idem-restart".to_vec(), + value: b"v1".to_vec(), + }], + }, + base_seq: 0, + }; + + let first = db + .commit_envelope(envelope.clone()) + .await + .expect("first commit"); + let second = db + .commit_envelope(envelope.clone()) + .await + .expect("idempotent retry"); + assert_eq!(second.commit_seq, first.commit_seq); + db.shutdown().await.expect("shutdown"); + drop(db); + + let reopened = AedbInstance::open(config, dir.path()).expect("reopen"); + let third = reopened + .commit_envelope(envelope) + .await + .expect("idempotent retry after restart"); + assert_eq!(third.commit_seq, first.commit_seq); +} + +#[tokio::test] +async fn security_replay_is_deterministic_via_snapshot_parity() { + let dir = tempdir().expect("temp dir"); + let dump_a = tempdir().expect("dump a"); + let dump_b = tempdir().expect("dump b"); + let dump_a_file = dump_a.path().join("state-a.aedbdump"); + let dump_b_file = dump_b.path().join("state-b.aedbdump"); + let config = AedbConfig::production([8u8; 32]); + + let db = AedbInstance::open(config.clone(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + for i in 0..500u64 { + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("replay:{i}").into_bytes(), + value: i.to_be_bytes().to_vec(), + }) + .await + .expect("commit"); + } + db.shutdown().await.expect("shutdown"); + + let report_a = + offline::export_snapshot_dump(dir.path(), 
&config, &dump_a_file).expect("export a"); + let report_b = + offline::export_snapshot_dump(dir.path(), &config, &dump_b_file).expect("export b"); + assert_eq!(report_a.current_seq, report_b.current_seq); + assert_eq!( + report_a.parity_checksum_hex, report_b.parity_checksum_hex, + "replay parity must be deterministic" + ); +} + +#[tokio::test] +async fn security_secure_mode_enforces_authenticated_commit_calls() { + let dir = tempdir().expect("temp dir"); + let db = AedbInstance::open_secure(AedbConfig::production([9u8; 32]), dir.path()) + .expect("open secure"); + + let err = db + .commit(Mutation::Ddl(aedb::catalog::DdlOperation::CreateProject { + owner_id: None, + if_not_exists: true, + project_id: "p".into(), + })) + .await + .expect_err("anonymous commit should be rejected in secure mode"); + assert!(matches!(err, AedbError::PermissionDenied(_))); +} diff --git a/tests/security_properties_proptest.rs b/tests/security_properties_proptest.rs new file mode 100644 index 0000000..54dcd1d --- /dev/null +++ b/tests/security_properties_proptest.rs @@ -0,0 +1,196 @@ +use aedb::AedbInstance; +use aedb::catalog::DdlOperation; +use aedb::commit::tx::{IdempotencyKey, ReadSet, TransactionEnvelope, WriteClass, WriteIntent}; +use aedb::commit::validation::Mutation; +use aedb::config::AedbConfig; +use aedb::error::AedbError; +use aedb::offline; +use aedb::query::plan::ConsistencyMode; +use proptest::prelude::*; +use proptest::test_runner::TestCaseError; +use tempfile::tempdir; + +fn one_u256() -> [u8; 32] { + let mut out = [0u8; 32]; + out[31] = 1; + out +} + +proptest! { + #![proptest_config(ProptestConfig { + cases: 8, + max_local_rejects: 0, + .. 
ProptestConfig::default() + })] + + #[test] + fn prop_atomicity_no_partial_apply( + suffix in prop::collection::vec(any::(), 4..16), + value in prop::collection::vec(any::(), 1..64), + ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("runtime"); + let outcome: Result<(), TestCaseError> = rt.block_on(async move { + let dir = tempdir().expect("temp dir"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let mut key = b"atomic-prop:".to_vec(); + key.extend_from_slice(&suffix); + let err = db + .commit_envelope(TransactionEnvelope { + caller: None, + idempotency_key: None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: ReadSet::default(), + write_intent: WriteIntent { + mutations: vec![ + Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: key.clone(), + value, + }, + Mutation::KvDecU256 { + project_id: "p".into(), + scope_id: "app".into(), + key: b"missing-counter".to_vec(), + amount_be: one_u256(), + }, + ], + }, + base_seq: 0, + }) + .await + .expect_err("envelope should fail atomically"); + assert!(matches!(err, AedbError::Underflow | AedbError::Validation(_))); + + let entry = db + .kv_get_no_auth("p", "app", &key, ConsistencyMode::AtLatest) + .await + .expect("kv read"); + prop_assert!(entry.is_none()); + Ok(()) + }); + outcome?; + } + + #[test] + fn prop_idempotency_exactly_once_across_retries( + key_seed in any::(), + retries in 2u8..6, + payload in prop::collection::vec(any::(), 1..48), + ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("runtime"); + let outcome: Result<(), TestCaseError> = rt.block_on(async move { + let dir = tempdir().expect("temp dir"); + let db = AedbInstance::open(AedbConfig::production([7u8; 32]), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let idem = 
IdempotencyKey(key_seed.to_be_bytes()); + let envelope = TransactionEnvelope { + caller: None, + idempotency_key: Some(idem), + write_class: WriteClass::Economic, + assertions: Vec::new(), + read_set: ReadSet::default(), + write_intent: WriteIntent { + mutations: vec![Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"idem-prop".to_vec(), + value: payload, + }], + }, + base_seq: 0, + }; + let first = db + .commit_envelope(envelope.clone()) + .await + .expect("first commit"); + for _ in 0..retries { + let again = db + .commit_envelope(envelope.clone()) + .await + .expect("idempotent retry"); + prop_assert_eq!(again.commit_seq, first.commit_seq); + } + Ok(()) + }); + outcome?; + } + + #[test] + fn prop_replay_determinism_snapshot_parity( + seed in any::(), + writes in 16usize..96, + ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("runtime"); + let outcome: Result<(), TestCaseError> = rt.block_on(async move { + let dir = tempdir().expect("temp dir"); + let dump_a_dir = tempdir().expect("dump a"); + let dump_b_dir = tempdir().expect("dump b"); + let dump_a = dump_a_dir.path().join("a.aedbdump"); + let dump_b = dump_b_dir.path().join("b.aedbdump"); + let config = AedbConfig::production([8u8; 32]); + + let db = AedbInstance::open(config.clone(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + for i in 0..writes { + let key = format!("replay-prop:{seed}:{i}").into_bytes(); + let value = ((seed as usize) ^ i).to_be_bytes().to_vec(); + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key, + value, + }) + .await + .expect("commit"); + } + db.shutdown().await.expect("shutdown"); + + let report_a = + offline::export_snapshot_dump(dir.path(), &config, &dump_a).expect("export a"); + let report_b = + offline::export_snapshot_dump(dir.path(), &config, &dump_b).expect("export b"); + prop_assert_eq!(report_a.current_seq, report_b.current_seq); 
+ prop_assert_eq!(report_a.parity_checksum_hex, report_b.parity_checksum_hex); + Ok(()) + }); + outcome?; + } + + #[test] + fn prop_secure_mode_rejects_unauthenticated_commits(project_suffix in 0u32..10_000) { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("runtime"); + let outcome: Result<(), TestCaseError> = rt.block_on(async move { + let dir = tempdir().expect("temp dir"); + let db = AedbInstance::open_secure(AedbConfig::production([9u8; 32]), dir.path()) + .expect("open secure"); + let err = db + .commit(Mutation::Ddl(DdlOperation::CreateProject { + owner_id: None, + if_not_exists: true, + project_id: format!("p-{project_suffix}"), + })) + .await + .expect_err("secure mode must require authenticated caller"); + prop_assert!(matches!(err, AedbError::PermissionDenied(_))); + Ok(()) + }); + outcome?; + } +} From 2906a6a5272ecc28ee3054b8646f05a588205efe Mon Sep 17 00:00:00 2001 From: johnny Date: Thu, 26 Feb 2026 15:14:17 -0500 Subject: [PATCH 3/4] fuzzy matching --- README.md | 4 +- benches/perf.rs | 8 +- crates/aedb-orderbook/Cargo.toml | 2 +- crates/aedb-orderbook/src/lib.rs | 428 ++++- src/backup/mod.rs | 10 +- src/bin/aedb.rs | 6 +- src/checkpoint/loader.rs | 13 +- src/checkpoint/writer.rs | 5 +- src/commit/apply.rs | 212 ++- src/commit/assertions.rs | 4 +- src/commit/executor/global_index.rs | 52 +- src/commit/executor/internals.rs | 237 ++- src/commit/executor/mod.rs | 297 +++- src/commit/executor/tests.rs | 8 +- src/commit/validation.rs | 33 +- src/config.rs | 18 + src/declarative.rs | 48 +- src/lib.rs | 517 ++++-- src/lib_helpers.rs | 112 +- src/lib_tests.rs | 2159 +++++++++++++++++++++++-- src/offline.rs | 183 ++- src/order_book.rs | 136 +- src/query/executor.rs | 64 +- src/query/operators.rs | 123 +- src/recovery/mod.rs | 30 +- src/recovery/replay.rs | 19 +- src/recovery/scanner.rs | 23 +- src/storage/encoded_key.rs | 11 +- src/storage/index.rs | 21 +- src/storage/keyspace.rs | 11 +- src/version_store.rs | 4 
+- src/wal/frame.rs | 87 +- src/wal/segment.rs | 3 +- tests/backup_restore.rs | 8 + tests/benchmark_gate.rs | 4 +- tests/crash_matrix.rs | 3 +- tests/naming_conventions.rs | 126 ++ tests/order_book_simulation.rs | 239 ++- tests/security_properties_proptest.rs | 61 + tests/wal_frame_robustness.rs | 35 + 40 files changed, 4595 insertions(+), 769 deletions(-) create mode 100644 tests/naming_conventions.rs create mode 100644 tests/wal_frame_robustness.rs diff --git a/README.md b/README.md index 0aea2d1..ca6de91 100644 --- a/README.md +++ b/README.md @@ -144,7 +144,7 @@ let db = aedb::AedbInstance::open(config, dir.path())?; AEDB supports permission-aware APIs via `CallerContext` and `Permission`. - `open_production` and `open_secure` require authenticated `*_as` calls -- `open_secure` enforces hardened durability/recovery settings +- `open_secure` enforces hardened durability/recovery settings (`DurabilityMode::Full`, strict recovery, hash chain, HMAC) - table/KV/query access can be scoped per project/scope/resource - `authz_audit` and `assertion_audit` system tables provide built-in audit trails @@ -155,7 +155,7 @@ Security/operations docs: ## Operational APIs -- `checkpoint_now()` to force a checkpoint +- `checkpoint_now()` to force a fuzzy checkpoint (does not block commit/query traffic) - `backup_full(...)` / restore helpers for backup workflows - `operational_metrics()` for commit latency, queue depth, durable head lag, and more diff --git a/benches/perf.rs b/benches/perf.rs index 1f6f0bc..6088f02 100644 --- a/benches/perf.rs +++ b/benches/perf.rs @@ -131,8 +131,8 @@ fn bench_aedb_hot_paths(c: &mut Criterion) { if next_multi_commit_base > SEEDED_ROWS { next_multi_commit_base = 1; } - for offset in 0..BATCH_INSERT_ROWS { - let id = ((base + offset - 1) % SEEDED_ROWS) + 1; + for row_offset in 0..BATCH_INSERT_ROWS { + let id = ((base + row_offset - 1) % SEEDED_ROWS) + 1; seed_db .commit(Mutation::Upsert { project_id: PROJECT_ID.into(), @@ -164,8 +164,8 @@ fn 
bench_aedb_hot_paths(c: &mut Criterion) { next_batch_commit_base = 1; } let mut rows = Vec::with_capacity(BATCH_INSERT_ROWS as usize); - for offset in 0..BATCH_INSERT_ROWS { - let id = ((base + offset - 1) % SEEDED_ROWS) + 1; + for row_offset in 0..BATCH_INSERT_ROWS { + let id = ((base + row_offset - 1) % SEEDED_ROWS) + 1; rows.push(Row { values: vec![ Value::Integer(id), diff --git a/crates/aedb-orderbook/Cargo.toml b/crates/aedb-orderbook/Cargo.toml index 65d6e68..5b10238 100644 --- a/crates/aedb-orderbook/Cargo.toml +++ b/crates/aedb-orderbook/Cargo.toml @@ -2,7 +2,7 @@ name = "aedb-orderbook" version = "0.1.0" edition = "2024" -description = "Order book simulation and stress utilities for AEDB" +description = "Order book workload toolkit and security validation harness for AEDB" license = "MIT OR Apache-2.0" [dependencies] diff --git a/crates/aedb-orderbook/src/lib.rs b/crates/aedb-orderbook/src/lib.rs index 18e8dfc..84c683f 100644 --- a/crates/aedb-orderbook/src/lib.rs +++ b/crates/aedb-orderbook/src/lib.rs @@ -1,4 +1,5 @@ use aedb::AedbInstance; +use aedb::catalog::DdlOperation; use aedb::commit::validation::Mutation; use aedb::config::{AedbConfig, DurabilityMode}; use aedb::error::AedbError; @@ -6,7 +7,8 @@ use aedb::order_book::{ ExecInstruction, InstrumentConfig, OrderBookTableMode, OrderRequest, OrderSide, OrderStatus, OrderType, TimeInForce, parse_plqty_price, scoped_instrument, }; -use aedb::query::plan::ConsistencyMode; +use aedb::permission::CallerContext; +use aedb::query::{KvCursor, plan::ConsistencyMode}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; use serde::{Deserialize, Serialize}; @@ -191,35 +193,247 @@ fn request( } } +async fn ensure_namespace( + db: &AedbInstance, + project_id: &str, + scope_id: &str, + caller: Option<&CallerContext>, +) -> Result<(), AedbError> { + match caller { + Some(caller) => { + db.commit_as( + caller.clone(), + Mutation::Ddl(DdlOperation::CreateProject { + owner_id: None, + project_id: 
project_id.to_string(), + if_not_exists: true, + }), + ) + .await?; + db.commit_as( + caller.clone(), + Mutation::Ddl(DdlOperation::CreateScope { + owner_id: None, + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + if_not_exists: true, + }), + ) + .await?; + } + None => { + db.create_project(project_id).await?; + db.create_scope(project_id, scope_id).await?; + } + } + Ok(()) +} + +async fn order_book_define_table( + db: &AedbInstance, + project_id: &str, + scope_id: &str, + table_id: &str, + mode: OrderBookTableMode, + caller: Option<&CallerContext>, +) -> Result<(), AedbError> { + match caller { + Some(caller) => { + db.order_book_define_table_as(caller.clone(), project_id, scope_id, table_id, mode) + .await?; + } + None => { + db.order_book_define_table(project_id, scope_id, table_id, mode) + .await?; + } + } + Ok(()) +} + +async fn order_book_set_instrument_config( + db: &AedbInstance, + project_id: &str, + scope_id: &str, + instrument: &str, + config: InstrumentConfig, + caller: Option<&CallerContext>, +) -> Result<(), AedbError> { + match caller { + Some(caller) => { + db.order_book_set_instrument_config_as( + caller.clone(), + project_id, + scope_id, + instrument, + config, + ) + .await?; + } + None => { + db.order_book_set_instrument_config(project_id, scope_id, instrument, config) + .await?; + } + } + Ok(()) +} + +async fn order_book_new( + db: &AedbInstance, + project_id: &str, + scope_id: &str, + request: OrderRequest, + caller: Option<&CallerContext>, +) -> Result<(), AedbError> { + match caller { + Some(caller) => { + db.order_book_new_as(caller.clone(), project_id, scope_id, request) + .await?; + } + None => { + db.order_book_new(project_id, scope_id, request).await?; + } + } + Ok(()) +} + +async fn order_book_cancel_by_client_id( + db: &AedbInstance, + project_id: &str, + scope_id: &str, + instrument: &str, + client_order_id: &str, + owner: &str, + caller: Option<&CallerContext>, +) -> Result<(), AedbError> { + match caller { + 
Some(caller) => { + db.order_book_cancel_by_client_id_as( + caller.clone(), + project_id, + scope_id, + instrument, + client_order_id, + owner, + ) + .await?; + } + None => { + db.order_book_cancel_by_client_id( + project_id, + scope_id, + instrument, + client_order_id, + owner, + ) + .await?; + } + } + Ok(()) +} + +async fn commit_many_atomic( + db: &AedbInstance, + mutations: Vec, + caller: Option<&CallerContext>, +) -> Result { + match caller { + Some(caller) => db.commit_many_atomic_as(caller.clone(), mutations).await, + None => db.commit_many_atomic(mutations).await, + } +} + +async fn scan_prefix_all( + db: &AedbInstance, + project_id: &str, + scope_id: &str, + prefix: &[u8], + caller: Option<&CallerContext>, +) -> Result, aedb::storage::keyspace::KvEntry)>, AedbError> { + match caller { + Some(caller) => { + let mut out = Vec::new(); + let mut cursor: Option = None; + loop { + let page = db + .kv_scan_prefix( + project_id, + scope_id, + prefix, + 10_000, + cursor, + ConsistencyMode::AtLatest, + caller, + ) + .await + .map_err(|e| AedbError::Validation(e.to_string()))?; + out.extend(page.entries); + if !page.truncated { + break; + } + cursor = page.cursor; + } + Ok(out) + } + None => db + .kv_scan_prefix_no_auth( + project_id, + scope_id, + prefix, + 1_000_000, + ConsistencyMode::AtLatest, + ) + .await + .map_err(|e| AedbError::Validation(e.to_string())), + } +} + async fn setup_books( db: &AedbInstance, + project_id: &str, + scope_id: &str, assets: &[String], table_profile: &TableProfile, + caller: Option<&CallerContext>, ) -> Result, AedbError> { - db.create_project("p").await?; + ensure_namespace(db, project_id, scope_id, caller).await?; let mut instruments = Vec::with_capacity(assets.len()); let multi_table_id = match table_profile { TableProfile::MultiAssetTable { table_id } => Some(table_id.clone()), _ => None, }; if let Some(table_id) = &multi_table_id { - db.order_book_define_table("p", "app", table_id, OrderBookTableMode::MultiAsset) - .await?; + 
order_book_define_table( + db, + project_id, + scope_id, + table_id, + OrderBookTableMode::MultiAsset, + caller, + ) + .await?; } for asset in assets { let instrument = match table_profile { TableProfile::NativeInstrument => asset.clone(), TableProfile::PerAssetTable => { - db.order_book_define_table("p", "app", asset, OrderBookTableMode::PerAsset) - .await?; + order_book_define_table( + db, + project_id, + scope_id, + asset, + OrderBookTableMode::PerAsset, + caller, + ) + .await?; scoped_instrument(asset, asset) } TableProfile::MultiAssetTable { table_id } => scoped_instrument(table_id, asset), }; instruments.push(instrument.clone()); - db.order_book_set_instrument_config( - "p", - "app", + order_book_set_instrument_config( + db, + project_id, + scope_id, &instrument, InstrumentConfig { instrument: instrument.clone(), @@ -231,14 +445,16 @@ async fn setup_books( halted: false, balance_config: None, }, + caller, ) .await?; for i in 0..20_u64 { let ask_owner = format!("seed_ask_{}_{}", instrument, i); - db.order_book_new( - "p", - "app", + order_book_new( + db, + project_id, + scope_id, request( &instrument, &ask_owner, @@ -251,13 +467,15 @@ async fn setup_books( 10, 1, ), + caller, ) .await?; let bid_owner = format!("seed_bid_{}_{}", instrument, i); - db.order_book_new( - "p", - "app", + order_book_new( + db, + project_id, + scope_id, request( &instrument, &bid_owner, @@ -270,6 +488,7 @@ async fn setup_books( 10, 1, ), + caller, ) .await?; } @@ -279,21 +498,22 @@ async fn setup_books( async fn assert_book_invariants( db: &AedbInstance, + project_id: &str, + scope_id: &str, instruments: &[String], + caller: Option<&CallerContext>, ) -> Result<(), AedbError> { for instrument in instruments { let mut from_orders: BTreeMap<(u8, i64), u64> = BTreeMap::new(); - let rows = db - .kv_scan_prefix_no_auth( - "p", - "app", - format!("ob:{instrument}:ord:").as_bytes(), - 1_000_000, - ConsistencyMode::AtLatest, - ) - .await - .map_err(|e| AedbError::Validation(e.to_string()))?; + 
let rows = scan_prefix_all( + db, + project_id, + scope_id, + format!("ob:{instrument}:ord:").as_bytes(), + caller, + ) + .await?; for (_, entry) in rows { let order: aedb::order_book::OrderRecord = rmp_serde::from_slice(&entry.value) @@ -320,16 +540,14 @@ async fn assert_book_invariants( let mut from_levels: BTreeMap<(u8, i64), u64> = BTreeMap::new(); for side in [OrderSide::Bid, OrderSide::Ask] { - let levels = db - .kv_scan_prefix_no_auth( - "p", - "app", - format!("ob:{instrument}:plqty:{}:", side as u8).as_bytes(), - 1_000_000, - ConsistencyMode::AtLatest, - ) - .await - .map_err(|e| AedbError::Validation(e.to_string()))?; + let levels = scan_prefix_all( + db, + project_id, + scope_id, + format!("ob:{instrument}:plqty:{}:", side as u8).as_bytes(), + caller, + ) + .await?; for (k, v) in levels { let qty = decode_u256_bytes_to_u64(&v.value)?; if qty == 0 { @@ -359,6 +577,28 @@ pub async fn run_hft_simulation(cfg: SimulationConfig) -> Result, + project_id: &str, + scope_id: &str, + caller: Option, +) -> Result { + run_hft_simulation_impl( + cfg, + db, + project_id, + scope_id, + caller, + "external".to_string(), + ) + .await +} + pub async fn run_hft_simulation_with_config( cfg: SimulationConfig, db_cfg: AedbConfig, @@ -371,7 +611,26 @@ pub async fn run_hft_simulation_with_config( .to_string(); let dir = tempdir().map_err(AedbError::Io)?; let db = Arc::new(AedbInstance::open(db_cfg, dir.path())?); - let instruments = setup_books(&db, &cfg.assets, &cfg.table_profile).await?; + run_hft_simulation_impl(cfg, db, "p", "app", None, durability_mode_name).await +} + +async fn run_hft_simulation_impl( + cfg: SimulationConfig, + db: Arc, + project_id: &str, + scope_id: &str, + caller: Option, + durability_mode_name: String, +) -> Result { + let instruments = setup_books( + &db, + project_id, + scope_id, + &cfg.assets, + &cfg.table_profile, + caller.as_ref(), + ) + .await?; let run_started = Instant::now(); let mut tasks = Vec::with_capacity(cfg.traders); @@ -383,6 +642,9 
@@ pub async fn run_hft_simulation_with_config( let seed = cfg.seed; let ops_per_trader = cfg.ops_per_trader; let orders_per_commit = cfg.orders_per_commit.max(1); + let project_id = project_id.to_string(); + let scope_id = scope_id.to_string(); + let caller = caller.clone(); tasks.push(tokio::spawn(async move { let owner = format!("trader_{t}"); let mut nonces: BTreeMap = BTreeMap::new(); @@ -452,8 +714,8 @@ pub async fn run_hft_simulation_with_config( pending_started = Some(Instant::now()); } pending_mutations.push(Mutation::OrderBookNew { - project_id: "p".to_string(), - scope_id: "app".to_string(), + project_id: project_id.clone(), + scope_id: scope_id.clone(), request: request( instrument, &owner, @@ -474,6 +736,7 @@ pub async fn run_hft_simulation_with_config( &mut pending_mutations, &mut pending_orders, &mut pending_started, + caller.as_ref(), cfg.collect_latency, &mut latencies_us, &mut accepted, @@ -492,24 +755,25 @@ pub async fn run_hft_simulation_with_config( *nonce += 1; let cid = format!("gtc-{owner}-{op}"); lifecycle_attempted += 1; - match db_clone - .order_book_new( - "p", - "app", - request( - instrument, - &owner, - cid.clone(), - side, - OrderType::Limit, - TimeInForce::Gtc, - false, - price, - qty, - *nonce, - ), - ) - .await + match order_book_new( + db_clone.as_ref(), + &project_id, + &scope_id, + request( + instrument, + &owner, + cid.clone(), + side, + OrderType::Limit, + TimeInForce::Gtc, + false, + price, + qty, + *nonce, + ), + caller.as_ref(), + ) + .await { Ok(_) => lifecycle_accepted += 1, Err(err) => { @@ -518,9 +782,16 @@ pub async fn run_hft_simulation_with_config( } } lifecycle_attempted += 1; - match db_clone - .order_book_cancel_by_client_id("p", "app", instrument, &cid, &owner) - .await + match order_book_cancel_by_client_id( + db_clone.as_ref(), + &project_id, + &scope_id, + instrument, + &cid, + &owner, + caller.as_ref(), + ) + .await { Ok(_) => lifecycle_accepted += 1, Err(err) => { @@ -535,6 +806,7 @@ pub async fn 
run_hft_simulation_with_config( &mut pending_mutations, &mut pending_orders, &mut pending_started, + caller.as_ref(), cfg.collect_latency, &mut latencies_us, &mut accepted, @@ -590,7 +862,7 @@ pub async fn run_hft_simulation_with_config( all_latencies_us.append(&mut latencies); } - assert_book_invariants(&db, &instruments).await?; + assert_book_invariants(&db, project_id, scope_id, &instruments, caller.as_ref()).await?; let elapsed_ms = run_started.elapsed().as_millis().max(1) as u64; db.force_fsync().await?; let heads = db.head_state().await; @@ -695,6 +967,7 @@ async fn flush_pending_orders( pending_mutations: &mut Vec, pending_orders: &mut usize, pending_started: &mut Option, + caller: Option<&CallerContext>, collect_latency: bool, latencies_us: &mut Vec, accepted: &mut usize, @@ -708,9 +981,7 @@ async fn flush_pending_orders( } let started = pending_started.take().unwrap_or_else(Instant::now); let batch_len = *pending_orders; - let res = db - .commit_many_atomic(std::mem::take(pending_mutations)) - .await; + let res = commit_many_atomic(db, std::mem::take(pending_mutations), caller).await; let elapsed = started.elapsed().as_micros() as u64; if collect_latency && batch_len > 0 { let per_order = (elapsed / batch_len as u64).max(1); @@ -811,6 +1082,7 @@ fn total_rejections(b: &RejectionBreakdown) -> usize { #[cfg(test)] mod tests { use super::*; + use tempfile::tempdir; #[test] fn classify_validation_rejection_reasons() { @@ -911,4 +1183,34 @@ mod tests { let err = decode_u256_bytes_to_u64(&[1, 2, 3]).expect_err("must reject short u256"); assert!(matches!(err, AedbError::Validation(_))); } + + #[tokio::test] + async fn simulation_can_run_on_existing_instance_without_tempdir_wrapper() { + let dir = tempdir().expect("temp dir"); + let db = Arc::new( + AedbInstance::open(high_throughput_simulation_config(), dir.path()).expect("open db"), + ); + let report = run_hft_simulation_on_instance( + SimulationConfig { + assets: vec!["BTC-USD".to_string()], + traders: 2, + 
ops_per_trader: 32, + seed: 42, + flow_profile: OrderFlowProfile::MixedMarketAndLimit, + table_profile: TableProfile::PerAssetTable, + collect_latency: true, + lifecycle_every_ops: 16, + orders_per_commit: 2, + match_workload: MatchWorkload::CrossingNearTouch, + }, + db, + "sim", + "app", + None, + ) + .await + .expect("simulation"); + assert!(report.simulation.attempted_orders > 0); + assert!(report.zero_dropped_orders); + } } diff --git a/src/backup/mod.rs b/src/backup/mod.rs index 448f2b8..2a02ff3 100644 --- a/src/backup/mod.rs +++ b/src/backup/mod.rs @@ -116,14 +116,14 @@ pub fn write_backup_archive( writer.write_all(&[archive_flags])?; writer.write_all(&salt)?; - for (idx, rel) in rel_files.iter().enumerate() { + for (entry_index, rel) in rel_files.iter().enumerate() { let resolved = resolve_backup_path(dir, rel)?; let raw = fs::read(&resolved)?; let compressed = zstd::stream::encode_all(raw.as_slice(), 3) .map_err(|e| AedbError::Encode(e.to_string()))?; let payload = if let Some(key) = encryption_key { - let nonce = derive_archive_nonce(&salt, idx as u64, rel); + let nonce = derive_archive_nonce(&salt, entry_index as u64, rel); encrypt_archive_payload(&compressed, key, &nonce)? } else { compressed @@ -172,7 +172,7 @@ pub fn extract_backup_archive( let mut salt = [0u8; 16]; reader.read_exact(&mut salt)?; - let mut idx = 0u64; + let mut entry_index = 0u64; loop { let entry = read_u8(&mut reader)?; if entry == BACKUP_ARCHIVE_ENTRY_END { @@ -205,7 +205,7 @@ pub fn extract_backup_archive( "backup archive missing encryption key".into(), )); }; - let expected_nonce = derive_archive_nonce(&salt, idx, &rel); + let expected_nonce = derive_archive_nonce(&salt, entry_index, &rel); decrypt_archive_payload(&payload, key, &expected_nonce)? 
} else { payload @@ -216,7 +216,7 @@ pub fn extract_backup_archive( let out = resolve_backup_output_path(dir, &rel)?; fs::write(out, bytes)?; - idx = idx.saturating_add(1); + entry_index = entry_index.saturating_add(1); } Ok(()) } diff --git a/src/bin/aedb.rs b/src/bin/aedb.rs index 04b9765..b4cacf2 100644 --- a/src/bin/aedb.rs +++ b/src/bin/aedb.rs @@ -162,9 +162,9 @@ fn cmd_check_invariants(args: &[String]) -> Result<(), String> { } fn parse_flag_value(args: &[String], flag: &str) -> Option { - for idx in 0..args.len() { - if args[idx] == flag { - return args.get(idx + 1).cloned(); + for arg_index in 0..args.len() { + if args[arg_index] == flag { + return args.get(arg_index + 1).cloned(); } } None diff --git a/src/checkpoint/loader.rs b/src/checkpoint/loader.rs index 57fd1ad..54d200f 100644 --- a/src/checkpoint/loader.rs +++ b/src/checkpoint/loader.rs @@ -69,11 +69,17 @@ pub fn load_checkpoint_with_key( } fn decrypt_checkpoint_payload(bytes: &[u8], key: &[u8; 32]) -> Result, AedbError> { - if bytes.len() < 8 + 12 { + const ENCRYPTED_MAGIC_SIZE_BYTES: usize = 8; + const NONCE_SIZE_BYTES: usize = 12; + let encrypted_header_size_bytes = ENCRYPTED_MAGIC_SIZE_BYTES + NONCE_SIZE_BYTES; + if bytes.len() < encrypted_header_size_bytes { return Err(AedbError::Decode("encrypted checkpoint too small".into())); } - let nonce = Nonce::from_slice(&bytes[8..20]); - let ciphertext = &bytes[20..]; + let nonce_offset_bytes = ENCRYPTED_MAGIC_SIZE_BYTES; + let ciphertext_offset_bytes = encrypted_header_size_bytes; + debug_assert!(ciphertext_offset_bytes <= bytes.len()); + let nonce = Nonce::from_slice(&bytes[nonce_offset_bytes..ciphertext_offset_bytes]); + let ciphertext = &bytes[ciphertext_offset_bytes..]; let cipher = Aes256Gcm::new_from_slice(key) .map_err(|e| AedbError::Validation(format!("invalid encryption key: {e}")))?; cipher @@ -185,6 +191,7 @@ mod tests { Some(&key), Some("k1".into()), std::collections::HashMap::new(), + 3, ) .expect("write"); 
assert_eq!(meta.key_id.as_deref(), Some("k1")); diff --git a/src/checkpoint/writer.rs b/src/checkpoint/writer.rs index 117be1b..2b377d9 100644 --- a/src/checkpoint/writer.rs +++ b/src/checkpoint/writer.rs @@ -37,7 +37,7 @@ pub fn write_checkpoint( seq: u64, dir: &Path, ) -> Result { - write_checkpoint_with_key(snapshot, catalog, seq, dir, None, None, HashMap::new()) + write_checkpoint_with_key(snapshot, catalog, seq, dir, None, None, HashMap::new(), 3) } pub fn write_checkpoint_with_key( @@ -48,6 +48,7 @@ pub fn write_checkpoint_with_key( encryption_key: Option<&[u8; 32]>, key_id: Option, idempotency: HashMap, + compression_level: i32, ) -> Result { fs::create_dir_all(dir)?; let checkpoint = CheckpointData { @@ -57,7 +58,7 @@ pub fn write_checkpoint_with_key( idempotency, }; let encoded = rmp_serde::to_vec(&checkpoint).map_err(|e| AedbError::Encode(e.to_string()))?; - let compressed = zstd::stream::encode_all(encoded.as_slice(), 3) + let compressed = zstd::stream::encode_all(encoded.as_slice(), compression_level) .map_err(|e| AedbError::Io(std::io::Error::other(e.to_string())))?; let created_at_micros = now_micros(); diff --git a/src/commit/apply.rs b/src/commit/apply.rs index e337581..4b7ba21 100644 --- a/src/commit/apply.rs +++ b/src/commit/apply.rs @@ -889,6 +889,7 @@ pub fn apply_mutation_trusted_if_eligible( const AUTHZ_AUDIT_TABLE: &str = "authz_audit"; const ASSERTION_AUDIT_TABLE: &str = "assertion_audit"; +const LIFECYCLE_OUTBOX_TABLE: &str = "lifecycle_outbox"; const SYSTEM_SCOPE_ID: &str = "app"; struct AuthzAuditContext<'a> { @@ -997,6 +998,11 @@ fn ensure_internal_audit_schema_for_upsert( && table_name == ASSERTION_AUDIT_TABLE { ensure_assertion_audit_schema(catalog)?; + } else if project_id == crate::catalog::SYSTEM_PROJECT_ID + && scope_id == SYSTEM_SCOPE_ID + && table_name == LIFECYCLE_OUTBOX_TABLE + { + ensure_lifecycle_outbox_schema(catalog)?; } Ok(()) } @@ -1154,6 +1160,52 @@ fn ensure_assertion_audit_schema(catalog: &mut Catalog) -> Result<(), 
AedbError> Ok(()) } +fn ensure_lifecycle_outbox_schema(catalog: &mut Catalog) -> Result<(), AedbError> { + ensure_system_project_scope(catalog); + let key = ( + namespace_key(crate::catalog::SYSTEM_PROJECT_ID, SYSTEM_SCOPE_ID), + LIFECYCLE_OUTBOX_TABLE.to_string(), + ); + if catalog.tables.contains_key(&key) { + return Ok(()); + } + catalog.tables.insert( + key, + TableSchema { + project_id: crate::catalog::SYSTEM_PROJECT_ID.to_string(), + scope_id: SYSTEM_SCOPE_ID.to_string(), + table_name: LIFECYCLE_OUTBOX_TABLE.to_string(), + owner_id: Some("system".to_string()), + columns: vec![ + ColumnDef { + name: "commit_seq".to_string(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "ts_micros".to_string(), + col_type: ColumnType::Timestamp, + nullable: false, + }, + ColumnDef { + name: "event_count".to_string(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "events".to_string(), + col_type: ColumnType::Json, + nullable: false, + }, + ], + primary_key: vec!["commit_seq".to_string()], + constraints: vec![], + foreign_keys: vec![], + }, + ); + Ok(()) +} + fn ensure_system_project_scope(catalog: &mut Catalog) { if !catalog .projects @@ -1419,14 +1471,14 @@ fn apply_upsert_once( fn extract_primary_key_from_row(schema: &TableSchema, row: &Row) -> Result, AedbError> { let mut primary_key = Vec::with_capacity(schema.primary_key.len()); for pk_name in &schema.primary_key { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *pk_name) .ok_or_else(|| { AedbError::Validation(format!("primary key column missing: {pk_name}")) })?; - let value = row.values.get(idx).ok_or_else(|| { + let value = row.values.get(column_index).ok_or_else(|| { AedbError::Validation(format!( "primary key column value missing from row: {pk_name}" )) @@ -1551,13 +1603,13 @@ fn lookup_existing_by_unique_index( fn apply_default_constraints(schema: &TableSchema, row: &mut Row) -> Result<(), AedbError> { for 
constraint in &schema.constraints { if let Constraint::Default { column, value } = constraint - && let Some(idx) = schema.columns.iter().position(|c| c.name == *column) + && let Some(column_index) = schema.columns.iter().position(|c| c.name == *column) && row .values - .get(idx) + .get(column_index) .is_some_and(|v| matches!(v, Value::Null)) { - row.values[idx] = value.clone(); + row.values[column_index] = value.clone(); } } Ok(()) @@ -1578,7 +1630,7 @@ fn validate_row_constraints( for constraint in &schema.constraints { match constraint { Constraint::NotNull { column } => { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *column) @@ -1586,7 +1638,7 @@ fn validate_row_constraints( table: table_name.to_string(), column: column.clone(), })?; - if matches!(row.values.get(idx), Some(Value::Null) | None) { + if matches!(row.values.get(column_index), Some(Value::Null) | None) { return Err(AedbError::NotNullViolation { table: table_name.to_string(), column: column.clone(), @@ -1716,7 +1768,7 @@ fn validate_foreign_keys( let mut values = Vec::with_capacity(fk.columns.len()); let mut any_null = false; for col in &fk.columns { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *col) @@ -1724,7 +1776,7 @@ fn validate_foreign_keys( table: schema.table_name.clone(), column: col.clone(), })?; - let value = row.values.get(idx).cloned().unwrap_or(Value::Null); + let value = row.values.get(column_index).cloned().unwrap_or(Value::Null); if matches!(value, Value::Null) { any_null = true; } @@ -1790,11 +1842,12 @@ fn validate_foreign_keys( let matched = ref_table.rows.iter().any(|(_pk, r)| { let mut same = true; for (i, ref_col) in fk.references_columns.iter().enumerate() { - let Some(idx) = ref_schema.columns.iter().position(|c| c.name == *ref_col) + let Some(reference_column_index) = + ref_schema.columns.iter().position(|c| c.name == *ref_col) else { return false; }; - if r.values.get(idx) != values.get(i) 
{ + if r.values.get(reference_column_index) != values.get(i) { same = false; break; } @@ -1927,13 +1980,13 @@ fn apply_update_where_internal( .map_err(|e| AedbError::Validation(format!("invalid predicate: {e:?}")))?; let mut update_indices = Vec::with_capacity(updates.len()); for (column, value) in updates { - let Some(idx) = schema.columns.iter().position(|c| c.name == *column) else { + let Some(column_index) = schema.columns.iter().position(|c| c.name == *column) else { return Err(AedbError::UnknownColumn { table: table_name.to_string(), column: column.clone(), }); }; - update_indices.push((idx, value.clone())); + update_indices.push((column_index, value.clone())); } let mut staged = Vec::new(); if let Some(table) = keyspace.table_by_namespace_key(&ns, table_name) { @@ -1943,8 +1996,8 @@ fn apply_update_where_internal( } let primary_key = extract_primary_key_from_row(&schema, row)?; let mut next_row = row.clone(); - for (idx, value) in &update_indices { - next_row.values[*idx] = value.clone(); + for (column_index, value) in &update_indices { + next_row.values[*column_index] = value.clone(); } staged.push((primary_key, next_row)); if limit.is_some_and(|max| staged.len() >= max) { @@ -1993,7 +2046,7 @@ fn apply_update_where_expr_internal( .map_err(|e| AedbError::Validation(format!("invalid predicate: {e:?}")))?; let mut update_indices = Vec::with_capacity(updates.len()); for (column, expr) in updates { - let Some(idx) = schema.columns.iter().position(|c| c.name == *column) else { + let Some(column_index) = schema.columns.iter().position(|c| c.name == *column) else { return Err(AedbError::UnknownColumn { table: table_name.to_string(), column: column.clone(), @@ -2015,7 +2068,7 @@ fn apply_update_where_expr_internal( ResolvedTableUpdateExpr::Coalesce(fallback.clone()) } }; - update_indices.push((idx, resolved)); + update_indices.push((column_index, resolved)); } let mut staged = Vec::new(); if let Some(table) = keyspace.table_by_namespace_key(&ns, table_name) { @@ 
-2025,9 +2078,9 @@ fn apply_update_where_expr_internal( } let primary_key = extract_primary_key_from_row(&schema, row)?; let mut next_row = row.clone(); - for (idx, expr) in &update_indices { - let next_value = evaluate_table_update_expr(expr, &next_row, *idx)?; - next_row.values[*idx] = next_value; + for (column_index, expr) in &update_indices { + let next_value = evaluate_table_update_expr(expr, &next_row, *column_index)?; + next_row.values[*column_index] = next_value; } staged.push((primary_key, next_row)); if limit.is_some_and(|max| staged.len() >= max) { @@ -2113,7 +2166,7 @@ fn handle_referencing_foreign_keys( } let mut referenced_vals = Vec::with_capacity(fk.references_columns.len()); for ref_col in &fk.references_columns { - let idx = catalog + let reference_column_index = catalog .tables .get(&(target_ns.clone(), ref_table_name.to_string())) .and_then(|s| s.columns.iter().position(|c| c.name == *ref_col)) @@ -2123,7 +2176,7 @@ fn handle_referencing_foreign_keys( fk.name )) })?; - referenced_vals.push(ref_row.values[idx].clone()); + referenced_vals.push(ref_row.values[reference_column_index].clone()); } let Some(dep_table) = keyspace .table_by_namespace_key(dep_ns, dep_table_name) @@ -2197,11 +2250,13 @@ fn handle_referencing_foreign_keys( .cloned() .ok_or_else(|| AedbError::Validation("dependent row missing".into()))?; for fk_col in &fk.columns { - if let Some(idx) = + if let Some(column_index) = dep_schema.columns.iter().position(|c| c.name == *fk_col) { match fk.on_delete { - ForeignKeyAction::SetNull => row.values[idx] = Value::Null, + ForeignKeyAction::SetNull => { + row.values[column_index] = Value::Null + } ForeignKeyAction::SetDefault => { if let Some(default_value) = dep_schema.constraints.iter().find_map(|c| match c { @@ -2213,9 +2268,9 @@ fn handle_referencing_foreign_keys( _ => None, }) { - row.values[idx] = default_value; + row.values[column_index] = default_value; } else { - row.values[idx] = Value::Null; + row.values[column_index] = 
Value::Null; } } _ => {} @@ -2333,11 +2388,11 @@ fn apply_upsert_on_conflict_once( match conflict_action { ConflictAction::DoNothing => {} ConflictAction::DoMerge => { - for idx in 0..final_row.values.len() { - if let Some(proposed) = row.values.get(idx) + for column_index in 0..final_row.values.len() { + if let Some(proposed) = row.values.get(column_index) && !matches!(proposed, Value::Null) { - final_row.values[idx] = proposed.clone(); + final_row.values[column_index] = proposed.clone(); } } } @@ -2445,20 +2500,20 @@ fn evaluate_update_expr( match expr { UpdateExpr::Value(v) => Ok(v), UpdateExpr::Existing(column) => { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == column) .ok_or_else(|| AedbError::Validation("update expr column missing".into()))?; - Ok(existing.values[idx].clone()) + Ok(existing.values[column_index].clone()) } UpdateExpr::Proposed(column) => { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == column) .ok_or_else(|| AedbError::Validation("update expr column missing".into()))?; - Ok(proposed.values[idx].clone()) + Ok(proposed.values[column_index].clone()) } UpdateExpr::AddI64 { existing_column, @@ -2518,13 +2573,13 @@ fn handle_referencing_foreign_keys_on_update( let mut old_values = Vec::with_capacity(fk.references_columns.len()); let mut new_values = Vec::with_capacity(fk.references_columns.len()); for ref_col in &fk.references_columns { - let idx = ref_schema + let reference_column_index = ref_schema .columns .iter() .position(|c| c.name == *ref_col) .ok_or_else(|| AedbError::Validation("referenced column missing".into()))?; - old_values.push(old_row.values[idx].clone()); - new_values.push(new_row.values[idx].clone()); + old_values.push(old_row.values[reference_column_index].clone()); + new_values.push(new_row.values[reference_column_index].clone()); } if old_values == new_values { continue; @@ -2588,16 +2643,18 @@ fn handle_referencing_foreign_keys_on_update( 
.cloned() .ok_or_else(|| AedbError::Validation("dependent row missing".into()))?; for (i, fk_col) in fk.columns.iter().enumerate() { - if let Some(idx) = + if let Some(column_index) = dep_schema.columns.iter().position(|c| c.name == *fk_col) { match fk.on_update { ForeignKeyAction::Cascade => { - row.values[idx] = new_values[i].clone() + row.values[column_index] = new_values[i].clone() + } + ForeignKeyAction::SetNull => { + row.values[column_index] = Value::Null } - ForeignKeyAction::SetNull => row.values[idx] = Value::Null, ForeignKeyAction::SetDefault => { - row.values[idx] = dep_schema + row.values[column_index] = dep_schema .constraints .iter() .find_map(|c| match c { @@ -2835,33 +2892,34 @@ fn maintain_secondary_indexes( { continue; } - let index = table - .indexes - .entry(idx_name.clone()) - .or_insert_with(|| SecondaryIndex { - store: match idx_def.index_type { - crate::catalog::schema::IndexType::BTree - | crate::catalog::schema::IndexType::Art => { - SecondaryIndexStore::BTree(im::OrdMap::new()) - } - crate::catalog::schema::IndexType::Hash => { - SecondaryIndexStore::Hash(im::HashMap::new()) - } - crate::catalog::schema::IndexType::UniqueHash => { - SecondaryIndexStore::UniqueHash(im::HashMap::new()) - } - }, - columns_bitmask: idx_def.columns_bitmask, - partial_filter: idx_def.partial_filter.clone(), - }); + let secondary_index = + table + .indexes + .entry(idx_name.clone()) + .or_insert_with(|| SecondaryIndex { + store: match idx_def.index_type { + crate::catalog::schema::IndexType::BTree + | crate::catalog::schema::IndexType::Art => { + SecondaryIndexStore::BTree(im::OrdMap::new()) + } + crate::catalog::schema::IndexType::Hash => { + SecondaryIndexStore::Hash(im::HashMap::new()) + } + crate::catalog::schema::IndexType::UniqueHash => { + SecondaryIndexStore::UniqueHash(im::HashMap::new()) + } + }, + columns_bitmask: idx_def.columns_bitmask, + partial_filter: idx_def.partial_filter.clone(), + }); if let Some(before) = old_row - && 
index.should_include_row(before, schema, table_name)? + && secondary_index.should_include_row(before, schema, table_name)? { let old_key = extract_index_key_encoded(before, schema, &idx_def.columns)?; - index.remove(&old_key, &encoded_pk); + secondary_index.remove(&old_key, &encoded_pk); } if let Some(after) = new_row - && index.should_include_row(after, schema, table_name)? + && secondary_index.should_include_row(after, schema, table_name)? { if matches!( idx_def.index_type, @@ -2874,7 +2932,7 @@ fn maintain_secondary_indexes( if matches!( idx_def.index_type, crate::catalog::schema::IndexType::UniqueHash - ) && index + ) && secondary_index .unique_existing(&new_key) .is_some_and(|existing| existing != encoded_pk) { @@ -2882,7 +2940,7 @@ fn maintain_secondary_indexes( "unique index violation on {idx_name}" ))); } - index.insert(new_key, encoded_pk.clone()); + secondary_index.insert(new_key, encoded_pk.clone()); } } Ok(()) @@ -2904,7 +2962,7 @@ fn rebuild_index_for_table( .ok_or_else(|| AedbError::Validation("table missing".into()))?; let table = keyspace.table_mut_by_namespace_key(&ns, table_name); - let mut index = crate::storage::keyspace::SecondaryIndex { + let mut secondary_index = crate::storage::keyspace::SecondaryIndex { store: match catalog .indexes .get(&(ns.clone(), table_name.to_string(), index_name.to_string())) @@ -2932,7 +2990,7 @@ fn rebuild_index_for_table( .and_then(|d| d.partial_filter.clone()), }; for (pk, row) in &table.rows { - if index.should_include_row(row, schema, table_name)? { + if secondary_index.should_include_row(row, schema, table_name)? 
{ if matches!( catalog .indexes @@ -2950,7 +3008,7 @@ fn rebuild_index_for_table( .get(&(ns.clone(), table_name.to_string(), index_name.to_string())) .map(|d| &d.index_type), Some(crate::catalog::schema::IndexType::UniqueHash) - ) && index + ) && secondary_index .unique_existing(&index_key) .is_some_and(|existing| existing != *pk) { @@ -2958,10 +3016,12 @@ fn rebuild_index_for_table( "unique index violation on {index_name}" ))); } - index.insert(index_key, pk.clone()); + secondary_index.insert(index_key, pk.clone()); } } - table.indexes.insert(index_name.to_string(), index); + table + .indexes + .insert(index_name.to_string(), secondary_index); Ok(()) } @@ -2971,12 +3031,12 @@ fn has_null_in_columns( columns: &[String], ) -> Result { for col in columns { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *col) .ok_or_else(|| AedbError::Validation(format!("column not found: {col}")))?; - if matches!(row.values.get(idx), Some(Value::Null) | None) { + if matches!(row.values.get(column_index), Some(Value::Null) | None) { return Ok(true); } } @@ -2991,14 +3051,14 @@ fn calculate_modified_columns_bitmask( match (old_row, new_row) { (Some(before), Some(after)) => { let mut mask = 0u128; - for idx in 0..schema.columns.len() { - if idx >= 128 { + for column_index in 0..schema.columns.len() { + if column_index >= 128 { break; } - let lhs = before.values.get(idx); - let rhs = after.values.get(idx); + let lhs = before.values.get(column_index); + let rhs = after.values.get(column_index); if lhs != rhs { - mask |= 1u128 << idx; + mask |= 1u128 << column_index; } } mask diff --git a/src/commit/assertions.rs b/src/commit/assertions.rs index b096206..c58a482 100644 --- a/src/commit/assertions.rs +++ b/src/commit/assertions.rs @@ -458,11 +458,11 @@ fn compare_col( } fn col_value<'a>(row: &'a Row, schema: &TableSchema, col: &str) -> Result<&'a Value, AedbError> { - let Some(idx) = schema.columns.iter().position(|c| c.name == col) else { + let 
Some(column_index) = schema.columns.iter().position(|c| c.name == col) else { return Err(AedbError::Validation(format!("column not found: {col}"))); }; row.values - .get(idx) + .get(column_index) .ok_or_else(|| AedbError::Validation(format!("column value missing: {col}"))) } diff --git a/src/commit/executor/global_index.rs b/src/commit/executor/global_index.rs index 094a9c0..f2696ff 100644 --- a/src/commit/executor/global_index.rs +++ b/src/commit/executor/global_index.rs @@ -34,9 +34,9 @@ impl GlobalUniqueIndexState { let mut state = Self { entries: HashMap::new(), }; - for (idx, schema) in global_unique_index_definitions(catalog) { - state.entries.entry(idx.clone()).or_default(); - let Some(by_value) = state.entries.get_mut(&idx) else { + for (index_key, schema) in global_unique_index_definitions(catalog) { + state.entries.entry(index_key.clone()).or_default(); + let Some(by_value) = state.entries.get_mut(&index_key) else { continue; }; for (ns_id, ns_data) in keyspace.namespaces.iter() { @@ -46,15 +46,16 @@ impl GlobalUniqueIndexState { if !ns_key.starts_with("_global::") { continue; } - let Some(table) = ns_data.tables.get(&idx.table_name) else { + let Some(table) = ns_data.tables.get(&index_key.table_name) else { continue; }; let Some(table_schema) = schema.get(&**ns_key) else { continue; }; for (pk, row) in &table.rows { - let index_key = extract_index_key_encoded(row, table_schema, &idx.columns)?; - by_value.insert(index_key, (ns_key.clone(), pk.clone())); + let encoded_index_key = + extract_index_key_encoded(row, table_schema, &index_key.columns)?; + by_value.insert(encoded_index_key, (ns_key.clone(), pk.clone())); } } } @@ -316,15 +317,16 @@ impl GlobalUniqueIndexState { let incoming_pk_encoded = EncodedKey::from_values(input.incoming_pk); let schema = table_schema_for(catalog, input.project_id, input.scope_id, input.table_name)?; let defs = defs_for_table(catalog, input.table_name); - for idx in defs { - let index_key = 
extract_index_key_encoded(input.incoming_row, &schema, &idx.columns)?; - let map = self.entries.entry(idx.clone()).or_default(); - if let Some((existing_ns, existing_pk)) = map.get(&index_key) + for index_key in defs { + let encoded_index_key = + extract_index_key_encoded(input.incoming_row, &schema, &index_key.columns)?; + let map = self.entries.entry(index_key.clone()).or_default(); + if let Some((existing_ns, existing_pk)) = map.get(&encoded_index_key) && !(existing_ns == ¤t_ns && existing_pk == &incoming_pk_encoded) { return Err(AedbError::Validation(format!( "global unique constraint violation on {} ({})", - input.table_name, idx.index_name + input.table_name, index_key.index_name ))); } @@ -334,10 +336,14 @@ impl GlobalUniqueIndexState { input.table_name, &incoming_pk_encoded, ) { - let previous_key = extract_index_key_encoded(existing_row, &schema, &idx.columns)?; + let previous_key = + extract_index_key_encoded(existing_row, &schema, &index_key.columns)?; map.remove(&previous_key); } - map.insert(index_key, (current_ns.clone(), incoming_pk_encoded.clone())); + map.insert( + encoded_index_key, + (current_ns.clone(), incoming_pk_encoded.clone()), + ); } Ok(()) } @@ -360,9 +366,9 @@ impl GlobalUniqueIndexState { return Ok(()); }; let schema = table_schema_for(catalog, project_id, scope_id, table_name)?; - for idx in defs_for_table(catalog, table_name) { - let key = extract_index_key_encoded(row, &schema, &idx.columns)?; - if let Some(map) = self.entries.get_mut(&idx) { + for index_key in defs_for_table(catalog, table_name) { + let key = extract_index_key_encoded(row, &schema, &index_key.columns)?; + if let Some(map) = self.entries.get_mut(&index_key) { map.remove(&key); } } @@ -451,16 +457,16 @@ fn global_unique_index_definitions( ) { continue; } - let idx = IndexKey { + let index_key = IndexKey { table_name: table_name.clone(), index_name: idx_name.clone(), columns: def.columns.clone(), }; let signature = format!( "{}:{}:{}", - idx.table_name, - 
idx.index_name, - idx.columns.join(",") + index_key.table_name, + index_key.index_name, + index_key.columns.join(",") ); if !seen.insert(signature) { continue; @@ -476,7 +482,7 @@ fn global_unique_index_definitions( } }) .collect::>(); - defs.push((idx, schemas)); + defs.push((index_key, schemas)); } defs } @@ -498,12 +504,12 @@ fn table_schema_for( fn extract_pk_from_row(schema: &TableSchema, row: &Row) -> Result, AedbError> { let mut pk = Vec::with_capacity(schema.primary_key.len()); for col in &schema.primary_key { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *col) .ok_or_else(|| AedbError::Validation(format!("primary key column missing: {col}")))?; - pk.push(row.values[idx].clone()); + pk.push(row.values[column_index].clone()); } Ok(pk) } diff --git a/src/commit/executor/internals.rs b/src/commit/executor/internals.rs index 9810da9..d7deb91 100644 --- a/src/commit/executor/internals.rs +++ b/src/commit/executor/internals.rs @@ -3,6 +3,7 @@ use super::*; use crate::catalog::SYSTEM_PROJECT_ID; use crate::commit::assertions::{evaluate_assertions, validate_assertions}; use crate::commit::tx::ReadAssertion; +use crate::lib_helpers::{ddl_would_apply, lifecycle_template_for_ddl}; use primitive_types::U256; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::mpsc as std_mpsc; @@ -334,6 +335,12 @@ pub(super) fn process_commit_epoch( let mut coordinator_apply_attempts = 0u64; let mut coordinator_apply_micros = 0u64; let mut read_set_conflicts = 0u64; + let mut wal_append_ops = 0u64; + let mut wal_append_bytes = 0u64; + let mut wal_append_micros = 0u64; + let mut wal_sync_ops = 0u64; + let mut wal_sync_micros = 0u64; + let mut sync_executed = false; let mut working_keyspace = state.keyspace.clone(); let mut working_catalog = state.catalog.clone(); let mut working_idempotency: Option> = None; @@ -435,10 +442,36 @@ pub(super) fn process_commit_epoch( continue; } } + let lifecycle_events = + match 
plan_lifecycle_outbox_events_for_mutations(&working_catalog, &mutations) { + Ok(events) => events, + Err(err) => { + outcomes.push(EpochOutcome { + request, + result: Err(err), + post_apply_delta: None, + }); + continue; + } + }; let snapshot_seq_before_commit = next_seq; next_seq = next_seq.saturating_add(1); let commit_seq = next_seq; + if !lifecycle_events.is_empty() { + match build_lifecycle_outbox_mutation(&lifecycle_events, commit_seq) { + Ok(mutation) => mutations.push(mutation), + Err(err) => { + outcomes.push(EpochOutcome { + request, + result: Err(err), + post_apply_delta: None, + }); + next_seq = next_seq.saturating_sub(1); + continue; + } + } + } let requires_coordinator = request_requires_coordinator(&working_catalog, &request, &mutations); let is_cross_partition = is_cross_partition_request(&request); @@ -594,7 +627,14 @@ pub(super) fn process_commit_epoch( coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + wal_append_ops, + wal_append_bytes, + wal_append_micros, + wal_sync_ops, + wal_sync_micros, + sync_executed, catalog_changed, + ..EpochProcessResult::default() }; } @@ -627,11 +667,18 @@ pub(super) fn process_commit_epoch( coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + wal_append_ops, + wal_append_bytes, + wal_append_micros, + wal_sync_ops, + wal_sync_micros, + sync_executed, catalog_changed, + ..EpochProcessResult::default() }; } - let mut wal_bytes = 0usize; + let mut wal_payload_size_bytes = 0usize; let requires_sync = sequenced .iter() .any(|c| matches!(c.request.envelope.write_class, WriteClass::Economic)) @@ -657,7 +704,9 @@ pub(super) fn process_commit_epoch( internal_idx += 1; (c.seq, c.commit_ts_micros, c.payload_type, &c.payload) }; - wal_bytes = wal_bytes.saturating_add(payload.len()); + let payload_size_bytes = payload.len(); + wal_payload_size_bytes = wal_payload_size_bytes.saturating_add(payload_size_bytes); + let append_started = Instant::now(); if let Err(err) = state .wal 
.append_frame_with_sync(seq, ts, payload_type, payload, false) @@ -682,33 +731,57 @@ pub(super) fn process_commit_epoch( coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + wal_append_ops, + wal_append_bytes, + wal_append_micros, + wal_sync_ops, + wal_sync_micros, + sync_executed, catalog_changed, + ..EpochProcessResult::default() }; } + wal_append_ops = wal_append_ops.saturating_add(1); + wal_append_bytes = wal_append_bytes.saturating_add(payload_size_bytes as u64); + wal_append_micros = + wal_append_micros.saturating_add(append_started.elapsed().as_micros() as u64); } - if requires_sync && let Err(err) = state.wal.sync_active() { - let err = AedbError::Io(std::io::Error::other(err.to_string())); - overwrite_assertion_failures_with_wal_error( - &mut outcomes, - &err, - "epoch aborted during WAL sync", - ); - for failed in sequenced { - outcomes.push(EpochOutcome { - request: failed.request, - result: Err(AedbError::Validation(format!( - "epoch aborted during WAL sync: {err}" - ))), - post_apply_delta: None, - }); + if requires_sync { + let sync_started = Instant::now(); + if let Err(err) = state.wal.sync_active() { + let err = AedbError::Io(std::io::Error::other(err.to_string())); + overwrite_assertion_failures_with_wal_error( + &mut outcomes, + &err, + "epoch aborted during WAL sync", + ); + for failed in sequenced { + outcomes.push(EpochOutcome { + request: failed.request, + result: Err(AedbError::Validation(format!( + "epoch aborted during WAL sync: {err}" + ))), + post_apply_delta: None, + }); + } + return EpochProcessResult { + outcomes, + coordinator_apply_attempts, + coordinator_apply_micros, + read_set_conflicts, + wal_append_ops, + wal_append_bytes, + wal_append_micros, + wal_sync_ops, + wal_sync_micros, + sync_executed, + catalog_changed, + ..EpochProcessResult::default() + }; } - return EpochProcessResult { - outcomes, - coordinator_apply_attempts, - coordinator_apply_micros, - read_set_conflicts, - catalog_changed, - }; + 
wal_sync_ops = wal_sync_ops.saturating_add(1); + wal_sync_micros = wal_sync_micros.saturating_add(sync_started.elapsed().as_micros() as u64); + sync_executed = true; } let last_user_seq = sequenced.last().map(|c| c.seq).unwrap_or(state.current_seq); @@ -717,6 +790,10 @@ pub(super) fn process_commit_epoch( .map(|c| c.seq) .unwrap_or(state.current_seq); let last_seq = last_user_seq.max(last_internal_seq); + debug_assert!( + last_seq >= state.current_seq, + "commit seq must be monotonic across epochs" + ); state.keyspace = working_keyspace; state.catalog = working_catalog; state.global_unique_index = working_global_unique_index; @@ -737,19 +814,30 @@ pub(super) fn process_commit_epoch( state.pending_batch_bytes = 0; state.pending_batch_max_seq = state.durable_head_seq; } else { - state.pending_batch_bytes = state.pending_batch_bytes.saturating_add(wal_bytes); + state.pending_batch_bytes = state + .pending_batch_bytes + .saturating_add(wal_payload_size_bytes); state.pending_batch_max_seq = last_seq; - if state.pending_batch_bytes >= state.config.batch_max_bytes - && state.wal.sync_active().is_ok() - { - state.durable_head_seq = state.pending_batch_max_seq; - state.pending_batch_bytes = 0; - state.pending_batch_max_seq = state.durable_head_seq; + if state.pending_batch_bytes >= state.config.batch_max_bytes { + let sync_started = Instant::now(); + if state.wal.sync_active().is_ok() { + wal_sync_ops = wal_sync_ops.saturating_add(1); + wal_sync_micros = wal_sync_micros + .saturating_add(sync_started.elapsed().as_micros() as u64); + sync_executed = true; + state.durable_head_seq = state.pending_batch_max_seq; + state.pending_batch_bytes = 0; + state.pending_batch_max_seq = state.durable_head_seq; + } } } } DurabilityMode::OsBuffered => {} } + debug_assert!( + state.durable_head_seq <= state.visible_head_seq, + "durable head cannot exceed visible head" + ); prune_idempotency(state); for commit in &sequenced { @@ -810,7 +898,14 @@ pub(super) fn process_commit_epoch( 
coordinator_apply_attempts, coordinator_apply_micros, read_set_conflicts, + wal_append_ops, + wal_append_bytes, + wal_append_micros, + wal_sync_ops, + wal_sync_micros, + sync_executed, catalog_changed, + ..EpochProcessResult::default() } } @@ -844,6 +939,62 @@ fn is_read_set_conflict_error(err: &AedbError) -> bool { const ASSERTION_AUDIT_TABLE: &str = "assertion_audit"; const ASSERTION_AUDIT_SCOPE_ID: &str = "app"; +const LIFECYCLE_OUTBOX_TABLE: &str = "lifecycle_outbox"; +const LIFECYCLE_OUTBOX_SCOPE_ID: &str = "app"; + +fn plan_lifecycle_outbox_events_for_mutations( + catalog: &Catalog, + mutations: &[Mutation], +) -> Result, AedbError> { + if !mutations.iter().any(|m| matches!(m, Mutation::Ddl(_))) { + return Ok(Vec::new()); + } + let mut planned_catalog = catalog.clone(); + let mut events = Vec::new(); + for mutation in mutations { + let Mutation::Ddl(op) = mutation else { + continue; + }; + let applied = ddl_would_apply(&planned_catalog, op); + planned_catalog.apply_ddl(op.clone())?; + if applied && let Some(event) = lifecycle_template_for_ddl(op) { + events.push(event); + } + } + Ok(events) +} + +fn build_lifecycle_outbox_mutation( + templates: &[crate::lib_helpers::LifecycleEventTemplate], + lifecycle_commit_seq: u64, +) -> Result { + if templates.is_empty() { + return Err(AedbError::Validation( + "lifecycle outbox mutation requires at least one event".into(), + )); + } + let events: Vec = templates + .iter() + .cloned() + .map(|t| t.with_seq(lifecycle_commit_seq)) + .collect(); + let events_json = + serde_json::to_string(&events).map_err(|e| AedbError::Encode(e.to_string()))?; + + let ts_micros = now_micros(); + Ok(Mutation::Upsert { + project_id: SYSTEM_PROJECT_ID.to_string(), + scope_id: LIFECYCLE_OUTBOX_SCOPE_ID.to_string(), + table_name: LIFECYCLE_OUTBOX_TABLE.to_string(), + primary_key: vec![Value::Integer(lifecycle_commit_seq as i64)], + row: Row::from_values(vec![ + Value::Integer(lifecycle_commit_seq as i64), + Value::Timestamp(ts_micros as i64), + 
Value::Integer(events.len() as i64), + Value::Json(events_json.into()), + ]), + }) +} fn build_assertion_audit_commit( envelope: &TransactionEnvelope, @@ -977,9 +1128,9 @@ pub(super) fn apply_deferred_parallel_single_partition_commits( let mut receivers = Vec::with_capacity(deferred_indexes.len()); let mut cancellations = Vec::with_capacity(deferred_indexes.len()); let backend = keyspace.primary_index_backend; - for idx in deferred_indexes { + for deferred_index in deferred_indexes { let commit = sequenced - .get(*idx) + .get(*deferred_index) .expect("deferred index must reference sequenced commit"); let seq = commit.seq; let mutations = commit.delta.mutations.clone(); @@ -1601,12 +1752,12 @@ pub(super) fn extract_pk_from_row( ) -> Result, AedbError> { let mut pk = Vec::with_capacity(schema.primary_key.len()); for col in &schema.primary_key { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *col) .ok_or_else(|| AedbError::Validation(format!("primary key column missing: {col}")))?; - pk.push(row.values[idx].clone()); + pk.push(row.values[column_index].clone()); } Ok(pk) } @@ -2307,11 +2458,13 @@ fn augment_mutations_with_caller(mutations: &mut [Mutation], caller: Option<&Cal } pub(super) fn prune_idempotency(state: &mut ExecutorState) { - let now_micros = now_micros(); - let window = state.config.idempotency_window_seconds * 1_000_000; - state - .idempotency - .retain(|_, rec| now_micros.saturating_sub(rec.recorded_at_micros) <= window); + let window_commits = state.config.idempotency_window_commits; + if window_commits == 0 { + state.idempotency.clear(); + return; + } + let min_seq = state.current_seq.saturating_sub(window_commits); + state.idempotency.retain(|_, rec| rec.commit_seq > min_seq); } pub(super) fn now_micros() -> u64 { @@ -2770,12 +2923,12 @@ pub(super) fn project_row( ) -> Result { let mut values = Vec::with_capacity(projected_columns.len()); for col in projected_columns { - let idx = schema + let column_index = 
schema .columns .iter() .position(|c| c.name == *col) .ok_or_else(|| AedbError::Validation(format!("projection column missing: {col}")))?; - values.push(row.values[idx].clone()); + values.push(row.values[column_index].clone()); } Ok(crate::catalog::types::Row { values }) } diff --git a/src/commit/executor/mod.rs b/src/commit/executor/mod.rs index ca257e4..738a82f 100644 --- a/src/commit/executor/mod.rs +++ b/src/commit/executor/mod.rs @@ -30,7 +30,7 @@ use std::sync::Arc; use std::sync::Mutex as StdMutex; use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH}; -use tokio::sync::{Mutex, mpsc as tokio_mpsc, oneshot}; +use tokio::sync::{Mutex, Notify, mpsc as tokio_mpsc, oneshot}; use tokio::task::JoinHandle; use tracing::warn; @@ -97,6 +97,12 @@ struct EpochProcessResult { coordinator_apply_attempts: u64, coordinator_apply_micros: u64, read_set_conflicts: u64, + wal_append_ops: u64, + wal_append_bytes: u64, + wal_append_micros: u64, + wal_sync_ops: u64, + wal_sync_micros: u64, + sync_executed: bool, catalog_changed: bool, } @@ -125,6 +131,13 @@ pub struct CommitExecutor { ingress_txs: Vec>, config: Arc, state: Arc>, + durable_notify: Arc, + current_seq: Arc, + visible_head_seq: Arc, + durable_head_seq: Arc, + start_instant: Instant, + last_wal_sync_elapsed_us: Arc, + last_full_snapshot_micros: Arc, queued_bytes: Arc, telemetry: Arc, background_tasks: Arc>>>, @@ -136,6 +149,8 @@ struct ExecutorTelemetry { queued_commits: AtomicUsize, commits_total: AtomicU64, commit_errors: AtomicU64, + permission_rejections: AtomicU64, + validation_rejections: AtomicU64, queue_full_rejections: AtomicU64, timeout_rejections: AtomicU64, conflict_rejections: AtomicU64, @@ -147,6 +162,15 @@ struct ExecutorTelemetry { read_set_conflicts: AtomicU64, coordinator_apply_attempts: AtomicU64, coordinator_apply_micros: AtomicU64, + wal_append_ops: AtomicU64, + wal_append_bytes: AtomicU64, + wal_append_micros: AtomicU64, + 
wal_sync_ops: AtomicU64, + wal_sync_micros: AtomicU64, + prestage_validate_ops: AtomicU64, + prestage_validate_micros: AtomicU64, + epoch_process_ops: AtomicU64, + epoch_process_micros: AtomicU64, parallel_runtime_queue_depth: AtomicUsize, adaptive_epoch_min_commits: AtomicUsize, adaptive_epoch_max_wait_us: AtomicU64, @@ -160,6 +184,8 @@ pub struct ExecutorMetrics { pub queued_bytes: usize, pub commits_total: u64, pub commit_errors: u64, + pub permission_rejections: u64, + pub validation_rejections: u64, pub queue_full_rejections: u64, pub timeout_rejections: u64, pub conflict_rejections: u64, @@ -171,6 +197,15 @@ pub struct ExecutorMetrics { pub read_set_conflicts: u64, pub coordinator_apply_attempts: u64, pub avg_coordinator_apply_micros: u64, + pub wal_append_ops: u64, + pub wal_append_bytes: u64, + pub avg_wal_append_micros: u64, + pub wal_sync_ops: u64, + pub avg_wal_sync_micros: u64, + pub prestage_validate_ops: u64, + pub avg_prestage_validate_micros: u64, + pub epoch_process_ops: u64, + pub avg_epoch_process_micros: u64, pub parallel_runtime_queue_depth: usize, pub adaptive_epoch_min_commits: usize, pub adaptive_epoch_max_wait_us: u64, @@ -231,6 +266,7 @@ impl CommitExecutor { .map_err(|e| AedbError::Io(std::io::Error::other(e.to_string())))?; let config = Arc::new(config); + let initial_last_full_snapshot_micros = now_micros(); let state = Arc::new(Mutex::new(ExecutorState { keyspace, catalog, @@ -247,12 +283,19 @@ impl CommitExecutor { global_unique_index, idempotency, version_store, - last_full_snapshot_micros: now_micros(), + last_full_snapshot_micros: initial_last_full_snapshot_micros, last_memory_estimate_micros: 0, })); let (apply_tx, mut rx) = tokio_mpsc::channel::(max_inflight_commits); let queued_bytes = Arc::new(AtomicUsize::new(0)); let telemetry = Arc::new(ExecutorTelemetry::default()); + let durable_notify = Arc::new(Notify::new()); + let current_seq_atomic = Arc::new(AtomicU64::new(current_seq)); + let visible_head_seq = 
Arc::new(AtomicU64::new(current_seq)); + let durable_head_seq = Arc::new(AtomicU64::new(current_seq)); + let start_instant = Instant::now(); + let last_wal_sync_elapsed_us = Arc::new(AtomicU64::new(0)); + let last_full_snapshot_micros = Arc::new(AtomicU64::new(initial_last_full_snapshot_micros)); let background_tasks = Arc::new(StdMutex::new(Vec::new())); telemetry .adaptive_epoch_min_commits @@ -293,6 +336,13 @@ impl CommitExecutor { let loop_post_txs = post_apply_txs.clone(); let queue_counter = Arc::clone(&queued_bytes); let loop_telemetry = Arc::clone(&telemetry); + let loop_durable_notify = Arc::clone(&durable_notify); + let loop_current_seq = Arc::clone(¤t_seq_atomic); + let loop_visible_head = Arc::clone(&visible_head_seq); + let loop_durable_head = Arc::clone(&durable_head_seq); + let loop_last_wal_sync_elapsed_us = Arc::clone(&last_wal_sync_elapsed_us); + let loop_start_instant = start_instant; + let loop_last_full_snapshot_micros = Arc::clone(&last_full_snapshot_micros); let apply_handle = tokio::spawn(async move { let mut pending = VecDeque::new(); let mut ingress_closed = false; @@ -326,8 +376,16 @@ impl CommitExecutor { ) .await; let mut s = loop_state.lock().await; + let durable_before = s.durable_head_seq; let epoch_started = Instant::now(); let epoch_result = process_commit_epoch(&mut s, epoch_requests); + loop_telemetry + .epoch_process_ops + .fetch_add(1, Ordering::Relaxed); + loop_telemetry.epoch_process_micros.fetch_add( + epoch_started.elapsed().as_micros() as u64, + Ordering::Relaxed, + ); let catalog_changed = epoch_result.catalog_changed; let outcomes = epoch_result.outcomes; let had_error = outcomes.iter().any(|o| o.result.is_err()); @@ -360,6 +418,26 @@ impl CommitExecutor { loop_telemetry .coordinator_apply_micros .fetch_add(epoch_result.coordinator_apply_micros, Ordering::Relaxed); + loop_telemetry + .wal_append_ops + .fetch_add(epoch_result.wal_append_ops, Ordering::Relaxed); + loop_telemetry + .wal_append_bytes + 
.fetch_add(epoch_result.wal_append_bytes, Ordering::Relaxed); + loop_telemetry + .wal_append_micros + .fetch_add(epoch_result.wal_append_micros, Ordering::Relaxed); + loop_telemetry + .wal_sync_ops + .fetch_add(epoch_result.wal_sync_ops, Ordering::Relaxed); + loop_telemetry + .wal_sync_micros + .fetch_add(epoch_result.wal_sync_micros, Ordering::Relaxed); + if epoch_result.sync_executed { + let elapsed = loop_start_instant.elapsed().as_micros() as u64; + loop_last_wal_sync_elapsed_us + .store(elapsed.saturating_add(1), Ordering::Relaxed); + } if had_error { loop_telemetry .epoch_failures @@ -368,7 +446,16 @@ impl CommitExecutor { if catalog_changed { *loop_validation_catalog.write() = s.catalog.clone(); } + loop_current_seq.store(s.current_seq, Ordering::Release); + loop_visible_head.store(s.visible_head_seq, Ordering::Release); + loop_durable_head.store(s.durable_head_seq, Ordering::Release); + loop_last_full_snapshot_micros + .store(s.last_full_snapshot_micros, Ordering::Release); + let durable_advanced = s.durable_head_seq > durable_before; drop(s); + if durable_advanced { + loop_durable_notify.notify_waiters(); + } for outcome in outcomes { if let Some(delta) = outcome.post_apply_delta { @@ -387,6 +474,16 @@ impl CommitExecutor { if outcome.result.is_err() { loop_telemetry.commit_errors.fetch_add(1, Ordering::Relaxed); if let Err(err) = &outcome.result { + if is_permission_rejection_error(err) { + loop_telemetry + .permission_rejections + .fetch_add(1, Ordering::Relaxed); + } + if is_validation_rejection_error(err) { + loop_telemetry + .validation_rejections + .fetch_add(1, Ordering::Relaxed); + } if is_conflict_rejection_error(err) { loop_telemetry .conflict_rejections @@ -446,6 +543,7 @@ impl CommitExecutor { let pre_telemetry = Arc::clone(&telemetry); let handle = tokio::spawn(async move { while let Some(mut req) = ingress_rx.recv().await { + let mut prevalidate_elapsed_us = None; let (write_partitions, read_partitions) = if req.prevalidated { let 
write_partitions = if req .envelope @@ -464,10 +562,25 @@ impl CommitExecutor { }; (write_partitions, derive_read_partitions(&req.envelope)) } else { - match pre_stage_validate(&pre_validation_catalog, &req.envelope).await { + let prevalidate_started = Instant::now(); + let result = + pre_stage_validate(&pre_validation_catalog, &req.envelope).await; + prevalidate_elapsed_us = + Some(prevalidate_started.elapsed().as_micros() as u64); + match result { Ok(partitions) => partitions, Err(e) => { pre_telemetry.commit_errors.fetch_add(1, Ordering::Relaxed); + if is_permission_rejection_error(&e) { + pre_telemetry + .permission_rejections + .fetch_add(1, Ordering::Relaxed); + } + if is_validation_rejection_error(&e) { + pre_telemetry + .validation_rejections + .fetch_add(1, Ordering::Relaxed); + } pre_queue_counter.fetch_sub(req.encoded_len, Ordering::Relaxed); pre_telemetry.queued_commits.fetch_sub(1, Ordering::Relaxed); let _ = req.result_tx.send(Err(e)); @@ -475,6 +588,14 @@ impl CommitExecutor { } } }; + if let Some(elapsed_us) = prevalidate_elapsed_us { + pre_telemetry + .prestage_validate_ops + .fetch_add(1, Ordering::Relaxed); + pre_telemetry + .prestage_validate_micros + .fetch_add(elapsed_us, Ordering::Relaxed); + } if write_partitions.is_empty() { pre_telemetry.commit_errors.fetch_add(1, Ordering::Relaxed); pre_queue_counter.fetch_sub(req.encoded_len, Ordering::Relaxed); @@ -510,6 +631,11 @@ impl CommitExecutor { let flush_state = Arc::clone(&state); let flush_config = Arc::clone(&config); + let flush_telemetry = Arc::clone(&telemetry); + let flush_durable_notify = Arc::clone(&durable_notify); + let flush_durable_head = Arc::clone(&durable_head_seq); + let flush_last_wal_sync_elapsed_us = Arc::clone(&last_wal_sync_elapsed_us); + let flush_start_instant = start_instant; let flush_handle = tokio::spawn(async move { loop { let sleep_ms = flush_config.batch_interval_ms; @@ -518,10 +644,23 @@ impl CommitExecutor { if flush_config.durability_mode != 
DurabilityMode::Batch { continue; } - if s.pending_batch_bytes > 0 && s.wal.sync_active().is_ok() { - s.durable_head_seq = s.pending_batch_max_seq.max(s.durable_head_seq); - s.pending_batch_bytes = 0; - s.pending_batch_max_seq = s.durable_head_seq; + if s.pending_batch_bytes > 0 { + let sync_started = Instant::now(); + if s.wal.sync_active().is_ok() { + let sync_us = sync_started.elapsed().as_micros() as u64; + s.durable_head_seq = s.pending_batch_max_seq.max(s.durable_head_seq); + s.pending_batch_bytes = 0; + s.pending_batch_max_seq = s.durable_head_seq; + flush_durable_head.store(s.durable_head_seq, Ordering::Release); + flush_telemetry.wal_sync_ops.fetch_add(1, Ordering::Relaxed); + flush_telemetry + .wal_sync_micros + .fetch_add(sync_us, Ordering::Relaxed); + let elapsed = flush_start_instant.elapsed().as_micros() as u64; + flush_last_wal_sync_elapsed_us + .store(elapsed.saturating_add(1), Ordering::Relaxed); + flush_durable_notify.notify_waiters(); + } } prune_idempotency(&mut s); } @@ -550,6 +689,13 @@ impl CommitExecutor { ingress_txs, config, state, + durable_notify, + current_seq: current_seq_atomic, + visible_head_seq, + durable_head_seq, + start_instant, + last_wal_sync_elapsed_us, + last_full_snapshot_micros, queued_bytes, telemetry, background_tasks, @@ -652,16 +798,16 @@ impl CommitExecutor { ) -> Result { let config = &self.config; let encoded = rmp_serde::to_vec(&envelope).map_err(|e| AedbError::Encode(e.to_string()))?; - if encoded.len() > config.max_transaction_bytes { + let encoded_size_bytes = encoded.len(); + if encoded_size_bytes > config.max_transaction_bytes { return Err(AedbError::Validation( "transaction exceeds max_transaction_bytes".into(), )); } - let len = encoded.len(); let mut current = self.queued_bytes.load(Ordering::Relaxed); loop { - let next = current.saturating_add(len); - if next > config.max_commit_queue_bytes { + let next_queued_size_bytes = current.saturating_add(encoded_size_bytes); + if next_queued_size_bytes > 
config.max_commit_queue_bytes { self.telemetry .queue_full_rejections .fetch_add(1, Ordering::Relaxed); @@ -669,7 +815,7 @@ impl CommitExecutor { } match self.queued_bytes.compare_exchange_weak( current, - next, + next_queued_size_bytes, Ordering::Relaxed, Ordering::Relaxed, ) { @@ -691,7 +837,7 @@ impl CommitExecutor { std::time::Duration::from_millis(config.commit_timeout_ms), ingress_tx.send(CommitRequest { envelope, - encoded_len: len, + encoded_len: encoded_size_bytes, enqueue_micros: now_micros(), prevalidated, assertions_engine_verified, @@ -705,14 +851,16 @@ impl CommitExecutor { match send_result { Ok(Ok(())) => {} Ok(Err(e)) => { - self.queued_bytes.fetch_sub(len, Ordering::Relaxed); + self.queued_bytes + .fetch_sub(encoded_size_bytes, Ordering::Relaxed); self.telemetry .queued_commits .fetch_sub(1, Ordering::Relaxed); return Err(AedbError::Validation(format!("commit queue closed: {e}"))); } Err(_) => { - self.queued_bytes.fetch_sub(len, Ordering::Relaxed); + self.queued_bytes + .fetch_sub(encoded_size_bytes, Ordering::Relaxed); self.telemetry .queued_commits .fetch_sub(1, Ordering::Relaxed); @@ -742,22 +890,31 @@ impl CommitExecutor { } pub async fn current_seq(&self) -> u64 { - self.state.lock().await.current_seq + self.current_seq.load(Ordering::Acquire) } pub async fn visible_head_seq(&self) -> u64 { - self.state.lock().await.visible_head_seq + self.visible_head_seq_now() + } + + #[inline] + pub fn visible_head_seq_now(&self) -> u64 { + self.visible_head_seq.load(Ordering::Acquire) } pub async fn durable_head_seq(&self) -> u64 { - self.state.lock().await.durable_head_seq + self.durable_head_seq_now() + } + + #[inline] + pub fn durable_head_seq_now(&self) -> u64 { + self.durable_head_seq.load(Ordering::Acquire) } pub async fn head_state(&self) -> HeadState { - let s = self.state.lock().await; HeadState { - visible_head_seq: s.visible_head_seq, - durable_head_seq: s.durable_head_seq, + visible_head_seq: self.visible_head_seq.load(Ordering::Acquire), + 
durable_head_seq: self.durable_head_seq.load(Ordering::Acquire), } } @@ -790,22 +947,54 @@ impl CommitExecutor { pub async fn wait_for_durable(&self, seq: u64) -> Result<(), AedbError> { loop { - if self.durable_head_seq().await >= seq { + let notified = self.durable_notify.notified(); + if self.durable_head_seq_now() >= seq { return Ok(()); } - tokio::time::sleep(std::time::Duration::from_millis(1)).await; + notified.await; } } pub async fn force_fsync(&self) -> Result { let mut s = self.state.lock().await; + if s.pending_batch_bytes == 0 && s.durable_head_seq >= s.visible_head_seq { + return Ok(s.durable_head_seq); + } + let durable_before = s.durable_head_seq; + let sync_started = Instant::now(); s.wal .sync_active() .map_err(|e| AedbError::Io(std::io::Error::other(e.to_string())))?; + let sync_us = sync_started.elapsed().as_micros() as u64; s.durable_head_seq = s.visible_head_seq; s.pending_batch_bytes = 0; s.pending_batch_max_seq = s.durable_head_seq; - Ok(s.durable_head_seq) + self.durable_head_seq + .store(s.durable_head_seq, Ordering::Release); + self.telemetry.wal_sync_ops.fetch_add(1, Ordering::Relaxed); + self.telemetry + .wal_sync_micros + .fetch_add(sync_us, Ordering::Relaxed); + let elapsed = self.start_instant.elapsed().as_micros() as u64; + self.last_wal_sync_elapsed_us + .store(elapsed.saturating_add(1), Ordering::Relaxed); + let durable = s.durable_head_seq; + let durable_advanced = durable > durable_before; + drop(s); + if durable_advanced { + self.durable_notify.notify_waiters(); + } + Ok(durable) + } + + #[inline] + pub fn last_wal_sync_age_us(&self) -> Option { + let last_plus_one = self.last_wal_sync_elapsed_us.load(Ordering::Relaxed); + if last_plus_one == 0 { + return None; + } + let elapsed = self.start_instant.elapsed().as_micros() as u64; + Some(elapsed.saturating_sub(last_plus_one - 1)) } pub async fn idempotency_snapshot(&self) -> HashMap { @@ -834,12 +1023,46 @@ impl CommitExecutor { } else { coordinator_apply_micros / 
coordinator_apply_attempts }; + let wal_sync_ops = self.telemetry.wal_sync_ops.load(Ordering::Relaxed); + let wal_sync_micros = self.telemetry.wal_sync_micros.load(Ordering::Relaxed); + let wal_append_ops = self.telemetry.wal_append_ops.load(Ordering::Relaxed); + let wal_append_bytes = self.telemetry.wal_append_bytes.load(Ordering::Relaxed); + let wal_append_micros = self.telemetry.wal_append_micros.load(Ordering::Relaxed); + let avg_wal_append_micros = if wal_append_ops == 0 { + 0 + } else { + wal_append_micros / wal_append_ops + }; + let avg_wal_sync_micros = if wal_sync_ops == 0 { + 0 + } else { + wal_sync_micros / wal_sync_ops + }; + let prestage_validate_ops = self.telemetry.prestage_validate_ops.load(Ordering::Relaxed); + let prestage_validate_micros = self + .telemetry + .prestage_validate_micros + .load(Ordering::Relaxed); + let avg_prestage_validate_micros = if prestage_validate_ops == 0 { + 0 + } else { + prestage_validate_micros / prestage_validate_ops + }; + let epoch_process_ops = self.telemetry.epoch_process_ops.load(Ordering::Relaxed); + let epoch_process_micros = self.telemetry.epoch_process_micros.load(Ordering::Relaxed); + let avg_epoch_process_micros = if epoch_process_ops == 0 { + 0 + } else { + epoch_process_micros / epoch_process_ops + }; ExecutorMetrics { inflight_commits: self.telemetry.inflight_commits.load(Ordering::Relaxed), queued_commits: self.telemetry.queued_commits.load(Ordering::Relaxed), queued_bytes: self.queued_bytes.load(Ordering::Relaxed), commits_total, commit_errors: self.telemetry.commit_errors.load(Ordering::Relaxed), + permission_rejections: self.telemetry.permission_rejections.load(Ordering::Relaxed), + validation_rejections: self.telemetry.validation_rejections.load(Ordering::Relaxed), queue_full_rejections: self.telemetry.queue_full_rejections.load(Ordering::Relaxed), timeout_rejections: self.telemetry.timeout_rejections.load(Ordering::Relaxed), conflict_rejections: 
self.telemetry.conflict_rejections.load(Ordering::Relaxed), @@ -857,6 +1080,15 @@ impl CommitExecutor { read_set_conflicts: self.telemetry.read_set_conflicts.load(Ordering::Relaxed), coordinator_apply_attempts, avg_coordinator_apply_micros, + wal_append_ops, + wal_append_bytes, + avg_wal_append_micros, + wal_sync_ops, + avg_wal_sync_micros, + prestage_validate_ops, + avg_prestage_validate_micros, + epoch_process_ops, + avg_epoch_process_micros, parallel_runtime_queue_depth: self .telemetry .parallel_runtime_queue_depth @@ -877,12 +1109,11 @@ impl CommitExecutor { } pub async fn runtime_state_metrics(&self) -> ExecutorRuntimeState { - let s = self.state.lock().await; ExecutorRuntimeState { - current_seq: s.current_seq, - visible_head_seq: s.visible_head_seq, - durable_head_seq: s.durable_head_seq, - last_full_snapshot_micros: s.last_full_snapshot_micros, + current_seq: self.current_seq.load(Ordering::Acquire), + visible_head_seq: self.visible_head_seq.load(Ordering::Acquire), + durable_head_seq: self.durable_head_seq.load(Ordering::Acquire), + last_full_snapshot_micros: self.last_full_snapshot_micros.load(Ordering::Acquire), } } } @@ -907,6 +1138,14 @@ fn is_conflict_rejection_error(err: &AedbError) -> bool { || matches!(err, AedbError::Validation(msg) if msg.contains("conflict")) } +fn is_permission_rejection_error(err: &AedbError) -> bool { + matches!(err, AedbError::PermissionDenied(_)) +} + +fn is_validation_rejection_error(err: &AedbError) -> bool { + matches!(err, AedbError::Validation(msg) if !msg.contains("conflict")) +} + fn is_coordinator_timeout_error(err: &AedbError) -> bool { matches!(err, AedbError::PartitionLockTimeout) } diff --git a/src/commit/executor/tests.rs b/src/commit/executor/tests.rs index 96f9e86..d164284 100644 --- a/src/commit/executor/tests.rs +++ b/src/commit/executor/tests.rs @@ -646,7 +646,7 @@ fn assertion_read_dependency_conflicts_with_write_token_in_epoch_selection() { pending.push_back(candidate); let mut epoch_writes = 
HashSet::new(); epoch_writes.insert(format!("k:{}:62616c616e6365", namespace_key("p", "app"))); - let idx = super::find_compatible_candidate_index( + let candidate_index = super::find_compatible_candidate_index( &pending, &epoch_writes, &HashSet::new(), @@ -654,7 +654,7 @@ fn assertion_read_dependency_conflicts_with_write_token_in_epoch_selection() { false, ); assert!( - idx.is_none(), + candidate_index.is_none(), "assertion-derived read token must conflict with same-key write token" ); } @@ -1252,8 +1252,8 @@ async fn create_index_builds_existing_rows_synchronously() { let table = snapshot .table_by_namespace_key(&namespace_key("a", "app"), "users") .expect("table snapshot"); - let idx = table.indexes.get("by_age").expect("index materialized"); - let at_25 = idx.scan_eq(&EncodedKey::from_values(&[Value::Integer(25)])); + let age_index = table.indexes.get("by_age").expect("index materialized"); + let at_25 = age_index.scan_eq(&EncodedKey::from_values(&[Value::Integer(25)])); assert!(!at_25.is_empty()); } diff --git a/src/commit/validation.rs b/src/commit/validation.rs index dcd352b..ac12962 100644 --- a/src/commit/validation.rs +++ b/src/commit/validation.rs @@ -11,6 +11,8 @@ use crate::order_book::{ use crate::permission::{CallerContext, Permission}; use crate::query::plan::Expr; use primitive_types::U256; + +const ORDER_BOOK_ID_MAX_LEN: usize = 1024; use serde::{Deserialize, Serialize}; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] @@ -539,14 +541,23 @@ pub fn validate_mutation_with_config( if request.instrument.trim().is_empty() { return Err(AedbError::Validation("instrument cannot be empty".into())); } + if request.instrument.len() > ORDER_BOOK_ID_MAX_LEN { + return Err(AedbError::Validation("instrument too long".into())); + } if request.client_order_id.trim().is_empty() { return Err(AedbError::Validation( "client_order_id cannot be empty".into(), )); } + if request.client_order_id.len() > ORDER_BOOK_ID_MAX_LEN { + return 
Err(AedbError::Validation("client_order_id too long".into())); + } if request.owner.trim().is_empty() { return Err(AedbError::Validation("owner cannot be empty".into())); } + if request.owner.len() > ORDER_BOOK_ID_MAX_LEN { + return Err(AedbError::Validation("owner too long".into())); + } if primitive_types::U256::from_big_endian(&request.qty_be).is_zero() { return Err(AedbError::Validation("qty must be > 0".into())); } @@ -567,13 +578,25 @@ pub fn validate_mutation_with_config( Ok(()) } Mutation::OrderBookCancel { - instrument, owner, .. + instrument, + owner, + client_order_id, + .. } => { if instrument.trim().is_empty() || owner.trim().is_empty() { return Err(AedbError::Validation( "instrument and owner cannot be empty".into(), )); } + if instrument.len() > ORDER_BOOK_ID_MAX_LEN || owner.len() > ORDER_BOOK_ID_MAX_LEN { + return Err(AedbError::Validation("instrument/owner too long".into())); + } + if client_order_id + .as_ref() + .is_some_and(|v| v.len() > ORDER_BOOK_ID_MAX_LEN) + { + return Err(AedbError::Validation("client_order_id too long".into())); + } Ok(()) } Mutation::OrderBookCancelReplace { @@ -1379,14 +1402,14 @@ fn validate_conflict_target( ConflictTarget::PrimaryKey => Ok(()), ConflictTarget::Index(index_name) => { let ns = namespace_key(project_id, scope_id); - let idx = catalog + let index_def = catalog .indexes .get(&(ns, table_name.to_string(), index_name.clone())) .ok_or_else(|| { AedbError::Validation(format!("conflict index does not exist: {index_name}")) })?; if !matches!( - idx.index_type, + index_def.index_type, crate::catalog::schema::IndexType::UniqueHash ) { return Err(AedbError::Validation(format!( @@ -1525,14 +1548,14 @@ fn validate_update_expr(schema: &TableSchema, expr: &UpdateExpr) -> Result<(), A fn extract_primary_key(schema: &TableSchema, row: &Row) -> Result, AedbError> { let mut primary_key = Vec::with_capacity(schema.primary_key.len()); for pk_name in &schema.primary_key { - let idx = schema + let column_index = schema 
.columns .iter() .position(|c| c.name == *pk_name) .ok_or_else(|| { AedbError::Validation(format!("primary key column missing: {pk_name}")) })?; - let value = row.values.get(idx).ok_or_else(|| { + let value = row.values.get(column_index).ok_or_else(|| { AedbError::Validation(format!( "primary key column value missing from row: {pk_name}" )) diff --git a/src/config.rs b/src/config.rs index 7e3bf83..12078d1 100644 --- a/src/config.rs +++ b/src/config.rs @@ -30,10 +30,13 @@ pub struct AedbConfig { pub batch_interval_ms: u64, pub batch_max_bytes: usize, pub idempotency_window_seconds: u64, + pub idempotency_window_commits: u64, pub max_inflight_commits: usize, pub max_commit_queue_bytes: usize, pub max_transaction_bytes: usize, pub commit_timeout_ms: u64, + pub durable_ack_coalescing_enabled: bool, + pub durable_ack_coalesce_window_us: u64, pub max_snapshot_age_ms: u64, pub max_concurrent_snapshots: usize, pub max_scan_rows: usize, @@ -64,6 +67,9 @@ pub struct AedbConfig { /// preventing key disclosure through core dumps or memory scanning. pub checkpoint_encryption_key: Option>>, pub checkpoint_key_id: Option, + /// zstd compression level used for checkpoint payloads. + /// Typical useful range is 0..=19 where lower is faster and larger. + pub checkpoint_compression_level: i32, /// HMAC key for manifest integrity. Wrapped in Arc> to ensure /// the key is securely zeroed from memory when the last reference is dropped. 
pub manifest_hmac_key: Option>>>, @@ -81,10 +87,13 @@ impl Default for AedbConfig { batch_interval_ms: 10, batch_max_bytes: 1024 * 1024, idempotency_window_seconds: 300, + idempotency_window_commits: 100_000, max_inflight_commits: 64, max_commit_queue_bytes: 64 * 1024 * 1024, max_transaction_bytes: 16 * 1024 * 1024, commit_timeout_ms: 5000, + durable_ack_coalescing_enabled: false, + durable_ack_coalesce_window_us: 0, max_snapshot_age_ms: 30_000, max_concurrent_snapshots: 128, max_scan_rows: 10_000, @@ -114,6 +123,7 @@ impl Default for AedbConfig { version_gc_interval_ms: 1_000, checkpoint_encryption_key: None, checkpoint_key_id: None, + checkpoint_compression_level: 3, manifest_hmac_key: None, recovery_mode: RecoveryMode::Strict, hash_chain_required: true, @@ -159,6 +169,8 @@ impl AedbConfig { hash_chain_required: true, batch_interval_ms: 2, batch_max_bytes: 512 * 1024, + durable_ack_coalescing_enabled: true, + durable_ack_coalesce_window_us: 500, epoch_max_wait_us: 50, adaptive_epoch_target_latency_us: 1_000, ..Self::default() @@ -194,4 +206,10 @@ impl AedbConfig { self.manifest_hmac_key = Some(Arc::new(Zeroizing::new(key))); self } + + /// Sets the zstd compression level used for checkpoints. 
+ pub fn with_checkpoint_compression_level(mut self, level: i32) -> Self { + self.checkpoint_compression_level = level; + self + } } diff --git a/src/declarative.rs b/src/declarative.rs index 66fedcc..5c48f60 100644 --- a/src/declarative.rs +++ b/src/declarative.rs @@ -151,35 +151,35 @@ impl TableSpec { } } - for index in &self.indexes { - if index.columns.is_empty() { + for index_def in &self.indexes { + if index_def.columns.is_empty() { return Err(AedbError::Validation(format!( "index '{}' on table '{}' must define at least one column", - index.index_name, self.table_name + index_def.index_name, self.table_name ))); } - for col in &index.columns { + for col in &index_def.columns { if !col_names.contains(col.as_str()) { return Err(AedbError::Validation(format!( "index '{}' on table '{}' references unknown column '{}'", - index.index_name, self.table_name, col + index_def.index_name, self.table_name, col ))); } } } - for index in &self.async_indexes { - if index.projected_columns.is_empty() { + for index_def in &self.async_indexes { + if index_def.projected_columns.is_empty() { return Err(AedbError::Validation(format!( "async index '{}' on table '{}' must project at least one column", - index.index_name, self.table_name + index_def.index_name, self.table_name ))); } - for col in &index.projected_columns { + for col in &index_def.projected_columns { if !col_names.contains(col.as_str()) { return Err(AedbError::Validation(format!( "async index '{}' on table '{}' references unknown column '{}'", - index.index_name, self.table_name, col + index_def.index_name, self.table_name, col ))); } } @@ -201,27 +201,27 @@ impl TableSpec { primary_key: self.primary_key.clone(), }]; - for index in &self.indexes { + for index_def in &self.indexes { ops.push(DdlOperation::CreateIndex { project_id: project_id.to_string(), scope_id: scope_id.to_string(), table_name: self.table_name.clone(), - index_name: index.index_name.clone(), - if_not_exists: index.if_not_exists, - columns: 
index.columns.clone(), - index_type: index.index_type.clone(), - partial_filter: index.partial_filter.clone(), + index_name: index_def.index_name.clone(), + if_not_exists: index_def.if_not_exists, + columns: index_def.columns.clone(), + index_type: index_def.index_type.clone(), + partial_filter: index_def.partial_filter.clone(), }); } - for index in &self.async_indexes { + for index_def in &self.async_indexes { ops.push(DdlOperation::CreateAsyncIndex { project_id: project_id.to_string(), scope_id: scope_id.to_string(), table_name: self.table_name.clone(), - index_name: index.index_name.clone(), - if_not_exists: index.if_not_exists, - projected_columns: index.projected_columns.clone(), + index_name: index_def.index_name.clone(), + if_not_exists: index_def.if_not_exists, + projected_columns: index_def.projected_columns.clone(), }); } @@ -231,22 +231,22 @@ impl TableSpec { pub fn drop_ops(&self, project_id: &str, scope_id: &str, if_exists: bool) -> Vec { let mut ops = Vec::with_capacity(1 + self.indexes.len() + self.async_indexes.len()); - for index in &self.async_indexes { + for index_def in &self.async_indexes { ops.push(DdlOperation::DropAsyncIndex { project_id: project_id.to_string(), scope_id: scope_id.to_string(), table_name: self.table_name.clone(), - index_name: index.index_name.clone(), + index_name: index_def.index_name.clone(), if_exists, }); } - for index in &self.indexes { + for index_def in &self.indexes { ops.push(DdlOperation::DropIndex { project_id: project_id.to_string(), scope_id: scope_id.to_string(), table_name: self.table_name.clone(), - index_name: index.index_name.clone(), + index_name: index_def.index_name.clone(), if_exists, }); } diff --git a/src/lib.rs b/src/lib.rs index 1ea5fbc..82c9df5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -37,7 +37,9 @@ use crate::commit::executor::{CommitExecutor, CommitResult, ExecutorMetrics}; use crate::commit::tx::{ ReadKey, ReadSet, ReadSetEntry, TransactionEnvelope, WriteClass, WriteIntent, }; -use 
crate::commit::validation::{Mutation, TableUpdateExpr, validate_permissions}; +use crate::commit::validation::{ + Mutation, TableUpdateExpr, validate_mutation_with_config, validate_permissions, +}; use crate::config::{AedbConfig, DurabilityMode, RecoveryMode}; use crate::error::AedbError; use crate::error::ResourceType as ErrorResourceType; @@ -49,7 +51,7 @@ use crate::migration::{ }; use crate::order_book::{ ExecInstruction, FillSpec, InstrumentConfig, OrderBookDepth, OrderBookTableMode, OrderRecord, - OrderRequest, OrderSide, Spread, TimeInForce, key_client_id, key_order, + OrderRequest, OrderSide, OrderType, Spread, TimeInForce, key_client_id, key_order, read_last_execution_report, read_open_orders, read_order_status, read_recent_trades, read_spread, read_top_n, scoped_instrument, u256_from_be, }; @@ -64,10 +66,12 @@ use crate::recovery::replay::replay_segments; use crate::recovery::{recover_at_seq_with_config, recover_with_config}; use crate::snapshot::gc::{SnapshotHandle, SnapshotManager}; use crate::snapshot::reader::SnapshotReadView; -use crate::storage::keyspace::{KvEntry, NamespaceId}; +use crate::storage::encoded_key::EncodedKey; +use crate::storage::keyspace::{Keyspace, KvEntry, NamespaceId}; use crate::wal::frame::{FrameError, FrameReader}; use crate::wal::segment::{SEGMENT_HEADER_SIZE, SegmentHeader}; use parking_lot::Mutex; +use serde::{Deserialize, Serialize}; use std::collections::{BTreeSet, HashMap, VecDeque}; use std::fs; use std::fs::File; @@ -75,12 +79,67 @@ use std::io::{BufReader, Read}; use std::ops::Bound; use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::time::{Duration, Instant}; -use tokio::sync::{Semaphore, SemaphorePermit}; +use tokio::sync::Mutex as AsyncMutex; use tracing::{info, warn}; -const CHECKPOINT_GATE_PERMITS: u32 = 1000; +const TRUST_MODE_MARKER_FILE: &str = "trust_mode.json"; + +#[derive(Debug, Clone, 
Serialize, Deserialize, Default)] +struct TrustModeMarker { + #[serde(default)] + ever_non_strict_recovery: bool, + #[serde(default)] + ever_hash_chain_disabled: bool, +} + +fn trust_mode_marker_path(dir: &Path) -> PathBuf { + dir.join(TRUST_MODE_MARKER_FILE) +} + +fn load_trust_mode_marker(dir: &Path) -> Result, AedbError> { + let path = trust_mode_marker_path(dir); + if !path.exists() { + return Ok(None); + } + let bytes = fs::read(&path)?; + let marker: TrustModeMarker = + serde_json::from_slice(&bytes).map_err(|e| AedbError::Validation(e.to_string()))?; + Ok(Some(marker)) +} + +fn persist_trust_mode_marker(dir: &Path, marker: &TrustModeMarker) -> Result<(), AedbError> { + let bytes = serde_json::to_vec(marker).map_err(|e| AedbError::Encode(e.to_string()))?; + fs::write(trust_mode_marker_path(dir), bytes)?; + Ok(()) +} + +fn enforce_and_record_trust_mode(dir: &Path, config: &AedbConfig) -> Result<(), AedbError> { + let mut marker = load_trust_mode_marker(dir)?.unwrap_or_default(); + if config.strict_recovery() + && (marker.ever_non_strict_recovery || marker.ever_hash_chain_disabled) + { + return Err(AedbError::Validation( + "strict open denied: data directory was previously opened with non-strict recovery or hash-chain disabled" + .into(), + )); + } + + let mut changed = false; + if !config.strict_recovery() && !marker.ever_non_strict_recovery { + marker.ever_non_strict_recovery = true; + changed = true; + } + if !config.hash_chain_required && !marker.ever_hash_chain_disabled { + marker.ever_hash_chain_disabled = true; + changed = true; + } + if changed { + persist_trust_mode_marker(dir, &marker)?; + } + Ok(()) +} /// Creates a directory with restrictive permissions (0o700 on Unix) to prevent /// unauthorized access to database files on multi-user systems. @@ -120,16 +179,16 @@ pub struct AedbInstance { require_authenticated_calls: bool, dir: PathBuf, executor: CommitExecutor, - /// Fast-path shutdown check using atomic flag to avoid RwLock contention. 
- /// Set to true when checkpoint or shutdown is in progress. - checkpoint_in_progress: Arc, - /// Semaphore to control checkpoint exclusivity. Normal operations acquire - /// a permit in shared mode; checkpoint acquires all permits for exclusivity. - checkpoint_gate: Arc, + /// Serializes checkpoint writers without blocking commit/query traffic. + checkpoint_lock: Arc>, snapshot_manager: Arc>, recovery_cache: Arc>, lifecycle_hooks: Arc>>>, telemetry_hooks: Arc>>>, + upstream_validation_rejections: Arc, + durable_wait_ops: Arc, + durable_wait_micros: Arc, + durable_ack_fsync_leader: Arc, startup_recovery_micros: u64, startup_recovered_seq: u64, } @@ -146,12 +205,27 @@ pub enum CommitFinality { pub struct OperationalMetrics { pub commits_total: u64, pub commit_errors: u64, + pub permission_rejections: u64, + pub validation_rejections: u64, + pub queue_full_rejections: u64, + pub timeout_rejections: u64, pub conflict_rejections: u64, pub read_set_conflicts: u64, pub conflict_rate: f64, pub avg_commit_latency_micros: u64, pub coordinator_apply_attempts: u64, pub avg_coordinator_apply_micros: u64, + pub wal_append_ops: u64, + pub wal_append_bytes: u64, + pub avg_wal_append_micros: u64, + pub wal_sync_ops: u64, + pub avg_wal_sync_micros: u64, + pub prestage_validate_ops: u64, + pub avg_prestage_validate_micros: u64, + pub epoch_process_ops: u64, + pub avg_epoch_process_micros: u64, + pub durable_wait_ops: u64, + pub avg_durable_wait_micros: u64, pub inflight_commits: usize, pub queue_depth: usize, pub durable_head_lag: u64, @@ -219,7 +293,7 @@ pub struct MigrationReport { pub current_version: u64, } -#[derive(Debug, Clone, PartialEq, Eq)] +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub enum LifecycleEvent { ProjectCreated { project_id: String, @@ -580,10 +654,13 @@ impl AedbInstance { batch_interval_ms = config.batch_interval_ms, batch_max_bytes = config.batch_max_bytes, idempotency_window_seconds = config.idempotency_window_seconds, + 
idempotency_window_commits = config.idempotency_window_commits, max_inflight_commits = config.max_inflight_commits, max_commit_queue_bytes = config.max_commit_queue_bytes, max_transaction_bytes = config.max_transaction_bytes, commit_timeout_ms = config.commit_timeout_ms, + durable_ack_coalescing_enabled = config.durable_ack_coalescing_enabled, + durable_ack_coalesce_window_us = config.durable_ack_coalesce_window_us, max_snapshot_age_ms = config.max_snapshot_age_ms, max_concurrent_snapshots = config.max_concurrent_snapshots, max_scan_rows = config.max_scan_rows, @@ -607,6 +684,7 @@ impl AedbInstance { epoch_apply_timeout_ms = config.epoch_apply_timeout_ms, checkpoint_encryption_enabled = config.checkpoint_encryption_key.is_some(), checkpoint_key_id = config.checkpoint_key_id.as_deref().unwrap_or(""), + checkpoint_compression_level = config.checkpoint_compression_level, manifest_hmac_enabled = config.manifest_hmac_key.is_some(), recovery_mode = ?config.recovery_mode, hash_chain_required = config.hash_chain_required, @@ -614,6 +692,7 @@ impl AedbInstance { "aedb config" ); create_private_dir_all(dir)?; + enforce_and_record_trust_mode(dir, &config)?; let has_existing = fs::read_dir(dir)? 
.filter_map(|e| e.ok()) .any(|e| e.file_name().to_string_lossy().contains(".aedb")); @@ -663,26 +742,20 @@ impl AedbInstance { require_authenticated_calls, dir: dir.to_path_buf(), executor, - checkpoint_in_progress: Arc::new(AtomicBool::new(false)), - checkpoint_gate: Arc::new(Semaphore::new(CHECKPOINT_GATE_PERMITS as usize)), + checkpoint_lock: Arc::new(AsyncMutex::new(())), snapshot_manager: Arc::new(Mutex::new(SnapshotManager::default())), recovery_cache: Arc::new(Mutex::new(RecoveryCache::default())), lifecycle_hooks: Arc::new(Mutex::new(Vec::new())), telemetry_hooks: Arc::new(Mutex::new(Vec::new())), + upstream_validation_rejections: Arc::new(AtomicU64::new(0)), + durable_wait_ops: Arc::new(AtomicU64::new(0)), + durable_wait_micros: Arc::new(AtomicU64::new(0)), + durable_ack_fsync_leader: Arc::new(AtomicBool::new(false)), startup_recovery_micros, startup_recovered_seq, }) } - async fn acquire_checkpoint_permit(&self) -> Result, AedbError> { - self.checkpoint_gate - .acquire() - .await - .map_err(|_| AedbError::Unavailable { - message: "checkpoint gate is closed".into(), - }) - } - pub async fn commit(&self, mutation: Mutation) -> Result { let started = Instant::now(); if self.require_authenticated_calls { @@ -693,18 +766,11 @@ impl AedbInstance { // Early size validation to prevent DoS via oversized keys/values crate::commit::validation::validate_kv_sizes_early(&mutation, &self._config)?; - // Fast-path atomic check avoids semaphore contention during normal operation - if self.checkpoint_in_progress.load(Ordering::Acquire) { - return Err(AedbError::CheckpointInProgress); - } - let _permit = self.acquire_checkpoint_permit().await?; - let lifecycle_events = self - .plan_lifecycle_events(std::slice::from_ref(&mutation)) - .await?; let result = self.executor.submit(mutation).await; self.emit_commit_telemetry("commit", started, &result); let result = result?; - self.dispatch_lifecycle_events(lifecycle_events, result.commit_seq); + 
self.dispatch_lifecycle_events_for_commit(result.commit_seq) + .await; Ok(result) } @@ -720,10 +786,6 @@ impl AedbInstance { )); } crate::commit::validation::validate_kv_sizes_early(&mutation, &self._config)?; - if self.checkpoint_in_progress.load(Ordering::Acquire) { - return Err(AedbError::CheckpointInProgress); - } - let _permit = self.acquire_checkpoint_permit().await?; let result = self.executor.submit_prevalidated(mutation).await; self.emit_commit_telemetry(op_name, started, &result); result @@ -827,17 +889,11 @@ impl AedbInstance { // Early size validation to prevent DoS via oversized keys/values crate::commit::validation::validate_kv_sizes_early(&mutation, &self._config)?; - if self.checkpoint_in_progress.load(Ordering::Acquire) { - return Err(AedbError::CheckpointInProgress); - } - let _permit = self.acquire_checkpoint_permit().await?; - let lifecycle_events = self - .plan_lifecycle_events(std::slice::from_ref(&mutation)) - .await?; let result = self.executor.submit_as(Some(caller), mutation).await; self.emit_commit_telemetry("commit_as", started, &result); let result = result?; - self.dispatch_lifecycle_events(lifecycle_events, result.commit_seq); + self.dispatch_lifecycle_events_for_commit(result.commit_seq) + .await; Ok(result) } @@ -884,17 +940,11 @@ impl AedbInstance { } ensure_external_caller_allowed(caller)?; } - if self.checkpoint_in_progress.load(Ordering::Acquire) { - return Err(AedbError::CheckpointInProgress); - } - let _permit = self.acquire_checkpoint_permit().await?; - let lifecycle_events = self - .plan_lifecycle_events(&envelope.write_intent.mutations) - .await?; let result = self.executor.submit_envelope(envelope).await; self.emit_commit_telemetry("commit_envelope", started, &result); let result = result?; - self.dispatch_lifecycle_events(lifecycle_events, result.commit_seq); + self.dispatch_lifecycle_events_for_commit(result.commit_seq) + .await; Ok(result) } @@ -916,36 +966,119 @@ impl AedbInstance { if matches!(finality, 
CommitFinality::Durable) && result.durable_head_seq < result.commit_seq { - self.wait_for_durable(result.commit_seq).await?; - result.durable_head_seq = self.executor.durable_head_seq().await; + let wait_started = Instant::now(); + if self._config.durable_ack_coalescing_enabled + && matches!(self._config.durability_mode, DurabilityMode::Batch) + { + let window_us = self._config.durable_ack_coalesce_window_us; + if window_us > 0 { + tokio::time::sleep(Duration::from_micros(window_us)).await; + } + if self.executor.durable_head_seq_now() < result.commit_seq { + // Give the periodic batch flusher a chance to satisfy durable + // finality first; only force fsync if it misses this window. + let grace_wait_us = window_us.max( + self._config + .batch_interval_ms + .saturating_mul(1000) + .saturating_mul(2), + ); + if grace_wait_us > 0 { + let _ = tokio::time::timeout( + Duration::from_micros(grace_wait_us), + self.wait_for_durable(result.commit_seq), + ) + .await; + } + + if self.executor.durable_head_seq_now() < result.commit_seq { + let recently_synced = grace_wait_us > 0 + && self + .executor + .last_wal_sync_age_us() + .is_some_and(|age| age < grace_wait_us); + if recently_synced { + let _ = tokio::time::timeout( + Duration::from_micros(grace_wait_us), + self.wait_for_durable(result.commit_seq), + ) + .await; + } + } + + if self.executor.durable_head_seq_now() < result.commit_seq { + if self + .durable_ack_fsync_leader + .compare_exchange(false, true, Ordering::AcqRel, Ordering::Acquire) + .is_ok() + { + struct LeaderGuard<'a>(&'a AtomicBool); + impl Drop for LeaderGuard<'_> { + fn drop(&mut self) { + self.0.store(false, Ordering::Release); + } + } + let _guard = LeaderGuard(&self.durable_ack_fsync_leader); + let _ = self.force_fsync().await?; + } else { + self.wait_for_durable(result.commit_seq).await?; + } + } + } + } else { + self.wait_for_durable(result.commit_seq).await?; + } + self.durable_wait_ops.fetch_add(1, Ordering::Relaxed); + self.durable_wait_micros + 
.fetch_add(wait_started.elapsed().as_micros() as u64, Ordering::Relaxed); + result.durable_head_seq = self.executor.durable_head_seq_now(); } Ok(()) } - async fn plan_lifecycle_events( - &self, - mutations: &[Mutation], - ) -> Result, AedbError> { - if !mutations.iter().any(|m| matches!(m, Mutation::Ddl(_))) { - return Ok(Vec::new()); + async fn dispatch_lifecycle_events_for_commit(&self, commit_seq: u64) { + if self.lifecycle_hooks.lock().is_empty() { + return; } - let (_, catalog, _) = self.executor.snapshot_state().await; - let mut planned_catalog = catalog; - let mut events = Vec::new(); - for mutation in mutations { - let Mutation::Ddl(op) = mutation else { - continue; - }; - let applied = ddl_would_apply(&planned_catalog, op); - planned_catalog.apply_ddl(op.clone())?; - if applied && let Some(event) = lifecycle_template_for_ddl(op) { - events.push(event); + let events = match self.read_lifecycle_events_for_commit(commit_seq).await { + Ok(events) => events, + Err(err) => { + warn!(commit_seq, error = ?err, "failed to read lifecycle outbox"); + return; } - } - Ok(events) + }; + self.dispatch_lifecycle_events(events); } - fn dispatch_lifecycle_events(&self, events: Vec, seq: u64) { + async fn read_lifecycle_events_for_commit( + &self, + commit_seq: u64, + ) -> Result, AedbError> { + let ns = namespace_key(crate::catalog::SYSTEM_PROJECT_ID, "app"); + let table_key = (ns.clone(), "lifecycle_outbox".to_string()); + let (snapshot, catalog, _) = self.executor.snapshot_state().await; + let Some(schema) = catalog.tables.get(&table_key) else { + return Ok(Vec::new()); + }; + let Some(events_idx) = schema.columns.iter().position(|c| c.name == "events") else { + return Ok(Vec::new()); + }; + let Some(table) = + snapshot.table(crate::catalog::SYSTEM_PROJECT_ID, "app", "lifecycle_outbox") + else { + return Ok(Vec::new()); + }; + let encoded_pk = EncodedKey::from_values(&[Value::Integer(commit_seq as i64)]); + let Some(row) = table.rows.get(&encoded_pk) else { + return 
Ok(Vec::new()); + }; + let Some(Value::Json(events_json)) = row.values.get(events_idx) else { + return Ok(Vec::new()); + }; + serde_json::from_str(events_json.as_str()).map_err(|e| AedbError::Decode(e.to_string())) + } + + fn dispatch_lifecycle_events(&self, events: Vec) { if events.is_empty() { return; } @@ -954,8 +1087,7 @@ impl AedbInstance { return; } tokio::spawn(async move { - for template in events { - let event = template.with_seq(seq); + for event in events { for hook in &hooks { if std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { hook.on_event(&event) @@ -2791,15 +2923,17 @@ impl AedbInstance { scope_id: &str, request: OrderRequest, ) -> Result { - self.commit_prevalidated_internal( - "order_book_new", - Mutation::OrderBookNew { - project_id: project_id.to_string(), - scope_id: scope_id.to_string(), - request, - }, - ) - .await + self.preflight_order_book_new_if_high_reject_risk(None, project_id, scope_id, &request) + .await?; + let mutation = Mutation::OrderBookNew { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + request, + }; + let (_, catalog, _) = self.executor.snapshot_state().await; + validate_mutation_with_config(&catalog, &mutation, &self._config)?; + self.commit_prevalidated_internal("order_book_new", mutation) + .await } pub async fn order_book_new_with_finality( @@ -2809,16 +2943,9 @@ impl AedbInstance { request: OrderRequest, finality: CommitFinality, ) -> Result { - self.commit_prevalidated_internal_with_finality( - "order_book_new", - Mutation::OrderBookNew { - project_id: project_id.to_string(), - scope_id: scope_id.to_string(), - request, - }, - finality, - ) - .await + let mut result = self.order_book_new(project_id, scope_id, request).await?; + self.enforce_finality(&mut result, finality).await?; + Ok(result) } pub async fn order_book_define_table( @@ -2992,6 +3119,13 @@ impl AedbInstance { scope_id: &str, request: OrderRequest, ) -> Result { + self.preflight_order_book_new_if_high_reject_risk( + 
Some(&caller), + project_id, + scope_id, + &request, + ) + .await?; self.commit_as( caller, Mutation::OrderBookNew { @@ -3012,19 +3146,46 @@ impl AedbInstance { finality: CommitFinality, ) -> Result { let mut result = self - .commit_as( - caller, - Mutation::OrderBookNew { - project_id: project_id.to_string(), - scope_id: scope_id.to_string(), - request, - }, - ) + .order_book_new_as(caller, project_id, scope_id, request) .await?; self.enforce_finality(&mut result, finality).await?; Ok(result) } + fn should_preflight_order_book_new(request: &OrderRequest) -> bool { + request.exec_instructions.post_only() + || matches!(request.time_in_force, TimeInForce::Fok) + || matches!(request.order_type, OrderType::Market) + } + + async fn preflight_order_book_new_if_high_reject_risk( + &self, + caller: Option<&CallerContext>, + project_id: &str, + scope_id: &str, + request: &OrderRequest, + ) -> Result<(), AedbError> { + if !Self::should_preflight_order_book_new(request) { + return Ok(()); + } + let mutation = Mutation::OrderBookNew { + project_id: project_id.to_string(), + scope_id: scope_id.to_string(), + request: request.clone(), + }; + let preflight_result = if let Some(caller) = caller { + self.preflight_as(caller, mutation).await? 
+ } else { + self.preflight(mutation).await + }; + if let PreflightResult::Err { reason } = preflight_result { + self.upstream_validation_rejections + .fetch_add(1, Ordering::Relaxed); + return Err(AedbError::Validation(reason)); + } + Ok(()) + } + pub async fn order_book_cancel( &self, project_id: &str, @@ -3946,6 +4107,18 @@ impl AedbInstance { .acquire_snapshot(consistency) .await .map_err(QueryError::from)?; + let prefix = format!("ob:{instrument}:"); + if !lease.view.catalog.has_kv_read_permission( + &caller.caller_id, + project_id, + scope_id, + prefix.as_bytes(), + ) { + return Err(QueryError::PermissionDenied { + permission: format!("KvRead({project_id}.{scope_id})"), + scope: caller.caller_id.clone(), + }); + } let admin = lease .view .catalog @@ -5034,54 +5207,46 @@ impl AedbInstance { } pub async fn shutdown(&self) -> Result<(), AedbError> { + // Ensure batch durability tails are flushed before checkpointing shutdown state. + let _ = self.force_fsync().await?; let _ = self.checkpoint_now().await?; Ok(()) } pub async fn checkpoint_now(&self) -> Result { - // Set checkpoint flag to fast-fail new operations, then acquire all permits - // to ensure exclusivity (wait for in-flight operations to complete) - self.checkpoint_in_progress.store(true, Ordering::Release); - - // Ensure flag is reset even if we return early with error - struct ResetGuard<'a>(&'a AtomicBool); - impl Drop for ResetGuard<'_> { - fn drop(&mut self) { - self.0.store(false, Ordering::Release); - } - } - let _reset_guard = ResetGuard(&self.checkpoint_in_progress); - - let _all_permits = self - .checkpoint_gate - .acquire_many(CHECKPOINT_GATE_PERMITS) - .await - .map_err(|_| AedbError::Unavailable { - message: "checkpoint gate is closed".into(), - })?; - - let _ = self.executor.force_fsync().await?; - let (snapshot, catalog, seq) = self.executor.snapshot_state().await; - let idempotency = self.executor.idempotency_snapshot().await; - let heads = self.executor.head_state().await; + // 
Serialize checkpoints while allowing normal commits/queries to continue. + let _checkpoint_guard = self.checkpoint_lock.lock().await; + + // In batch durability mode, flush un-synced WAL tail so checkpoint captures a + // stable durable horizon and recovery does not lose committed tail entries. + let _ = self.force_fsync().await?; + + // Anchor checkpoint to a stable committed horizon. + let seq = self.executor.durable_head_seq_now(); + let lease = self.acquire_snapshot(ConsistencyMode::AtSeq(seq)).await?; + let snapshot = lease.view.keyspace.as_ref(); + let catalog = lease.view.catalog.as_ref(); + let mut idempotency = self.executor.idempotency_snapshot().await; + idempotency.retain(|_, record| record.commit_seq <= seq); let checkpoint = write_checkpoint_with_key( - &snapshot, - &catalog, + snapshot, + catalog, seq, &self.dir, self._config.checkpoint_key(), self._config.checkpoint_key_id.clone(), idempotency, + self._config.checkpoint_compression_level, )?; - let segments = read_segments(&self.dir)?; + let segments = read_segments_for_checkpoint(&self.dir, seq)?; let active_segment_seq = segments .last() .map(|segment| segment.segment_seq) - .unwrap_or(0); + .unwrap_or(seq.saturating_add(1)); let manifest = Manifest { - durable_seq: heads.durable_head_seq, - visible_seq: heads.visible_head_seq, + durable_seq: seq, + visible_seq: seq, active_segment_seq, checkpoints: vec![checkpoint], segments, @@ -5106,6 +5271,7 @@ impl AedbInstance { self._config.checkpoint_key(), self._config.checkpoint_key_id.clone(), idempotency, + self._config.checkpoint_compression_level, )?; let mut wal_segments = Vec::new(); @@ -5312,6 +5478,7 @@ impl AedbInstance { config.checkpoint_key(), config.checkpoint_key_id.clone(), idempotency, + config.checkpoint_compression_level, )?; let restored_manifest = Manifest { durable_seq: restored_seq, @@ -5360,31 +5527,33 @@ impl AedbInstance { let ns_key = namespace_key(project_id, scope_id); let ns_id = NamespaceId::project_scope(project_id, 
scope_id); - let mut merged_keyspace = live.keyspace.clone(); - match restored.keyspace.namespaces.get(&ns_id) { - Some(namespace) => { - Arc::make_mut(&mut merged_keyspace.namespaces) - .insert(ns_id.clone(), namespace.clone()); - } - None => { - Arc::make_mut(&mut merged_keyspace.namespaces).remove(&ns_id); - } + let mut merged_namespaces = live + .keyspace + .namespaces + .iter() + .filter(|(ns, _)| **ns != ns_id) + .map(|(ns, namespace)| (ns.clone(), namespace.clone())) + .collect::>(); + if let Some(namespace) = restored.keyspace.namespaces.get(&ns_id) { + merged_namespaces.insert(ns_id.clone(), namespace.clone()); } - let async_keys: Vec<(NamespaceId, String, String)> = merged_keyspace + let mut merged_async_indexes = live + .keyspace .async_indexes - .keys() - .filter(|(ns, _, _)| *ns == ns_id) - .cloned() - .collect(); - for key in async_keys { - Arc::make_mut(&mut merged_keyspace.async_indexes).remove(&key); - } + .iter() + .filter(|(key, _)| key.0 != ns_id) + .map(|(key, value)| (key.clone(), value.clone())) + .collect::>(); for (key, value) in restored.keyspace.async_indexes.iter() { if key.0 == ns_id { - Arc::make_mut(&mut merged_keyspace.async_indexes) - .insert(key.clone(), value.clone()); + merged_async_indexes.insert(key.clone(), value.clone()); } } + let merged_keyspace = Keyspace { + primary_index_backend: live.keyspace.primary_index_backend, + namespaces: Arc::new(merged_namespaces.into()), + async_indexes: Arc::new(merged_async_indexes.into()), + }; let mut merged_catalog = live.catalog.clone(); if let Some(project) = restored.catalog.projects.get(project_id) { @@ -5459,6 +5628,7 @@ impl AedbInstance { config.checkpoint_key(), config.checkpoint_key_id.clone(), live.idempotency, + config.checkpoint_compression_level, )?; let manifest = Manifest { durable_seq: merged_seq, @@ -5493,15 +5663,41 @@ impl AedbInstance { } else { core.conflict_rejections as f64 / core.commits_total as f64 }; + let durable_wait_ops = 
self.durable_wait_ops.load(Ordering::Relaxed); + let durable_wait_micros = self.durable_wait_micros.load(Ordering::Relaxed); + let avg_durable_wait_micros = if durable_wait_ops == 0 { + 0 + } else { + durable_wait_micros / durable_wait_ops + }; + let upstream_validation_rejections = + self.upstream_validation_rejections.load(Ordering::Relaxed); OperationalMetrics { commits_total: core.commits_total, commit_errors: core.commit_errors, + permission_rejections: core.permission_rejections, + validation_rejections: core + .validation_rejections + .saturating_add(upstream_validation_rejections), + queue_full_rejections: core.queue_full_rejections, + timeout_rejections: core.timeout_rejections, conflict_rejections: core.conflict_rejections, read_set_conflicts: core.read_set_conflicts, conflict_rate, avg_commit_latency_micros: core.avg_commit_latency_micros, coordinator_apply_attempts: core.coordinator_apply_attempts, avg_coordinator_apply_micros: core.avg_coordinator_apply_micros, + wal_append_ops: core.wal_append_ops, + wal_append_bytes: core.wal_append_bytes, + avg_wal_append_micros: core.avg_wal_append_micros, + wal_sync_ops: core.wal_sync_ops, + avg_wal_sync_micros: core.avg_wal_sync_micros, + prestage_validate_ops: core.prestage_validate_ops, + avg_prestage_validate_micros: core.avg_prestage_validate_micros, + epoch_process_ops: core.epoch_process_ops, + avg_epoch_process_micros: core.avg_epoch_process_micros, + durable_wait_ops, + avg_durable_wait_micros, inflight_commits: core.inflight_commits, queue_depth: core.queued_commits, durable_head_lag: runtime @@ -5810,7 +6006,7 @@ impl AedbInstance { .primary_key .iter() .map(|pk_name| { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *pk_name) @@ -5819,7 +6015,7 @@ impl AedbInstance { "primary key column missing: {pk_name}" )) })?; - Ok(new_row.values[idx].clone()) + Ok(new_row.values[column_index].clone()) }) .collect::, AedbError>>()?; self.commit(Mutation::Upsert { @@ -5863,14 
+6059,14 @@ impl AedbInstance { .unwrap_or(0); let mut updated = 0u64; let start_offset = start_offset.min(rows.len()); - for (idx, chunk) in rows[start_offset..].chunks(batch_size).enumerate() { + for (chunk_index, chunk) in rows[start_offset..].chunks(batch_size).enumerate() { for row in chunk { if let Some(new_row) = update(row) { let primary_key = schema .primary_key .iter() .map(|pk_name| { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *pk_name) @@ -5879,7 +6075,7 @@ impl AedbInstance { "primary key column missing: {pk_name}" )) })?; - Ok(new_row.values[idx].clone()) + Ok(new_row.values[column_index].clone()) }) .collect::, AedbError>>()?; self.commit(Mutation::Upsert { @@ -5893,7 +6089,8 @@ impl AedbInstance { updated += 1; } } - let progressed = start_offset + ((idx + 1) * batch_size).min(rows.len() - start_offset); + let progressed = + start_offset + ((chunk_index + 1) * batch_size).min(rows.len() - start_offset); self.commit(Mutation::KvSet { project_id: project_id.to_string(), scope_id: scope_id.to_string(), diff --git a/src/lib_helpers.rs b/src/lib_helpers.rs index 65287ee..eb8fe87 100644 --- a/src/lib_helpers.rs +++ b/src/lib_helpers.rs @@ -176,6 +176,25 @@ pub(crate) fn authorize_and_bind_query_for_caller( ensure_query_caller_allowed(caller)?; let (base_project_id, base_scope_id, base_table_name) = resolve_query_table_ref(project_id, scope_id, &query.table); + let base_alias = query + .table_alias + .clone() + .unwrap_or_else(|| base_table_name.clone()); + let mut seen_aliases = std::collections::BTreeSet::new(); + if !seen_aliases.insert(base_alias.clone()) { + return Err(QueryError::InvalidQuery { + reason: format!("duplicate table alias in query: {base_alias}"), + }); + } + for join in &query.joins { + let (_, _, join_table_name) = resolve_query_table_ref(project_id, scope_id, &join.table); + let join_alias = join.alias.clone().unwrap_or(join_table_name); + if !seen_aliases.insert(join_alias.clone()) { + 
return Err(QueryError::InvalidQuery { + reason: format!("duplicate table alias in query: {join_alias}"), + }); + } + } let required = if let Some(index_name) = &options.async_index { Permission::IndexRead { project_id: base_project_id.clone(), @@ -211,26 +230,40 @@ pub(crate) fn authorize_and_bind_query_for_caller( }); } } + let caller_id = caller.caller_id.as_str(); + let has_policy_bypass = |project_id: &str, table_name: &str| { + catalog.has_permission( + caller_id, + &Permission::PolicyBypass { + project_id: project_id.to_string(), + table_name: Some(table_name.to_string()), + }, + ) || catalog.has_permission( + caller_id, + &Permission::PolicyBypass { + project_id: project_id.to_string(), + table_name: None, + }, + ) + }; let mut policies = Vec::new(); - if let Some(policy) = - catalog.read_policy_for_table(&base_project_id, &base_scope_id, &base_table_name) + if !has_policy_bypass(&base_project_id, &base_table_name) + && let Some(policy) = + catalog.read_policy_for_table(&base_project_id, &base_scope_id, &base_table_name) { let bound_policy = bind_policy_expr(&policy, &caller.caller_id); if query.joins.is_empty() { policies.push(bound_policy); } else { - let base_alias = query - .table_alias - .clone() - .unwrap_or_else(|| base_table_name.clone()); policies.push(qualify_policy_columns(&bound_policy, &base_alias)); } } for join in &query.joins { let (join_project_id, join_scope_id, join_table_name) = resolve_query_table_ref(project_id, scope_id, &join.table); - if let Some(policy) = - catalog.read_policy_for_table(&join_project_id, &join_scope_id, &join_table_name) + if !has_policy_bypass(&join_project_id, &join_table_name) + && let Some(policy) = + catalog.read_policy_for_table(&join_project_id, &join_scope_id, &join_table_name) { let bound_policy = bind_policy_expr(&policy, &caller.caller_id); let join_alias = join.alias.clone().unwrap_or(join_table_name); @@ -439,6 +472,11 @@ pub(crate) fn validate_config(config: &AedbConfig) -> Result<(), AedbError> { 
message: "checkpoint_key_id requires checkpoint_encryption_key".into(), }); } + if !(-7..=22).contains(&config.checkpoint_compression_level) { + return Err(AedbError::InvalidConfig { + message: "checkpoint_compression_level must be between -7 and 22".into(), + }); + } if let Some(key) = &config.manifest_hmac_key && key.is_empty() { @@ -466,9 +504,9 @@ pub(crate) fn validate_secure_config(config: &AedbConfig) -> Result<(), AedbErro message: "secure mode requires strict recovery".into(), }); } - if matches!(config.durability_mode, DurabilityMode::OsBuffered) { + if !matches!(config.durability_mode, DurabilityMode::Full) { return Err(AedbError::InvalidConfig { - message: "secure mode forbids OsBuffered durability mode".into(), + message: "secure mode requires DurabilityMode::Full for crash-safe durability".into(), }); } if !config.hash_chain_required { @@ -497,9 +535,11 @@ pub fn validate_arcana_config(config: &AedbConfig) -> Result<(), AedbError> { message: "Arcana production profile requires strict recovery".into(), }); } - if matches!(config.durability_mode, DurabilityMode::OsBuffered) { + if !matches!(config.durability_mode, DurabilityMode::Full) { return Err(AedbError::InvalidConfig { - message: "Arcana production profile forbids OsBuffered durability mode".into(), + message: + "Arcana production profile requires DurabilityMode::Full for crash-safe durability" + .into(), }); } if !config.hash_chain_required { @@ -675,14 +715,14 @@ pub(crate) fn extract_primary_key_values( ) -> Result, AedbError> { let mut primary_key = Vec::with_capacity(schema.primary_key.len()); for pk_name in &schema.primary_key { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *pk_name) .ok_or_else(|| { AedbError::Validation(format!("primary key column missing: {pk_name}")) })?; - let value = row.values.get(idx).ok_or_else(|| { + let value = row.values.get(column_index).ok_or_else(|| { AedbError::Validation(format!( "row missing primary key value for 
column: {pk_name}" )) @@ -1272,6 +1312,36 @@ pub(crate) fn read_segments(dir: &Path) -> Result, AedbError> { Ok(out) } +pub(crate) fn read_segments_for_checkpoint( + dir: &Path, + checkpoint_seq: u64, +) -> Result, AedbError> { + let segments = read_segments(dir)?; + let last_segment = segments.last().cloned(); + let mut filtered = Vec::with_capacity(segments.len()); + for segment in segments { + let path = dir.join(&segment.filename); + let keep = match scan_segment_seq_range(&path)? { + Some((_, max_seq)) => max_seq > checkpoint_seq, + None => false, + }; + if keep { + filtered.push(segment); + } + } + + // Keep at least the current active segment so manifest metadata remains anchored + // even if all observed frames are covered by the checkpoint. + if filtered.is_empty() + && let Some(last) = last_segment + { + filtered.push(last); + } + + filtered.sort_by_key(|segment| segment.segment_seq); + Ok(filtered) +} + pub(crate) fn segment_seq_from_name(name: &str) -> Option { if !name.starts_with("segment_") || !name.ends_with(".aedbwal") { return None; @@ -1323,28 +1393,28 @@ pub(crate) fn validate_backup_chain(chain: &[(PathBuf, BackupManifest)]) -> Resu "backup chain must start with a full backup".into(), )); } - for idx in 1..chain.len() { - let prev = &chain[idx - 1].1; - let cur = &chain[idx].1; + for chain_index in 1..chain.len() { + let prev = &chain[chain_index - 1].1; + let cur = &chain[chain_index].1; if cur.backup_type != "incremental" { return Err(AedbError::Validation(format!( - "chain entry {idx} is not incremental" + "chain entry {chain_index} is not incremental" ))); } if cur.parent_backup_id.as_deref() != Some(prev.backup_id.as_str()) { return Err(AedbError::Validation(format!( - "chain entry {idx} parent mismatch" + "chain entry {chain_index} parent mismatch" ))); } let expected_from = prev.wal_head_seq.saturating_add(1); if cur.from_seq != Some(expected_from) { return Err(AedbError::Validation(format!( - "chain entry {idx} from_seq mismatch" + 
"chain entry {chain_index} from_seq mismatch" ))); } if cur.wal_head_seq < expected_from.saturating_sub(1) { return Err(AedbError::Validation(format!( - "chain entry {idx} wal_head_seq invalid" + "chain entry {chain_index} wal_head_seq invalid" ))); } } diff --git a/src/lib_tests.rs b/src/lib_tests.rs index 94a261a..69306b9 100644 --- a/src/lib_tests.rs +++ b/src/lib_tests.rs @@ -19,7 +19,6 @@ use std::fs; use std::ops::Bound; use std::path::{Path, PathBuf}; use std::sync::Arc; -use std::sync::atomic::Ordering; use std::time::{Duration, Instant}; use tempfile::tempdir; @@ -2078,6 +2077,371 @@ async fn read_policy_applies_to_joined_tables() { assert_eq!(result.rows.len(), 1); } +#[tokio::test] +async fn table_policy_bypass_permission_skips_row_policy_filtering() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + db.commit(Mutation::Ddl(DdlOperation::CreateTable { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + owner_id: None, + if_not_exists: false, + columns: vec![ + ColumnDef { + name: "id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "owner".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + primary_key: vec!["id".into()], + })) + .await + .expect("users"); + db.commit(Mutation::Upsert { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + primary_key: vec![Value::Integer(1)], + row: Row { + values: vec![Value::Integer(1), Value::Text("reader".into())], + }, + }) + .await + .expect("seed user 1"); + db.commit(Mutation::Upsert { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + primary_key: vec![Value::Integer(2)], + row: Row { + values: vec![Value::Integer(2), Value::Text("bob".into())], + }, + }) + .await + .expect("seed user 2"); + 
db.commit(Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: "reader".into(), + permission: Permission::TableRead { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + }, + actor_id: None, + delegable: false, + })) + .await + .expect("grant table read"); + db.commit(Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: "reader".into(), + permission: Permission::PolicyBypass { + project_id: "p".into(), + table_name: Some("users".into()), + }, + actor_id: None, + delegable: false, + })) + .await + .expect("grant policy bypass"); + db.set_read_policy( + "p", + "app", + "users", + Expr::Eq("owner".into(), Value::Text("$caller_id".into())), + ) + .await + .expect("set policy"); + + let reader = CallerContext::new("reader"); + let result = db + .query_with_options_as( + Some(&reader), + "p", + "app", + Query::select(&["*"]).from("users").limit(10), + QueryOptions::default(), + ) + .await + .expect("query with policy bypass"); + assert_eq!(result.rows.len(), 2); +} + +#[tokio::test] +async fn project_policy_bypass_permission_skips_joined_table_policies() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + db.commit(Mutation::Ddl(DdlOperation::CreateTable { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + owner_id: None, + if_not_exists: false, + columns: vec![ + ColumnDef { + name: "id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "owner".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + primary_key: vec!["id".into()], + })) + .await + .expect("users"); + db.commit(Mutation::Ddl(DdlOperation::CreateTable { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "profiles".into(), + owner_id: None, + if_not_exists: false, + columns: vec![ + ColumnDef { + name: "user_id".into(), + col_type: 
ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "owner".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + primary_key: vec!["user_id".into()], + })) + .await + .expect("profiles"); + + for (id, owner) in [(1, "reader"), (2, "bob")] { + db.commit(Mutation::Upsert { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + primary_key: vec![Value::Integer(id)], + row: Row { + values: vec![Value::Integer(id), Value::Text(owner.into())], + }, + }) + .await + .expect("seed user"); + db.commit(Mutation::Upsert { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "profiles".into(), + primary_key: vec![Value::Integer(id)], + row: Row { + values: vec![Value::Integer(id), Value::Text(owner.into())], + }, + }) + .await + .expect("seed profile"); + } + + db.commit(Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: "reader".into(), + permission: Permission::TableRead { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + }, + actor_id: None, + delegable: false, + })) + .await + .expect("grant users read"); + db.commit(Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: "reader".into(), + permission: Permission::TableRead { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "profiles".into(), + }, + actor_id: None, + delegable: false, + })) + .await + .expect("grant profiles read"); + db.commit(Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: "reader".into(), + permission: Permission::PolicyBypass { + project_id: "p".into(), + table_name: None, + }, + actor_id: None, + delegable: false, + })) + .await + .expect("grant project policy bypass"); + db.set_read_policy( + "p", + "app", + "users", + Expr::Eq("owner".into(), Value::Text("$caller_id".into())), + ) + .await + .expect("set users policy"); + db.set_read_policy( + "p", + "app", + "profiles", + Expr::Eq("owner".into(), Value::Text("$caller_id".into())), + ) + .await + 
.expect("set profiles policy"); + + let reader = CallerContext::new("reader"); + let result = db + .query_with_options_as( + Some(&reader), + "p", + "app", + Query::select(&["*"]) + .from("users") + .alias("u") + .inner_join("profiles", "u.id", "user_id") + .with_last_join_alias("pr") + .limit(10), + QueryOptions::default(), + ) + .await + .expect("query with project policy bypass"); + assert_eq!(result.rows.len(), 2); +} + +#[tokio::test] +async fn join_query_rejects_duplicate_aliases_to_prevent_policy_binding_ambiguity() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + db.commit(Mutation::Ddl(DdlOperation::CreateTable { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + owner_id: None, + if_not_exists: false, + columns: vec![ + ColumnDef { + name: "id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "owner".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + primary_key: vec!["id".into()], + })) + .await + .expect("users"); + db.commit(Mutation::Ddl(DdlOperation::CreateTable { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "profiles".into(), + owner_id: None, + if_not_exists: false, + columns: vec![ + ColumnDef { + name: "user_id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "owner".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + primary_key: vec!["user_id".into()], + })) + .await + .expect("profiles"); + + db.commit(Mutation::Upsert { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + primary_key: vec![Value::Integer(1)], + row: Row { + values: vec![Value::Integer(1), Value::Text("reader".into())], + }, + }) + .await + .expect("seed user"); + db.commit(Mutation::Upsert { + project_id: "p".into(), + scope_id: "app".into(), + table_name: 
"profiles".into(), + primary_key: vec![Value::Integer(1)], + row: Row { + values: vec![Value::Integer(1), Value::Text("bob".into())], + }, + }) + .await + .expect("seed profile"); + + db.commit(Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: "reader".into(), + permission: Permission::TableRead { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + }, + actor_id: None, + delegable: false, + })) + .await + .expect("grant users read"); + db.commit(Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: "reader".into(), + permission: Permission::TableRead { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "profiles".into(), + }, + actor_id: None, + delegable: false, + })) + .await + .expect("grant profiles read"); + db.set_read_policy( + "p", + "app", + "profiles", + Expr::Eq("owner".into(), Value::Text("$caller_id".into())), + ) + .await + .expect("set policy"); + + let reader = CallerContext::new("reader"); + let err = db + .query_with_options_as( + Some(&reader), + "p", + "app", + Query::select(&["*"]) + .from("users") + .alias("u") + .inner_join("profiles", "u.id", "user_id") + .with_last_join_alias("u"), + QueryOptions::default(), + ) + .await + .expect_err("duplicate aliases should be rejected"); + assert!(matches!(err, QueryError::InvalidQuery { .. })); +} + #[tokio::test] async fn production_profile_requires_hmac() { let dir = tempdir().expect("temp"); @@ -2110,6 +2474,28 @@ async fn secure_profile_requires_hardened_storage_settings() { AedbInstance::open_secure(hardened, dir.path()).expect("open secure"); } +#[tokio::test] +async fn secure_profile_rejects_batch_durability() { + let dir = tempdir().expect("temp"); + let mut weak = AedbConfig::production([9u8; 32]); + weak.durability_mode = DurabilityMode::Batch; + let err = AedbInstance::open_secure(weak, dir.path()) + .err() + .expect("secure profile must reject batch durability"); + assert!(matches!(err, AedbError::InvalidConfig { .. 
})); +} + +#[tokio::test] +async fn production_profile_rejects_batch_durability() { + let dir = tempdir().expect("temp"); + let mut weak = AedbConfig::production([7u8; 32]); + weak.durability_mode = DurabilityMode::Batch; + let err = AedbInstance::open_production(weak, dir.path()) + .err() + .expect("production profile must reject batch durability"); + assert!(matches!(err, AedbError::InvalidConfig { .. })); +} + #[tokio::test] async fn secure_profile_rejects_short_hmac_key() { let dir = tempdir().expect("temp"); @@ -2142,6 +2528,19 @@ fn low_latency_profile_uses_batch_durability_with_strict_recovery() { assert!(cfg.manifest_hmac_key.is_some()); } +#[test] +fn checkpoint_compression_level_is_validated() { + let mut cfg = AedbConfig::default(); + cfg.checkpoint_compression_level = 23; + let err = crate::lib_helpers::validate_config(&cfg) + .err() + .expect("out-of-range compression level must be rejected"); + assert!(matches!(err, AedbError::InvalidConfig { .. })); + + cfg.checkpoint_compression_level = 1; + crate::lib_helpers::validate_config(&cfg).expect("valid compression level"); +} + #[tokio::test] async fn secure_mode_requires_authenticated_apis() { let dir = tempdir().expect("temp"); @@ -2962,6 +3361,115 @@ async fn lifecycle_hooks_receive_post_commit_events_for_applied_ddl() { )); } +#[tokio::test] +async fn lifecycle_outbox_persists_applied_events() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + let created = db + .commit_ddl(DdlOperation::CreateProject { + owner_id: None, + project_id: "arcana".into(), + if_not_exists: true, + }) + .await + .expect("create project"); + + let outbox = db + .query_no_auth( + "_system", + "app", + Query::select(&["event_count", "events"]) + .from("lifecycle_outbox") + .where_(Expr::Eq( + "commit_seq".into(), + Value::Integer(created.seq as i64), + )) + .limit(1), + QueryOptions::default(), + ) + .await + .expect("query lifecycle outbox"); + 
assert_eq!(outbox.rows.len(), 1, "expected lifecycle outbox row"); + assert_eq!(outbox.rows[0].values[0], Value::Integer(1)); + let Value::Json(payload) = &outbox.rows[0].values[1] else { + panic!("expected json payload"); + }; + let events: Vec = + serde_json::from_str(payload.as_str()).expect("decode lifecycle payload"); + assert!(matches!( + events.first(), + Some(LifecycleEvent::ProjectCreated { project_id, seq }) + if project_id == "arcana" && *seq == created.seq + )); +} + +#[tokio::test] +async fn idempotency_prunes_by_commit_window() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open( + AedbConfig { + idempotency_window_commits: 1, + ..AedbConfig::default() + }, + dir.path(), + ) + .expect("open"); + db.create_project("p").await.expect("project"); + + let first = db + .commit_envelope(TransactionEnvelope { + caller: None, + idempotency_key: Some(IdempotencyKey([1u8; 16])), + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: crate::commit::tx::ReadSet::default(), + write_intent: WriteIntent { + mutations: vec![Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"k".to_vec(), + value: b"v1".to_vec(), + }], + }, + base_seq: 0, + }) + .await + .expect("first commit"); + + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"other".to_vec(), + value: b"v2".to_vec(), + }) + .await + .expect("advance seq"); + + let retried = db + .commit_envelope(TransactionEnvelope { + caller: None, + idempotency_key: Some(IdempotencyKey([1u8; 16])), + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: crate::commit::tx::ReadSet::default(), + write_intent: WriteIntent { + mutations: vec![Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"k".to_vec(), + value: b"v3".to_vec(), + }], + }, + base_seq: 0, + }) + .await + .expect("retried commit"); + assert!( + retried.commit_seq > first.commit_seq, + "idempotency record should expire 
by sequence window" + ); +} + #[test] fn open_rejects_invalid_config() { let dir = tempdir().expect("temp"); @@ -3287,6 +3795,190 @@ async fn order_book_new_with_durable_finality_waits_until_durable_head_catches_u ); } +#[tokio::test] +async fn order_book_new_fok_reject_is_dropped_before_wal_append() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + let before = db.operational_metrics().await; + + let err = db + .order_book_new( + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "fok-no-liq-1".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Fok, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect_err("unfillable FOK should reject upstream"); + let after = db.operational_metrics().await; + + match err { + AedbError::Validation(msg) => assert!( + msg.contains("fok cannot fill"), + "unexpected validation message: {msg}" + ), + other => panic!("unexpected error variant: {other:?}"), + } + assert_eq!( + after.wal_append_ops, before.wal_append_ops, + "upstream dropped rejects should not append WAL frames" + ); + assert_eq!( + after.wal_append_bytes, before.wal_append_bytes, + "upstream dropped rejects should not increase WAL append bytes" + ); +} + +#[tokio::test] +#[ignore = "long-running finality latency profile"] +async fn finality_profile_visible_vs_durable_low_latency_mode() { + async fn run_profile( + config: AedbConfig, + finality: CommitFinality, + ops: usize, + ) -> (u64, u64, u64, crate::OperationalMetrics) { + let dir = 
tempdir().expect("temp"); + let db = AedbInstance::open(config, dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + let started = Instant::now(); + let mut lat_sum = 0u128; + let mut lat_max = 0u64; + for i in 0..ops { + let op_started = Instant::now(); + db.commit_with_finality( + Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("finality:{finality:?}:{i}").into_bytes(), + value: i.to_be_bytes().to_vec(), + }, + finality, + ) + .await + .expect("commit with finality"); + let us = op_started.elapsed().as_micros() as u64; + lat_sum = lat_sum.saturating_add(us as u128); + lat_max = lat_max.max(us); + } + db.force_fsync().await.expect("flush"); + let elapsed = started.elapsed().as_secs_f64().max(0.001); + let tps = (ops as f64 / elapsed) as u64; + let avg_us = (lat_sum / ops.max(1) as u128) as u64; + let op = db.operational_metrics().await; + (tps, avg_us, lat_max, op) + } + + let ops = std::env::var("AEDB_FINALITY_PROFILE_OPS") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(600) + .max(200); + + let mut low_latency_no_coalesce = AedbConfig::low_latency([1u8; 32]); + low_latency_no_coalesce.durable_ack_coalescing_enabled = false; + low_latency_no_coalesce.durable_ack_coalesce_window_us = 0; + let low_latency_coalesce = AedbConfig::low_latency([1u8; 32]); + + let (visible_tps, visible_avg_us, visible_max_us, visible_op) = run_profile( + low_latency_no_coalesce.clone(), + CommitFinality::Visible, + ops, + ) + .await; + let (durable_base_tps, durable_base_avg_us, durable_base_max_us, durable_base_op) = + run_profile(low_latency_no_coalesce, CommitFinality::Durable, ops).await; + let (durable_tps, durable_avg_us, durable_max_us, durable_op) = + run_profile(low_latency_coalesce, CommitFinality::Durable, ops).await; + + eprintln!( + "finality_profile: ops={} visible_tps={} durable_base_tps={} durable_coalesced_tps={} visible_avg_us={} durable_base_avg_us={} durable_coalesced_avg_us={} visible_max_us={} 
durable_base_max_us={} durable_coalesced_max_us={} visible_durable_wait_ops={} durable_base_wait_ops={} durable_coalesced_wait_ops={} visible_avg_durable_wait_us={} durable_base_avg_durable_wait_us={} durable_coalesced_avg_durable_wait_us={} visible_wal_sync_ops={} durable_base_wal_sync_ops={} durable_coalesced_wal_sync_ops={} visible_avg_wal_sync_us={} durable_base_avg_wal_sync_us={} durable_coalesced_avg_wal_sync_us={} visible_avg_wal_append_us={} durable_base_avg_wal_append_us={} durable_coalesced_avg_wal_append_us={}", + ops, + visible_tps, + durable_base_tps, + durable_tps, + visible_avg_us, + durable_base_avg_us, + durable_avg_us, + visible_max_us, + durable_base_max_us, + durable_max_us, + visible_op.durable_wait_ops, + durable_base_op.durable_wait_ops, + durable_op.durable_wait_ops, + visible_op.avg_durable_wait_micros, + durable_base_op.avg_durable_wait_micros, + durable_op.avg_durable_wait_micros, + visible_op.wal_sync_ops, + durable_base_op.wal_sync_ops, + durable_op.wal_sync_ops, + visible_op.avg_wal_sync_micros, + durable_base_op.avg_wal_sync_micros, + durable_op.avg_wal_sync_micros, + visible_op.avg_wal_append_micros, + durable_base_op.avg_wal_append_micros, + durable_op.avg_wal_append_micros + ); + + assert_eq!( + visible_op.queue_full_rejections, 0, + "visible finality profile should not saturate queue" + ); + assert_eq!( + durable_base_op.queue_full_rejections, 0, + "durable baseline profile should not saturate queue" + ); + assert_eq!( + durable_op.queue_full_rejections, 0, + "durable coalesced profile should not saturate queue" + ); + assert_eq!( + visible_op.timeout_rejections, 0, + "visible finality profile should not timeout" + ); + assert_eq!( + durable_base_op.timeout_rejections, 0, + "durable baseline profile should not timeout" + ); + assert_eq!( + durable_op.timeout_rejections, 0, + "durable coalesced profile should not timeout" + ); + assert_eq!( + visible_op.durable_wait_ops, 0, + "visible finality profile should not accumulate durable 
wait operations" + ); + assert!( + durable_op.durable_wait_ops > 0, + "durable finality profile should accumulate durable wait operations" + ); + assert!( + durable_tps >= durable_base_tps.saturating_div(2), + "coalesced durable finality regressed severely: base={durable_base_tps} coalesced={durable_tps}" + ); + assert!( + durable_tps <= visible_tps.saturating_mul(2), + "durable finality profile produced implausible TPS vs visible: visible={visible_tps} durable={durable_tps}" + ); +} + #[tokio::test] async fn order_book_write_requires_authenticated_caller_in_secure_mode() { let dir = tempdir().expect("temp"); @@ -3383,9 +4075,83 @@ async fn secure_mode_supports_order_book_writes_via_authenticated_as_apis() { let status = db .order_status("p", "app", "BTC-USD", 1, ConsistencyMode::AtLatest, &alice) .await - .expect("status query") - .expect("order exists"); - assert_eq!(status.status, crate::order_book::OrderStatus::Cancelled); + .expect("status query") + .expect("order exists"); + assert_eq!(status.status, crate::order_book::OrderStatus::Cancelled); +} + +#[tokio::test] +async fn open_orders_requires_kv_read_permission() { + let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + db.order_book_new( + "p", + "app", + crate::order_book::OrderRequest { + instrument: "BTC-USD".into(), + client_order_id: "cid-open-orders-1".into(), + side: crate::order_book::OrderSide::Bid, + order_type: crate::order_book::OrderType::Limit, + time_in_force: crate::order_book::TimeInForce::Gtc, + exec_instructions: crate::order_book::ExecInstruction(0), + self_trade_prevention: crate::order_book::SelfTradePrevention::None, + price_ticks: 100, + qty_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + owner: "alice".into(), + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect("place order"); + + let alice = CallerContext::new("alice"); + 
let denied = db + .open_orders( + "p", + "app", + "BTC-USD", + "alice", + ConsistencyMode::AtLatest, + &alice, + ) + .await + .expect_err("missing KvRead should be denied"); + assert!(matches!(denied, QueryError::PermissionDenied { .. })); + + db.commit_ddl(DdlOperation::GrantPermission { + caller_id: "alice".into(), + permission: Permission::KvRead { + project_id: "p".into(), + scope_id: Some("app".into()), + prefix: Some(b"ob:BTC-USD:".to_vec()), + }, + actor_id: None, + delegable: false, + }) + .await + .expect("grant kv read"); + + let open = db + .open_orders( + "p", + "app", + "BTC-USD", + "alice", + ConsistencyMode::AtLatest, + &alice, + ) + .await + .expect("open orders"); + assert_eq!(open.len(), 1); + assert_eq!(open[0].owner, "alice"); } #[tokio::test] @@ -3891,6 +4657,117 @@ async fn idempotent_retry_does_not_double_apply_non_idempotent_mutation() { ); } +#[tokio::test] +async fn retry_idempotency_is_exactly_once_under_commit_pressure() { + let dir = tempdir().expect("temp"); + let config = AedbConfig { + commit_timeout_ms: 1, + ..AedbConfig::default() + }; + let db = AedbInstance::open(config, dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let key = IdempotencyKey([91u8; 16]); + let envelope = TransactionEnvelope { + caller: None, + idempotency_key: Some(key), + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: Default::default(), + write_intent: WriteIntent { + mutations: vec![ + Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"__slow_parallel_worker__".to_vec(), + value: b"slow".to_vec(), + }, + Mutation::KvIncU256 { + project_id: "p".into(), + scope_id: "app".into(), + key: b"timeout-idem-counter".to_vec(), + amount_be: { + let mut out = [0u8; 32]; + out[31] = 1; + out + }, + }, + ], + }, + base_seq: 0, + }; + + let noisy_db = Arc::new(db); + let mut noise_tasks = Vec::new(); + for worker in 0..8 { + let db_clone = Arc::clone(&noisy_db); + 
noise_tasks.push(tokio::spawn(async move { + for i in 0..200usize { + let _ = db_clone + .commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("noise:{worker}:{i}").into_bytes(), + value: b"n".to_vec(), + }) + .await; + } + })); + } + + let mut saw_timeout = false; + let second = loop { + match noisy_db.commit_envelope(envelope.clone()).await { + Ok(result) => break result, + Err(AedbError::Timeout) => { + saw_timeout = true; + tokio::time::sleep(Duration::from_millis(5)).await; + } + Err(other) => panic!("unexpected error during retry loop: {other:?}"), + } + }; + for t in noise_tasks { + t.await.expect("join noise worker"); + } + noisy_db + .wait_for_durable(second.commit_seq) + .await + .expect("durable ack"); + + let third = noisy_db + .commit_envelope(envelope) + .await + .expect("repeat idempotent retry"); + assert_eq!( + third.commit_seq, second.commit_seq, + "all retries must resolve to one commit sequence" + ); + + let counter = noisy_db + .kv_get_no_auth( + "p", + "app", + b"timeout-idem-counter", + ConsistencyMode::AtLatest, + ) + .await + .expect("kv counter") + .expect("counter exists"); + assert_eq!( + primitive_types::U256::from_big_endian(&counter.value), + primitive_types::U256::one(), + "counter must be applied once" + ); + + let op = noisy_db.operational_metrics().await; + if saw_timeout { + assert!( + op.timeout_rejections >= 1, + "timeout path should be observable in operational metrics" + ); + } +} + #[tokio::test] async fn strict_cancel_rejects_missing_order() { let dir = tempdir().expect("temp"); @@ -4354,127 +5231,847 @@ async fn multi_update_transaction_envelope_updates_table_and_kv() { ConsistencyMode::AtLatest, &caller, ) - .await - .expect("ledger entry"); - assert_eq!( - tx_last.as_ref().map(|v| v.value.clone()), - Some(b"t1".to_vec()) - ); - assert_eq!( - ledger.as_ref().map(|v| v.value.clone()), - Some(b"10".to_vec()) - ); + .await + .expect("ledger entry"); + assert_eq!( + 
tx_last.as_ref().map(|v| v.value.clone()), + Some(b"t1".to_vec()) + ); + assert_eq!( + ledger.as_ref().map(|v| v.value.clone()), + Some(b"10".to_vec()) + ); +} + +#[tokio::test] +async fn checkpoint_now_allows_commits_while_running() { + let dir = tempdir().expect("temp"); + let db = Arc::new(AedbInstance::open(AedbConfig::default(), dir.path()).expect("open")); + db.create_project("p").await.expect("project"); + + // Build enough state to make checkpoint work measurable. + for i in 0..2_000u32 { + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("seed:{i:06}").into_bytes(), + value: vec![b'x'; 1024], + }) + .await + .expect("seed write"); + } + + let checkpoint_db = Arc::clone(&db); + let checkpoint_task = tokio::spawn(async move { checkpoint_db.checkpoint_now().await }); + + // Wait briefly for checkpoint to begin. + for _ in 0..10 { + if !checkpoint_task.is_finished() { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(1)).await; + } + assert!( + !checkpoint_task.is_finished(), + "checkpoint should still be running" + ); + + let commit = db + .commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"concurrent:write".to_vec(), + value: b"ok".to_vec(), + }) + .await + .expect("commit should proceed during checkpoint"); + let checkpoint_seq = checkpoint_task + .await + .expect("checkpoint join") + .expect("checkpoint"); + assert!(commit.commit_seq >= checkpoint_seq); +} + +#[tokio::test] +async fn checkpoint_now_serializes_checkpoint_writers() { + let dir = tempdir().expect("temp"); + let db = Arc::new(AedbInstance::open(AedbConfig::default(), dir.path()).expect("open")); + db.create_project("p").await.expect("project"); + + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"k".to_vec(), + value: b"v".to_vec(), + }) + .await + .expect("seed"); + + let db1 = Arc::clone(&db); + let db2 = Arc::clone(&db); + let t1 = tokio::spawn(async move 
{ db1.checkpoint_now().await }); + let t2 = tokio::spawn(async move { db2.checkpoint_now().await }); + + let s1 = t1.await.expect("checkpoint task 1").expect("checkpoint 1"); + let s2 = t2.await.expect("checkpoint task 2").expect("checkpoint 2"); + assert_eq!(s1, s2); +} + +#[tokio::test] +async fn checkpoint_captures_transaction_all_or_none() { + let dir = tempdir().expect("temp"); + let config = AedbConfig::default(); + let db = AedbInstance::open(config.clone(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let base_seq = db + .snapshot_probe(ConsistencyMode::AtLatest) + .await + .expect("base"); + let commit = db + .commit_envelope(TransactionEnvelope { + caller: None, + idempotency_key: None, + write_class: WriteClass::Standard, + assertions: Vec::new(), + read_set: Default::default(), + write_intent: WriteIntent { + mutations: vec![ + Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"tx:part:a".to_vec(), + value: b"1".to_vec(), + }, + Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: b"tx:part:b".to_vec(), + value: b"2".to_vec(), + }, + ], + }, + base_seq, + }) + .await + .expect("atomic multi-mutation commit"); + let checkpoint_seq = db.checkpoint_now().await.expect("checkpoint"); + assert!(checkpoint_seq >= commit.commit_seq); + + let recovered_at_cp = + crate::recovery::recover_at_seq_with_config(dir.path(), checkpoint_seq, &config) + .expect("recover at checkpoint"); + let snapshot_at_cp = recovered_at_cp.keyspace.snapshot(); + assert_eq!( + snapshot_at_cp + .kv_get("p", "app", b"tx:part:a") + .map(|entry| entry.value.clone()), + Some(b"1".to_vec()) + ); + assert_eq!( + snapshot_at_cp + .kv_get("p", "app", b"tx:part:b") + .map(|entry| entry.value.clone()), + Some(b"2".to_vec()) + ); + + if commit.commit_seq > 0 { + let recovered_before = + crate::recovery::recover_at_seq_with_config(dir.path(), commit.commit_seq - 1, &config) + .expect("recover before commit seq"); 
+ let before_snapshot = recovered_before.keyspace.snapshot(); + assert!(before_snapshot.kv_get("p", "app", b"tx:part:a").is_none()); + assert!(before_snapshot.kv_get("p", "app", b"tx:part:b").is_none()); + } +} + +#[tokio::test] +async fn checkpoint_manifest_trims_fully_covered_segments() { + let dir = tempdir().expect("temp"); + let config = AedbConfig { + max_segment_bytes: 4096, + ..AedbConfig::default() + }; + let db = AedbInstance::open(config.clone(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + for i in 0..400u32 { + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("seg:{i:04}").into_bytes(), + value: vec![b'x'; 256], + }) + .await + .expect("seed"); + } + + let all_segments = crate::lib_helpers::read_segments(dir.path()).expect("all segments"); + assert!( + all_segments.len() > 1, + "test must create multiple wal segments" + ); + + let checkpoint_seq = db.checkpoint_now().await.expect("checkpoint"); + let manifest = crate::manifest::atomic::load_manifest_signed(dir.path(), config.hmac_key()) + .expect("manifest"); + + assert!( + manifest.segments.len() < all_segments.len(), + "checkpoint manifest should drop fully covered historical segments" + ); + for segment in &manifest.segments { + let path = dir.path().join(&segment.filename); + let range = crate::lib_helpers::scan_segment_seq_range(&path).expect("scan segment"); + if let Some((_, max_seq)) = range { + assert!( + max_seq > checkpoint_seq || segment.segment_seq == manifest.active_segment_seq, + "manifest retained a segment fully covered by checkpoint" + ); + } + } +} + +#[tokio::test] +#[ignore = "manual perf probe: commit latency with and without concurrent checkpoint"] +async fn benchmark_commit_latency_during_checkpoint() { + fn percentile(sorted: &[u128], p: f64) -> u128 { + if sorted.is_empty() { + return 0; + } + let percentile_index = ((sorted.len() as f64 - 1.0) * p).round() as usize; + 
sorted[percentile_index.min(sorted.len() - 1)] + } + + async fn run_phase( + db: &Arc, + start: usize, + count: usize, + with_checkpoint: bool, + ) -> (f64, u128, u128) { + for i in 0..10_000usize { + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("warmup:{i:05}").into_bytes(), + value: vec![b'w'; 256], + }) + .await + .expect("warmup seed"); + } + + let checkpoint_task = if with_checkpoint { + Some(tokio::spawn({ + let db = Arc::clone(db); + async move { db.checkpoint_now().await } + })) + } else { + None + }; + + let phase_started = Instant::now(); + let mut latencies_us = Vec::with_capacity(count); + for i in 0..count { + let started = Instant::now(); + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!( + "bench:{}:{:06}", + if with_checkpoint { "cp" } else { "base" }, + start + i + ) + .into_bytes(), + value: vec![b'x'; 512], + }) + .await + .expect("bench commit"); + latencies_us.push(started.elapsed().as_micros()); + } + + if let Some(task) = checkpoint_task { + let _ = task + .await + .expect("checkpoint task join") + .expect("checkpoint"); + } + + latencies_us.sort_unstable(); + let elapsed_secs = phase_started.elapsed().as_secs_f64().max(0.000_001); + let tps = count as f64 / elapsed_secs; + let p50_us = percentile(&latencies_us, 0.50); + let p99_us = percentile(&latencies_us, 0.99); + (tps, p50_us, p99_us) + } + + let dir = tempdir().expect("temp"); + let config = AedbConfig::default(); + let db = Arc::new(AedbInstance::open(config, dir.path()).expect("open")); + db.create_project("p").await.expect("project"); + + let (base_tps, base_p50, base_p99) = run_phase(&db, 0, 800, false).await; + let (cp_tps, cp_p50, cp_p99) = run_phase(&db, 1_000_000, 800, true).await; + + eprintln!( + "checkpoint_perf: base_tps={:.2} base_p50_us={} base_p99_us={} | cp_tps={:.2} cp_p50_us={} cp_p99_us={} | tps_ratio={:.3} p50_ratio={:.3} p99_ratio={:.3}", + base_tps, + base_p50, + 
base_p99, + cp_tps, + cp_p50, + cp_p99, + cp_tps / base_tps.max(0.000_001), + (cp_p50 as f64) / (base_p50.max(1) as f64), + (cp_p99 as f64) / (base_p99.max(1) as f64), + ); +} + +#[tokio::test] +#[ignore = "manual perf probe: parallel commit throughput with and without concurrent checkpoint"] +async fn benchmark_parallel_commit_throughput_during_checkpoint() { + fn percentile(sorted: &[u128], p: f64) -> u128 { + if sorted.is_empty() { + return 0; + } + let percentile_index = ((sorted.len() as f64 - 1.0) * p).round() as usize; + sorted[percentile_index.min(sorted.len() - 1)] + } + + async fn run_parallel_phase( + db: &Arc, + workers: usize, + commits_per_worker: usize, + with_checkpoint: bool, + offset: usize, + ) -> (f64, u128, u128) { + let checkpoint_task = if with_checkpoint { + Some(tokio::spawn({ + let db = Arc::clone(db); + async move { db.checkpoint_now().await } + })) + } else { + None + }; + + let phase_started = Instant::now(); + let mut tasks = Vec::with_capacity(workers); + for worker in 0..workers { + let db = Arc::clone(db); + tasks.push(tokio::spawn(async move { + let mut latencies = Vec::with_capacity(commits_per_worker); + for i in 0..commits_per_worker { + let started = Instant::now(); + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!( + "par:{}:{:02}:{:06}", + if with_checkpoint { "cp" } else { "base" }, + worker, + offset + i + ) + .into_bytes(), + value: vec![b'p'; 256], + }) + .await + .expect("parallel bench commit"); + latencies.push(started.elapsed().as_micros()); + } + latencies + })); + } + + let mut all_latencies = Vec::with_capacity(workers * commits_per_worker); + for task in tasks { + let mut worker_latencies = task.await.expect("worker join"); + all_latencies.append(&mut worker_latencies); + } + + if let Some(task) = checkpoint_task { + let _ = task + .await + .expect("checkpoint task join") + .expect("checkpoint"); + } + + all_latencies.sort_unstable(); + let elapsed_secs = 
phase_started.elapsed().as_secs_f64().max(0.000_001); + let total = workers * commits_per_worker; + let tps = total as f64 / elapsed_secs; + let p50_us = percentile(&all_latencies, 0.50); + let p99_us = percentile(&all_latencies, 0.99); + (tps, p50_us, p99_us) + } + + let dir = tempdir().expect("temp"); + let mut config = AedbConfig { + durability_mode: DurabilityMode::Batch, + batch_interval_ms: 10, + batch_max_bytes: usize::MAX, + recovery_mode: RecoveryMode::Permissive, + hash_chain_required: false, + ..AedbConfig::default() + }; + config.manifest_hmac_key = None; + let db = Arc::new(AedbInstance::open(config, dir.path()).expect("open")); + db.create_project("p").await.expect("project"); + + // Seed state so the checkpoint has meaningful work. + for i in 0..12_000usize { + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("parallel-seed:{i:05}").into_bytes(), + value: vec![b's'; 256], + }) + .await + .expect("seed"); + } + + let workers = 8usize; + let commits_per_worker = 300usize; + let (base_tps, base_p50, base_p99) = + run_parallel_phase(&db, workers, commits_per_worker, false, 0).await; + let (cp_tps, cp_p50, cp_p99) = + run_parallel_phase(&db, workers, commits_per_worker, true, 1_000_000).await; + + eprintln!( + "checkpoint_parallel_perf: workers={} commits_per_worker={} | base_tps={:.2} base_p50_us={} base_p99_us={} | cp_tps={:.2} cp_p50_us={} cp_p99_us={} | tps_ratio={:.3} p50_ratio={:.3} p99_ratio={:.3}", + workers, + commits_per_worker, + base_tps, + base_p50, + base_p99, + cp_tps, + cp_p50, + cp_p99, + cp_tps / base_tps.max(0.000_001), + (cp_p50 as f64) / (base_p50.max(1) as f64), + (cp_p99 as f64) / (base_p99.max(1) as f64), + ); +} + +#[tokio::test] +#[ignore = "manual perf probe: compare checkpoint compression levels under parallel load"] +async fn benchmark_parallel_checkpoint_compression_levels() { + async fn seed(db: &Arc) { + for i in 0..12_000usize { + db.commit(Mutation::KvSet { + project_id: 
"p".into(), + scope_id: "app".into(), + key: format!("parallel-seed:{i:05}").into_bytes(), + value: vec![b's'; 256], + }) + .await + .expect("seed"); + } + } + + async fn run_parallel_phase( + db: &Arc, + workers: usize, + commits_per_worker: usize, + with_checkpoint: bool, + offset: usize, + ) -> f64 { + let checkpoint_task = if with_checkpoint { + Some(tokio::spawn({ + let db = Arc::clone(db); + async move { db.checkpoint_now().await } + })) + } else { + None + }; + + let phase_started = Instant::now(); + let mut tasks = Vec::with_capacity(workers); + for worker in 0..workers { + let db = Arc::clone(db); + tasks.push(tokio::spawn(async move { + for i in 0..commits_per_worker { + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!( + "cmp:{}:{:02}:{:06}", + if with_checkpoint { "cp" } else { "base" }, + worker, + offset + i + ) + .into_bytes(), + value: vec![b'c'; 256], + }) + .await + .expect("parallel bench commit"); + } + })); + } + for task in tasks { + task.await.expect("worker join"); + } + if let Some(task) = checkpoint_task { + let _ = task + .await + .expect("checkpoint task join") + .expect("checkpoint"); + } + + let elapsed_secs = phase_started.elapsed().as_secs_f64().max(0.000_001); + let total = workers * commits_per_worker; + total as f64 / elapsed_secs + } + + let workers = 8usize; + let commits_per_worker = 300usize; + let levels = [3, 1, 0]; + + let mut baseline_tps = 0.0f64; + for (idx, level) in levels.iter().copied().enumerate() { + let dir = tempdir().expect("temp"); + let mut config = AedbConfig { + durability_mode: DurabilityMode::Batch, + batch_interval_ms: 10, + batch_max_bytes: usize::MAX, + recovery_mode: RecoveryMode::Permissive, + hash_chain_required: false, + ..AedbConfig::default() + }; + config.manifest_hmac_key = None; + config.checkpoint_compression_level = level; + let db = Arc::new(AedbInstance::open(config, dir.path()).expect("open")); + db.create_project("p").await.expect("project"); 
+ seed(&db).await; + + let base = + run_parallel_phase(&db, workers, commits_per_worker, false, idx * 1_000_000).await; + let cp = run_parallel_phase( + &db, + workers, + commits_per_worker, + true, + idx * 1_000_000 + 500_000, + ) + .await; + if idx == 0 { + baseline_tps = base; + } + + eprintln!( + "checkpoint_compression_perf: level={} workers={} commits_per_worker={} base_tps={:.2} cp_tps={:.2} cp_to_base={:.3} cp_to_level3_base={:.3}", + level, + workers, + commits_per_worker, + base, + cp, + cp / base.max(0.000_001), + cp / baseline_tps.max(0.000_001), + ); + } } #[tokio::test] -async fn checkpoint_now_waits_for_active_writes() { - let dir = tempdir().expect("temp"); - let db = Arc::new(AedbInstance::open(AedbConfig::default(), dir.path()).expect("open")); - db.create_project("p").await.expect("project"); +#[ignore = "manual perf probe: durability knob sweep (batch/coalescing)"] +async fn benchmark_durability_knob_sweep() { + fn percentile(sorted: &[u128], p: f64) -> u128 { + if sorted.is_empty() { + return 0; + } + let percentile_index = ((sorted.len() as f64 - 1.0) * p).round() as usize; + sorted[percentile_index.min(sorted.len() - 1)] + } - // Simulate an active operation by holding a semaphore permit - let _permit = db.checkpoint_gate.acquire().await.unwrap(); - let checkpoint_db = Arc::clone(&db); - let checkpoint_task = tokio::spawn(async move { checkpoint_db.checkpoint_now().await }); + #[derive(Clone)] + struct Profile { + name: &'static str, + batch_interval_ms: u64, + batch_max_bytes: usize, + coalesce_enabled: bool, + coalesce_window_us: u64, + } - tokio::time::sleep(std::time::Duration::from_millis(20)).await; - assert!( - !checkpoint_task.is_finished(), - "checkpoint should wait while operations are active" - ); + async fn run_profile(profile: &Profile) -> (f64, u128, u128, u64, u64) { + let dir = tempdir().expect("temp"); + let mut config = AedbConfig { + durability_mode: DurabilityMode::Batch, + batch_interval_ms: profile.batch_interval_ms, + 
batch_max_bytes: profile.batch_max_bytes, + recovery_mode: RecoveryMode::Permissive, + hash_chain_required: false, + durable_ack_coalescing_enabled: profile.coalesce_enabled, + durable_ack_coalesce_window_us: profile.coalesce_window_us, + ..AedbConfig::default() + }; + config.manifest_hmac_key = None; + let db = Arc::new(AedbInstance::open(config, dir.path()).expect("open")); + db.create_project("p").await.expect("project"); - drop(_permit); - checkpoint_task - .await - .expect("checkpoint join") - .expect("checkpoint"); -} + for i in 0..8_000usize { + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("sweep-seed:{i:05}").into_bytes(), + value: vec![b's'; 128], + }) + .await + .expect("seed"); + } -#[tokio::test] -async fn checkpoint_now_returns_structured_error_when_gate_closed() { - let dir = tempdir().expect("temp"); - let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); - db.create_project("p").await.expect("project"); + let workers = 8usize; + let commits_per_worker = 500usize; + let started = Instant::now(); + let mut tasks = Vec::with_capacity(workers); + for worker in 0..workers { + let db = Arc::clone(&db); + tasks.push(tokio::spawn(async move { + let mut lats = Vec::with_capacity(commits_per_worker); + for i in 0..commits_per_worker { + let t0 = Instant::now(); + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("sweep:{worker:02}:{i:06}").into_bytes(), + value: vec![b'x'; 256], + }) + .await + .expect("commit"); + lats.push(t0.elapsed().as_micros()); + } + lats + })); + } - db.checkpoint_gate.close(); - let err = db - .checkpoint_now() - .await - .expect_err("closed gate should return error"); - assert!(matches!(err, AedbError::Unavailable { .. 
})); - assert!( - !db.checkpoint_in_progress.load(Ordering::Acquire), - "checkpoint flag should be reset on error" - ); + let mut all_lat = Vec::with_capacity(workers * commits_per_worker); + for task in tasks { + let mut lats = task.await.expect("worker join"); + all_lat.append(&mut lats); + } + all_lat.sort_unstable(); + + let elapsed = started.elapsed().as_secs_f64().max(0.000_001); + let tps = (workers * commits_per_worker) as f64 / elapsed; + let p50 = percentile(&all_lat, 0.50); + let p99 = percentile(&all_lat, 0.99); + let op = db.operational_metrics().await; + (tps, p50, p99, op.wal_sync_ops, op.avg_wal_sync_micros) + } + + let profiles = vec![ + Profile { + name: "baseline_10ms_1mb_no_coalesce", + batch_interval_ms: 10, + batch_max_bytes: 1 * 1024 * 1024, + coalesce_enabled: false, + coalesce_window_us: 0, + }, + Profile { + name: "trial_20ms_4mb_coalesce_1000us", + batch_interval_ms: 20, + batch_max_bytes: 4 * 1024 * 1024, + coalesce_enabled: true, + coalesce_window_us: 1000, + }, + Profile { + name: "trial_20ms_8mb_coalesce_1500us", + batch_interval_ms: 20, + batch_max_bytes: 8 * 1024 * 1024, + coalesce_enabled: true, + coalesce_window_us: 1500, + }, + Profile { + name: "trial_40ms_8mb_coalesce_1500us", + batch_interval_ms: 40, + batch_max_bytes: 8 * 1024 * 1024, + coalesce_enabled: true, + coalesce_window_us: 1500, + }, + ]; + + for profile in &profiles { + let (tps, p50, p99, wal_sync_ops, avg_wal_sync_us) = run_profile(profile).await; + eprintln!( + "durability_sweep: profile={} tps={:.2} p50_us={} p99_us={} wal_sync_ops={} avg_wal_sync_us={}", + profile.name, tps, p50, p99, wal_sync_ops, avg_wal_sync_us + ); + } } #[tokio::test] -async fn commit_returns_structured_error_when_gate_closed() { +#[ignore = "manual profiling: end-to-end pipeline breakdown (commit/checkpoint/recovery)"] +async fn profile_end_to_end_pipeline_breakdown() { + fn percentile(sorted: &[u128], p: f64) -> u128 { + if sorted.is_empty() { + return 0; + } + let percentile_index = 
((sorted.len() as f64 - 1.0) * p).round() as usize; + sorted[percentile_index.min(sorted.len() - 1)] + } + let dir = tempdir().expect("temp"); - let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + let mut config = AedbConfig { + durability_mode: DurabilityMode::Batch, + batch_interval_ms: 10, + batch_max_bytes: usize::MAX, + recovery_mode: RecoveryMode::Permissive, + hash_chain_required: false, + ..AedbConfig::default() + }; + config.manifest_hmac_key = None; + let db = Arc::new(AedbInstance::open(config.clone(), dir.path()).expect("open")); db.create_project("p").await.expect("project"); - db.checkpoint_gate.close(); - let err = db - .commit(Mutation::KvSet { + for i in 0..20_000usize { + db.commit(Mutation::KvSet { project_id: "p".into(), scope_id: "app".into(), - key: b"k".to_vec(), - value: b"v".to_vec(), + key: format!("seed:{i:06}").into_bytes(), + value: vec![b's'; 256], }) .await - .expect_err("closed gate should fail commit"); - assert!(matches!(err, AedbError::Unavailable { .. 
})); -} - -#[tokio::test] -async fn backup_lock_blocks_new_write_submissions() { - let dir = tempdir().expect("temp"); - let db = Arc::new(AedbInstance::open(AedbConfig::default(), dir.path()).expect("open")); - db.create_project("p").await.expect("project"); + .expect("seed"); + } - // Simulate a checkpoint/backup in progress - db.checkpoint_in_progress.store(true, Ordering::Release); - let _all_permits = db - .checkpoint_gate - .acquire_many(super::CHECKPOINT_GATE_PERMITS) + let workers = 8usize; + let commits_per_worker = 500usize; + let commit_started = Instant::now(); + let mut tasks = Vec::with_capacity(workers); + for worker in 0..workers { + let db = Arc::clone(&db); + tasks.push(tokio::spawn(async move { + let mut latencies_us = Vec::with_capacity(commits_per_worker); + for i in 0..commits_per_worker { + let started = Instant::now(); + db.commit(Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("profile:commit:{worker:02}:{i:06}").into_bytes(), + value: vec![b'c'; 256], + }) + .await + .expect("commit"); + latencies_us.push(started.elapsed().as_micros()); + } + latencies_us + })); + } + let mut all_commit_latencies = Vec::with_capacity(workers * commits_per_worker); + for task in tasks { + let mut worker_latencies = task.await.expect("worker join"); + all_commit_latencies.append(&mut worker_latencies); + } + let commit_elapsed = commit_started.elapsed(); + all_commit_latencies.sort_unstable(); + let commit_tps = (workers * commits_per_worker) as f64 / commit_elapsed.as_secs_f64(); + let commit_p50 = percentile(&all_commit_latencies, 0.50); + let commit_p99 = percentile(&all_commit_latencies, 0.99); + let op_after_commits = db.operational_metrics().await; + + let checkpoint_total_started = Instant::now(); + let checkpoint_lock_started = Instant::now(); + let _checkpoint_guard = db.checkpoint_lock.lock().await; + let checkpoint_lock_wait = checkpoint_lock_started.elapsed(); + + let snapshot_started = Instant::now(); + let 
checkpoint_seq = db.executor.durable_head_seq_now(); + let lease = db + .acquire_snapshot(ConsistencyMode::AtSeq(checkpoint_seq)) .await - .unwrap(); + .expect("checkpoint snapshot"); + let snapshot_elapsed = snapshot_started.elapsed(); + + let idempotency_started = Instant::now(); + let mut idempotency = db.executor.idempotency_snapshot().await; + idempotency.retain(|_, record| record.commit_seq <= checkpoint_seq); + let idempotency_elapsed = idempotency_started.elapsed(); + + let write_checkpoint_started = Instant::now(); + let checkpoint = crate::checkpoint::writer::write_checkpoint_with_key( + lease.view.keyspace.as_ref(), + lease.view.catalog.as_ref(), + checkpoint_seq, + &db.dir, + db._config.checkpoint_key(), + db._config.checkpoint_key_id.clone(), + idempotency, + db._config.checkpoint_compression_level, + ) + .expect("write checkpoint"); + let write_checkpoint_elapsed = write_checkpoint_started.elapsed(); + + let segments_started = Instant::now(); + let segments = crate::lib_helpers::read_segments_for_checkpoint(&db.dir, checkpoint_seq) + .expect("read segments for checkpoint"); + let active_segment_seq = segments + .last() + .map(|segment| segment.segment_seq) + .unwrap_or(checkpoint_seq.saturating_add(1)); + let segments_elapsed = segments_started.elapsed(); + + let manifest_started = Instant::now(); + let manifest = crate::manifest::schema::Manifest { + durable_seq: checkpoint_seq, + visible_seq: checkpoint_seq, + active_segment_seq, + checkpoints: vec![checkpoint.clone()], + segments: segments.clone(), + }; + crate::manifest::atomic::write_manifest_atomic_signed( + &manifest, + &db.dir, + db._config.hmac_key(), + ) + .expect("write manifest"); + let manifest_elapsed = manifest_started.elapsed(); + let checkpoint_total_elapsed = checkpoint_total_started.elapsed(); + drop(_checkpoint_guard); - // Commit should fail fast with checkpoint error (better than blocking) - let result = db - .commit(Mutation::KvSet { - project_id: "p".into(), - scope_id: 
"app".into(), - key: b"blocked".to_vec(), - value: b"1".to_vec(), - }) - .await; + let checkpoint_bytes = std::fs::metadata(db.dir.join(&checkpoint.filename)) + .expect("checkpoint stat") + .len(); - assert!( - result.is_err(), - "write should be rejected while checkpoint is in progress" + drop(db); + let reopen_started = Instant::now(); + let reopened = AedbInstance::open(config, dir.path()).expect("reopen"); + let reopen_elapsed = reopen_started.elapsed(); + let reopen_metrics = reopened.operational_metrics().await; + + eprintln!( + "pipeline_profile commit_phase: workers={} commits_per_worker={} commits={} elapsed_ms={} tps={:.2} p50_us={} p99_us={} | prestage_validate_ops={} avg_prestage_validate_us={} epoch_process_ops={} avg_epoch_process_us={} avg_wal_append_us={} avg_wal_sync_us={} avg_coordinator_apply_us={}", + workers, + commits_per_worker, + workers * commits_per_worker, + commit_elapsed.as_millis(), + commit_tps, + commit_p50, + commit_p99, + op_after_commits.prestage_validate_ops, + op_after_commits.avg_prestage_validate_micros, + op_after_commits.epoch_process_ops, + op_after_commits.avg_epoch_process_micros, + op_after_commits.avg_wal_append_micros, + op_after_commits.avg_wal_sync_micros, + op_after_commits.avg_coordinator_apply_micros ); - assert!( - result - .unwrap_err() - .to_string() - .contains("checkpoint in progress"), - "error should indicate checkpoint" + eprintln!( + "pipeline_profile checkpoint_phase: seq={} total_ms={} lock_wait_ms={} snapshot_ms={} idempotency_ms={} write_checkpoint_ms={} segment_scan_ms={} manifest_ms={} checkpoint_bytes={} retained_segments={}", + checkpoint_seq, + checkpoint_total_elapsed.as_millis(), + checkpoint_lock_wait.as_millis(), + snapshot_elapsed.as_millis(), + idempotency_elapsed.as_millis(), + write_checkpoint_elapsed.as_millis(), + segments_elapsed.as_millis(), + manifest_elapsed.as_millis(), + checkpoint_bytes, + segments.len() + ); + eprintln!( + "pipeline_profile recovery_phase: reopen_ms={} 
startup_recovery_micros={} startup_recovered_seq={} durable_head_seq={} visible_head_seq={}", + reopen_elapsed.as_millis(), + reopen_metrics.startup_recovery_micros, + reopen_metrics.startup_recovered_seq, + reopen_metrics.durable_head_seq, + reopen_metrics.visible_head_seq ); - - // Release checkpoint lock and verify commits now work - db.checkpoint_in_progress.store(false, Ordering::Release); - drop(_all_permits); - - db.commit(Mutation::KvSet { - project_id: "p".into(), - scope_id: "app".into(), - key: b"success".to_vec(), - value: b"1".to_vec(), - }) - .await - .expect("commit after checkpoint should succeed"); } #[tokio::test] @@ -5618,3 +7215,363 @@ async fn sql_transaction_plan_helpers_commit() { .expect("exists"); assert!(exists); } + +#[tokio::test] +#[ignore = "long-running multi-agent user-perspective profile"] +async fn secure_multi_agent_profile_identifies_core_shortcomings() { + fn u256_be(v: u64) -> [u8; 32] { + let mut out = [0u8; 32]; + out[24..].copy_from_slice(&v.to_be_bytes()); + out + } + fn decode_u256_u64(bytes: [u8; 32]) -> u64 { + let mut out = [0u8; 8]; + out.copy_from_slice(&bytes[24..]); + u64::from_be_bytes(out) + } + + #[derive(Debug, Default, Clone, Copy)] + struct WorkerStats { + attempted: usize, + accepted: usize, + rejected: usize, + unauthorized_attempted: usize, + unauthorized_denied: usize, + latency_sum_us: u128, + latency_max_us: u64, + } + + #[derive(Debug, Default, Clone, Copy)] + struct RuntimePeaks { + queue_depth: usize, + inflight: usize, + durable_lag: u64, + conflict_rate: f64, + durable_wait_ops: u64, + durable_wait_avg_us: u64, + wal_append_ops: u64, + wal_append_bytes: u64, + wal_append_avg_us: u64, + wal_sync_ops: u64, + wal_sync_avg_us: u64, + commit_errors: u64, + permission_rejections: u64, + validation_rejections: u64, + queue_full_rejections: u64, + timeout_rejections: u64, + read_set_conflicts: u64, + } + + let agents = std::env::var("AEDB_MULTI_AGENT_PROFILE_AGENTS") + .ok() + .and_then(|v| 
v.parse::().ok()) + .unwrap_or(12) + .max(4); + let ops_per_agent = std::env::var("AEDB_MULTI_AGENT_PROFILE_OPS") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(800) + .max(100); + + let dir = tempdir().expect("temp"); + let db = Arc::new( + AedbInstance::open_secure(AedbConfig::production([3u8; 32]), dir.path()) + .expect("open secure"), + ); + let system = CallerContext::system_internal(); + db.commit_as( + system.clone(), + Mutation::Ddl(DdlOperation::CreateProject { + owner_id: None, + if_not_exists: true, + project_id: "p".into(), + }), + ) + .await + .expect("create project"); + + let agent_ids: Vec = (0..agents).map(|i| format!("prof_agent_{i}")).collect(); + for a in &agent_ids { + db.commit_as( + system.clone(), + Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: a.clone(), + permission: Permission::KvRead { + project_id: "p".into(), + scope_id: Some("app".into()), + prefix: Some(format!("agent:{a}:").into_bytes()), + }, + actor_id: Some("system".into()), + delegable: false, + }), + ) + .await + .expect("grant kv read"); + db.commit_as( + system.clone(), + Mutation::Ddl(DdlOperation::GrantPermission { + caller_id: a.clone(), + permission: Permission::KvWrite { + project_id: "p".into(), + scope_id: Some("app".into()), + prefix: Some(format!("agent:{a}:").into_bytes()), + }, + actor_id: Some("system".into()), + delegable: false, + }), + ) + .await + .expect("grant kv write"); + db.commit_as( + system.clone(), + Mutation::KvSet { + project_id: "p".into(), + scope_id: "app".into(), + key: format!("agent:{a}:balance").into_bytes(), + value: u256_be(0).to_vec(), + }, + ) + .await + .expect("seed balance"); + } + + let done = Arc::new(tokio::sync::Notify::new()); + let peaks = Arc::new(std::sync::Mutex::new(RuntimePeaks::default())); + let monitor_db = Arc::clone(&db); + let monitor_done = Arc::clone(&done); + let monitor_peaks = Arc::clone(&peaks); + let monitor = tokio::spawn(async move { + loop { + tokio::select! 
{ + _ = monitor_done.notified() => break, + _ = tokio::time::sleep(Duration::from_millis(10)) => { + let op = monitor_db.operational_metrics().await; + let mut p = monitor_peaks.lock().expect("peaks lock"); + p.queue_depth = p.queue_depth.max(op.queue_depth); + p.inflight = p.inflight.max(op.inflight_commits); + p.durable_lag = p.durable_lag.max(op.durable_head_lag); + p.conflict_rate = p.conflict_rate.max(op.conflict_rate); + p.durable_wait_ops = p.durable_wait_ops.max(op.durable_wait_ops); + p.durable_wait_avg_us = p.durable_wait_avg_us.max(op.avg_durable_wait_micros); + p.wal_append_ops = p.wal_append_ops.max(op.wal_append_ops); + p.wal_append_bytes = p.wal_append_bytes.max(op.wal_append_bytes); + p.wal_append_avg_us = p.wal_append_avg_us.max(op.avg_wal_append_micros); + p.wal_sync_ops = p.wal_sync_ops.max(op.wal_sync_ops); + p.wal_sync_avg_us = p.wal_sync_avg_us.max(op.avg_wal_sync_micros); + p.commit_errors = p.commit_errors.max(op.commit_errors); + p.permission_rejections = p.permission_rejections.max(op.permission_rejections); + p.validation_rejections = p.validation_rejections.max(op.validation_rejections); + p.queue_full_rejections = p.queue_full_rejections.max(op.queue_full_rejections); + p.timeout_rejections = p.timeout_rejections.max(op.timeout_rejections); + p.read_set_conflicts = p.read_set_conflicts.max(op.read_set_conflicts); + } + } + } + }); + + let started = Instant::now(); + let mut tasks = Vec::with_capacity(agent_ids.len()); + for (idx, agent) in agent_ids.iter().enumerate() { + let db_clone = Arc::clone(&db); + let caller = CallerContext::new(agent.clone()); + let own_key = format!("agent:{agent}:balance").into_bytes(); + let neighbor = &agent_ids[(idx + 1) % agent_ids.len()]; + let cross_key = format!("agent:{neighbor}:balance").into_bytes(); + tasks.push(tokio::spawn(async move { + let mut stats = WorkerStats::default(); + for op in 0..ops_per_agent { + stats.attempted += 1; + let now = Instant::now(); + let res = db_clone + .commit_as( + 
caller.clone(), + Mutation::KvIncU256 { + project_id: "p".into(), + scope_id: "app".into(), + key: own_key.clone(), + amount_be: u256_be(1), + }, + ) + .await; + let us = now.elapsed().as_micros() as u64; + stats.latency_sum_us += us as u128; + stats.latency_max_us = stats.latency_max_us.max(us); + match res { + Ok(_) => stats.accepted += 1, + Err(AedbError::Validation(_)) | Err(AedbError::Conflict(_)) => { + stats.rejected += 1 + } + Err(other) => return Err(other), + } + + if op % 25 == 0 { + stats.unauthorized_attempted += 1; + let denied = db_clone + .commit_as( + caller.clone(), + Mutation::KvIncU256 { + project_id: "p".into(), + scope_id: "app".into(), + key: cross_key.clone(), + amount_be: u256_be(1), + }, + ) + .await; + if matches!(denied, Err(AedbError::PermissionDenied(_))) { + stats.unauthorized_denied += 1; + } else if let Err(other) = denied { + return Err(other); + } + } + } + Ok::<_, AedbError>(stats) + })); + } + + let mut merged = WorkerStats::default(); + for t in tasks { + let s = t.await.expect("join profile task").expect("profile run"); + merged.attempted += s.attempted; + merged.accepted += s.accepted; + merged.rejected += s.rejected; + merged.unauthorized_attempted += s.unauthorized_attempted; + merged.unauthorized_denied += s.unauthorized_denied; + merged.latency_sum_us += s.latency_sum_us; + merged.latency_max_us = merged.latency_max_us.max(s.latency_max_us); + } + + done.notify_waiters(); + monitor.await.expect("join monitor"); + db.force_fsync().await.expect("force fsync"); + + let elapsed = started.elapsed().as_secs_f64().max(0.001); + let attempted_tps = (merged.attempted as f64 / elapsed) as u64; + let accepted_tps = (merged.accepted as f64 / elapsed) as u64; + let avg_lat_us = (merged.latency_sum_us / merged.attempted.max(1) as u128) as u64; + let peaks = *peaks.lock().expect("peaks lock"); + let heads = db.head_state().await; + + eprintln!( + "multi_agent_profile: agents={} ops_per_agent={} attempted={} accepted={} rejected={} 
unauthorized_denied={} attempted_tps={} accepted_tps={} avg_lat_us={} max_lat_us={} peak_queue_depth={} peak_inflight={} peak_durable_lag={} peak_conflict_rate={:.4} peak_durable_wait_ops={} peak_durable_wait_avg_us={} peak_wal_append_ops={} peak_wal_append_bytes={} peak_wal_append_avg_us={} peak_wal_sync_ops={} peak_wal_sync_avg_us={} peak_commit_errors={} peak_permission_rejections={} peak_validation_rejections={} peak_queue_full_rejections={} peak_timeout_rejections={} peak_read_set_conflicts={} heads(v={},d={})", + agents, + ops_per_agent, + merged.attempted, + merged.accepted, + merged.rejected, + merged.unauthorized_denied, + attempted_tps, + accepted_tps, + avg_lat_us, + merged.latency_max_us, + peaks.queue_depth, + peaks.inflight, + peaks.durable_lag, + peaks.conflict_rate, + peaks.durable_wait_ops, + peaks.durable_wait_avg_us, + peaks.wal_append_ops, + peaks.wal_append_bytes, + peaks.wal_append_avg_us, + peaks.wal_sync_ops, + peaks.wal_sync_avg_us, + peaks.commit_errors, + peaks.permission_rejections, + peaks.validation_rejections, + peaks.queue_full_rejections, + peaks.timeout_rejections, + peaks.read_set_conflicts, + heads.visible_head_seq, + heads.durable_head_seq + ); + + assert_eq!( + merged.accepted + merged.rejected, + merged.attempted, + "profile run must not drop operations" + ); + assert_eq!( + merged.unauthorized_denied, merged.unauthorized_attempted, + "cross-agent unauthorized writes must always be denied" + ); + assert!( + heads.visible_head_seq >= heads.durable_head_seq, + "visible head should be >= durable head" + ); + assert!( + attempted_tps >= 200, + "profile indicates severe throughput regression: attempted_tps={attempted_tps}" + ); + assert!( + peaks.commit_errors <= (merged.rejected + merged.unauthorized_denied) as u64, + "unexpected commit errors exceed rejected + unauthorized-denied operations" + ); + assert_eq!( + peaks.permission_rejections, merged.unauthorized_denied as u64, + "permission rejection accounting must match denied 
unauthorized attempts" + ); + assert_eq!( + peaks.validation_rejections, merged.rejected as u64, + "validation rejection accounting must match application-level rejected operations" + ); + assert_eq!( + peaks.queue_full_rejections, 0, + "unexpected queue-full rejections under baseline profile load" + ); + assert_eq!( + peaks.timeout_rejections, 0, + "unexpected timeout rejections under baseline profile load" + ); + assert_eq!( + peaks.durable_wait_ops, 0, + "baseline non-durable profile should not accumulate durable wait operations" + ); + assert!( + peaks.wal_sync_ops > 0, + "full-durability secure profile should execute WAL sync operations" + ); + + for agent in &agent_ids { + let caller = CallerContext::new(agent.clone()); + let entry = db + .kv_get( + "p", + "app", + format!("agent:{agent}:balance").as_bytes(), + ConsistencyMode::AtLatest, + &caller, + ) + .await + .expect("read own balance") + .expect("balance exists"); + let mut bal = [0u8; 32]; + bal.copy_from_slice(&entry.value); + assert!( + decode_u256_u64(bal) as usize <= ops_per_agent, + "agent balance must not exceed own attempts" + ); + } +} + +#[tokio::test] +async fn strict_open_rejects_directory_previously_opened_in_non_strict_mode() { + let dir = tempdir().expect("temp"); + let mut permissive = AedbConfig::production([7u8; 32]); + permissive.recovery_mode = RecoveryMode::Permissive; + permissive.hash_chain_required = false; + + let db = AedbInstance::open(permissive, dir.path()).expect("open permissive"); + db.shutdown().await.expect("shutdown permissive"); + + let strict = AedbConfig::production([7u8; 32]); + let err = match AedbInstance::open(strict, dir.path()) { + Ok(db) => { + db.shutdown().await.expect("shutdown unexpected strict db"); + panic!("strict open should fail closed"); + } + Err(err) => err, + }; + assert!( + matches!(err, AedbError::Validation(ref msg) if msg.contains("strict open denied")), + "unexpected error: {err}" + ); +} diff --git a/src/offline.rs b/src/offline.rs index 
94fba27..eb64419 100644 --- a/src/offline.rs +++ b/src/offline.rs @@ -5,7 +5,7 @@ use crate::error::AedbError; use crate::manifest::atomic::write_manifest_atomic_signed; use crate::manifest::schema::Manifest; use crate::recovery::{RecoveredState, recover_with_config}; -use crate::storage::keyspace::NamespaceId; +use crate::storage::keyspace::{NamespaceId, SecondaryIndexStore}; use serde::{Deserialize, Serialize}; use sha2::{Digest, Sha256}; use std::collections::HashMap; @@ -112,6 +112,7 @@ pub fn restore_snapshot_dump( config.checkpoint_key(), config.checkpoint_key_id.clone(), envelope.state.idempotency.clone(), + config.checkpoint_compression_level, )?; let manifest = Manifest { durable_seq: envelope.state.current_seq, @@ -183,12 +184,188 @@ fn load_dump(path: &Path) -> Result { } fn checksum_state(state: &SnapshotDumpState) -> Result { - let bytes = rmp_serde::to_vec(state).map_err(|e| AedbError::Encode(e.to_string()))?; let mut h = Sha256::new(); - h.update(&bytes); + hash_label(&mut h, "current_seq"); + hash_encoded(&mut h, &state.current_seq)?; + + hash_label(&mut h, "primary_index_backend"); + hash_encoded(&mut h, &state.keyspace.primary_index_backend)?; + + hash_label(&mut h, "namespaces"); + let mut namespaces = state + .keyspace + .namespaces + .iter() + .map(|(ns_id, namespace)| -> Result<_, AedbError> { + Ok((encode(ns_id)?, ns_id, namespace)) + }) + .collect::, _>>()?; + namespaces.sort_by(|a, b| a.0.cmp(&b.0)); + for (ns_key_bytes, _, namespace) in namespaces { + hash_bytes(&mut h, &ns_key_bytes); + + hash_label(&mut h, "kv_entries"); + for (key, entry) in namespace.kv.entries.iter() { + hash_bytes(&mut h, key); + hash_encoded(&mut h, entry)?; + } + + hash_label(&mut h, "tables"); + let mut tables = namespace.tables.iter().collect::>(); + tables.sort_by(|a, b| a.0.cmp(b.0)); + for (table_name, table) in tables { + hash_bytes(&mut h, table_name.as_bytes()); + hash_encoded(&mut h, &table.structural_version)?; + + hash_label(&mut h, "rows"); + for (pk, 
row) in table.rows.iter() { + hash_bytes(&mut h, pk.as_slice()); + hash_encoded(&mut h, row)?; + } + + hash_label(&mut h, "row_versions"); + for (pk, version) in table.row_versions.iter() { + hash_bytes(&mut h, pk.as_slice()); + hash_encoded(&mut h, version)?; + } + + hash_label(&mut h, "indexes"); + let mut indexes = table.indexes.iter().collect::>(); + indexes.sort_by(|a, b| a.0.cmp(b.0)); + for (index_name, index) in indexes { + hash_bytes(&mut h, index_name.as_bytes()); + hash_encoded(&mut h, &index.columns_bitmask)?; + hash_encoded(&mut h, &index.partial_filter)?; + hash_secondary_index_store(&mut h, &index.store)?; + } + } + } + + hash_label(&mut h, "async_indexes"); + let mut async_indexes = state + .keyspace + .async_indexes + .iter() + .map(|(key, value)| -> Result<_, AedbError> { Ok((encode(key)?, key, value)) }) + .collect::, _>>()?; + async_indexes.sort_by(|a, b| a.0.cmp(&b.0)); + for (key_bytes, _, value) in async_indexes { + hash_bytes(&mut h, &key_bytes); + hash_encoded(&mut h, &value.materialized_seq)?; + for (pk, row) in value.rows.iter() { + hash_bytes(&mut h, pk.as_slice()); + hash_encoded(&mut h, row)?; + } + } + + hash_label(&mut h, "catalog"); + hash_sorted_entries(&mut h, state.catalog.projects.iter())?; + hash_sorted_entries(&mut h, state.catalog.scopes.iter())?; + hash_sorted_entries(&mut h, state.catalog.tables.iter())?; + hash_sorted_entries(&mut h, state.catalog.indexes.iter())?; + hash_sorted_entries(&mut h, state.catalog.async_indexes.iter())?; + hash_sorted_entries(&mut h, state.catalog.kv_projections.iter())?; + hash_sorted_entries(&mut h, state.catalog.permissions.iter())?; + hash_sorted_entries(&mut h, state.catalog.permission_grants.iter())?; + hash_sorted_entries(&mut h, state.catalog.read_policies.iter())?; + + hash_label(&mut h, "idempotency"); + hash_sorted_entries(&mut h, state.idempotency.iter())?; + Ok(hex::encode(h.finalize())) } +fn hash_secondary_index_store( + hasher: &mut Sha256, + store: &SecondaryIndexStore, +) -> 
Result<(), AedbError> { + match store { + SecondaryIndexStore::BTree(entries) => { + hash_label(hasher, "btree"); + for (index_key, encoded_pks) in entries.iter() { + hash_bytes(hasher, index_key.as_slice()); + for pk in encoded_pks.iter() { + hash_bytes(hasher, pk.as_slice()); + } + } + } + SecondaryIndexStore::Hash(entries) => { + hash_label(hasher, "hash"); + let mut ordered = entries + .iter() + .map(|(index_key, encoded_pks)| (index_key.as_slice().to_vec(), encoded_pks)) + .collect::>(); + ordered.sort_by(|a, b| a.0.cmp(&b.0)); + for (index_key, encoded_pks) in ordered { + hash_bytes(hasher, &index_key); + let mut pks = encoded_pks + .iter() + .map(|pk| pk.as_slice().to_vec()) + .collect::>(); + pks.sort(); + for pk in pks { + hash_bytes(hasher, &pk); + } + } + } + SecondaryIndexStore::UniqueHash(entries) => { + hash_label(hasher, "unique_hash"); + let mut ordered = entries + .iter() + .map(|(index_key, encoded_pk)| { + ( + index_key.as_slice().to_vec(), + encoded_pk.as_slice().to_vec(), + ) + }) + .collect::>(); + ordered.sort_by(|a, b| a.0.cmp(&b.0)); + for (index_key, encoded_pk) in ordered { + hash_bytes(hasher, &index_key); + hash_bytes(hasher, &encoded_pk); + } + } + } + Ok(()) +} + +fn hash_sorted_entries<'a, K, V, I>(hasher: &mut Sha256, entries: I) -> Result<(), AedbError> +where + K: Serialize + 'a, + V: Serialize + 'a, + I: IntoIterator, +{ + let mut encoded = entries + .into_iter() + .map(|(key, value)| -> Result<_, AedbError> { Ok((encode(key)?, value)) }) + .collect::, _>>()?; + encoded.sort_by(|a, b| a.0.cmp(&b.0)); + for (key_bytes, value) in encoded { + hash_bytes(hasher, &key_bytes); + hash_encoded(hasher, value)?; + } + Ok(()) +} + +fn hash_label(hasher: &mut Sha256, label: &str) { + hash_bytes(hasher, label.as_bytes()); +} + +fn hash_encoded(hasher: &mut Sha256, value: &T) -> Result<(), AedbError> { + let bytes = encode(value)?; + hash_bytes(hasher, &bytes); + Ok(()) +} + +fn hash_bytes(hasher: &mut Sha256, bytes: &[u8]) { + 
hasher.update((bytes.len() as u64).to_be_bytes()); + hasher.update(bytes); +} + +fn encode(value: &T) -> Result, AedbError> { + rmp_serde::to_vec(value).map_err(|e| AedbError::Encode(e.to_string())) +} + fn state_counts(state: &SnapshotDumpState) -> (u64, u64) { let mut table_rows = 0u64; let mut kv_entries = 0u64; diff --git a/src/order_book.rs b/src/order_book.rs index 3921167..3a45ad8 100644 --- a/src/order_book.rs +++ b/src/order_book.rs @@ -4,6 +4,11 @@ use primitive_types::U256; use serde::{Deserialize, Serialize}; use std::ops::Bound; +const MASS_CANCEL_SCAN_LIMIT: usize = 200_000; +const OPEN_ORDERS_SCAN_LIMIT: usize = 100_000; +const TOP_LEVEL_SCAN_LIMIT: usize = 200_000; +const FIFO_COUNT_SCAN_LIMIT: usize = 100_000; + #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)] #[repr(u8)] pub enum OrderSide { @@ -432,14 +437,18 @@ pub fn key_owner_nonce(instrument: &str, owner: &str) -> Vec { k } +fn append_segment_len_prefixed(out: &mut Vec, segment: &str) { + out.extend_from_slice(&(segment.len() as u64).to_be_bytes()); + out.extend_from_slice(segment.as_bytes()); +} + pub fn key_client_id(instrument: &str, owner: &str, client_order_id: &str) -> Vec { - let mut k = Vec::with_capacity(9 + instrument.len() + owner.len() + client_order_id.len()); + let mut k = Vec::with_capacity(25 + instrument.len() + owner.len() + client_order_id.len()); k.extend_from_slice(b"ob:"); k.extend_from_slice(instrument.as_bytes()); k.extend_from_slice(b":cid:"); - k.extend_from_slice(owner.as_bytes()); - k.push(b':'); - k.extend_from_slice(client_order_id.as_bytes()); + append_segment_len_prefixed(&mut k, owner); + append_segment_len_prefixed(&mut k, client_order_id); k } @@ -485,11 +494,11 @@ pub fn key_fifo( } pub fn key_open_order(instrument: &str, owner: &str, order_id: u64) -> Vec { - let mut k = Vec::with_capacity(11 + instrument.len() + owner.len() + 8); + let mut k = Vec::with_capacity(20 + instrument.len() + owner.len() + 8); 
k.extend_from_slice(b"ob:"); k.extend_from_slice(instrument.as_bytes()); k.extend_from_slice(b":open:"); - k.extend_from_slice(owner.as_bytes()); + append_segment_len_prefixed(&mut k, owner); k.push(b':'); k.extend_from_slice(&order_id.to_be_bytes()); k @@ -535,11 +544,11 @@ pub fn plqty_prefix(instrument: &str, side: OrderSide) -> Vec { } pub fn open_orders_prefix(instrument: &str, owner: &str) -> Vec { - let mut k = Vec::with_capacity(10 + instrument.len() + owner.len()); + let mut k = Vec::with_capacity(19 + instrument.len() + owner.len()); k.extend_from_slice(b"ob:"); k.extend_from_slice(instrument.as_bytes()); k.extend_from_slice(b":open:"); - k.extend_from_slice(owner.as_bytes()); + append_segment_len_prefixed(&mut k, owner); k.push(b':'); k } @@ -1180,8 +1189,13 @@ pub fn apply_order_book_mass_cancel( project_id, scope_id, &all_orders_prefix(instrument), - usize::MAX, + MASS_CANCEL_SCAN_LIMIT.saturating_add(1), ); + if orders.len() > MASS_CANCEL_SCAN_LIMIT { + return Err(AedbError::Validation( + "mass cancel scan limit exceeded".into(), + )); + } for (_, entry) in orders { let order: OrderRecord = deserialize(&entry.value)?; if u256_from_be(order.remaining_qty_be).is_zero() { @@ -1391,14 +1405,20 @@ pub fn read_open_orders( project_id, scope_id, &open_orders_prefix(instrument, owner), - usize::MAX, + OPEN_ORDERS_SCAN_LIMIT.saturating_add(1), snapshot, ); + if open.len() > OPEN_ORDERS_SCAN_LIMIT { + return Err(AedbError::Validation( + "open_orders scan limit exceeded".into(), + )); + } let mut out = Vec::with_capacity(open.len()); for (k, _) in open { if let Some(order_id) = parse_order_id_suffix(&k) && let Some(order) = read_order_status(snapshot, project_id, scope_id, instrument, order_id)? 
+ && order.owner == owner { out.push(order); } @@ -1522,9 +1542,12 @@ fn top_side( project_id, scope_id, &plqty_prefix(instrument, side), - usize::MAX, + TOP_LEVEL_SCAN_LIMIT.saturating_add(1), snapshot, ); + if entries.len() > TOP_LEVEL_SCAN_LIMIT { + return Err(AedbError::Validation("top_n scan limit exceeded".into())); + } let mut out = Vec::new(); for (k, v) in entries { let qty = decode_u256_bytes(&v.value)?; @@ -1538,14 +1561,19 @@ fn top_side( project_id, scope_id, &fifo_prefix(instrument, side, price), - usize::MAX, + FIFO_COUNT_SCAN_LIMIT.saturating_add(1), snapshot, ) - .len() as u32; + .len(); + if order_count > FIFO_COUNT_SCAN_LIMIT { + return Err(AedbError::Validation( + "top_n fifo scan limit exceeded".into(), + )); + } out.push(PriceLevel { price_ticks: price, total_qty_be: u256_to_be(qty), - order_count, + order_count: order_count as u32, }); if out.len() >= depth { break; @@ -1651,10 +1679,10 @@ fn snapshot_scan_prefix( fn prefix_range_end(prefix: &[u8]) -> Option> { let mut end = prefix.to_vec(); - for idx in (0..end.len()).rev() { - if end[idx] != u8::MAX { - end[idx] = end[idx].saturating_add(1); - end.truncate(idx + 1); + for byte_pos in (0..end.len()).rev() { + if end[byte_pos] != u8::MAX { + end[byte_pos] = end[byte_pos].saturating_add(1); + end.truncate(byte_pos + 1); return Some(end); } } @@ -2289,4 +2317,76 @@ mod tests { "all-0xff prefix has no finite upper bound" ); } + + #[test] + fn key_client_id_is_collision_resistant_for_delimiter_inputs() { + let k1 = key_client_id("BTC-USD", "alice:desk", "order-1"); + let k2 = key_client_id("BTC-USD", "alice", "desk:order-1"); + assert_ne!(k1, k2, "length-prefixed encoding must be unambiguous"); + } + + #[test] + fn read_open_orders_enforces_owner_for_matching_prefix() { + let mut ks = Keyspace::default(); + let order_a = OrderRecord { + order_id: 1, + instrument: "BTC-USD".into(), + client_order_id: "cid-a".into(), + owner: "alice".into(), + account: None, + side: OrderSide::Bid, + order_type: 
OrderType::Limit, + time_in_force: TimeInForce::Gtc, + exec_instructions: ExecInstruction(0), + self_trade_prevention: SelfTradePrevention::None, + price_ticks: 100, + original_qty_be: u256_to_be(U256::from(1u8)), + remaining_qty_be: u256_to_be(U256::from(1u8)), + filled_qty_be: u256_to_be(U256::zero()), + status: OrderStatus::Open, + placed_seq: 1, + last_modified_seq: 1, + nonce: 1, + }; + let order_b = OrderRecord { + order_id: 2, + owner: "alice:desk".into(), + client_order_id: "cid-b".into(), + ..order_a.clone() + }; + ks.kv_set( + "p", + "app", + key_order("BTC-USD", order_a.order_id), + serialize(&order_a).expect("encode"), + 1, + ); + ks.kv_set( + "p", + "app", + key_order("BTC-USD", order_b.order_id), + serialize(&order_b).expect("encode"), + 1, + ); + ks.kv_set( + "p", + "app", + key_open_order("BTC-USD", &order_a.owner, order_a.order_id), + vec![1], + 1, + ); + ks.kv_set( + "p", + "app", + key_open_order("BTC-USD", &order_b.owner, order_b.order_id), + vec![1], + 1, + ); + + let snapshot = ks.snapshot(); + let open = + read_open_orders(&snapshot, "p", "app", "BTC-USD", "alice").expect("read open orders"); + assert_eq!(open.len(), 1); + assert_eq!(open[0].owner, "alice"); + } } diff --git a/src/query/executor.rs b/src/query/executor.rs index fc40333..a386605 100644 --- a/src/query/executor.rs +++ b/src/query/executor.rs @@ -933,7 +933,7 @@ fn resolve_selected_indices( } let mut indices = Vec::with_capacity(query.select.len()); for col in &query.select { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *col) @@ -941,7 +941,7 @@ fn resolve_selected_indices( table: query.table.clone(), column: col.clone(), })?; - indices.push(idx); + indices.push(column_index); } Ok(Some(indices)) } @@ -1181,23 +1181,28 @@ fn encode_cursor(cursor: &CursorToken) -> Result { } fn decode_cursor(encoded: &str) -> Result { - if !encoded.len().is_multiple_of(2) { + let encoded_size_bytes = encoded.len(); + if 
!encoded_size_bytes.is_multiple_of(2) { return Err(QueryError::InvalidQuery { reason: "invalid cursor".into(), }); } - let mut bytes = Vec::with_capacity(encoded.len() / 2); - let raw = encoded.as_bytes(); - for i in (0..raw.len()).step_by(2) { - let hi = decode_hex_nibble(raw[i]).ok_or_else(|| QueryError::InvalidQuery { - reason: "invalid cursor".into(), + let mut decoded_bytes = Vec::with_capacity(encoded_size_bytes / 2); + let encoded_bytes = encoded.as_bytes(); + for byte_offset in (0..encoded_bytes.len()).step_by(2) { + let hi = decode_hex_nibble(encoded_bytes[byte_offset]).ok_or_else(|| { + QueryError::InvalidQuery { + reason: "invalid cursor".into(), + } })?; - let lo = decode_hex_nibble(raw[i + 1]).ok_or_else(|| QueryError::InvalidQuery { - reason: "invalid cursor".into(), + let lo = decode_hex_nibble(encoded_bytes[byte_offset + 1]).ok_or_else(|| { + QueryError::InvalidQuery { + reason: "invalid cursor".into(), + } })?; - bytes.push((hi << 4) | lo); + decoded_bytes.push((hi << 4) | lo); } - rmp_serde::from_slice(&bytes).map_err(|e| QueryError::InvalidQuery { + rmp_serde::from_slice(&decoded_bytes).map_err(|e| QueryError::InvalidQuery { reason: e.to_string(), }) } @@ -1212,7 +1217,7 @@ fn decode_hex_nibble(byte: u8) -> Option { } fn aggregate_col_idx(agg: &Aggregate, columns: &[String]) -> Result, QueryError> { - let idx = match agg { + let column_index = match agg { Aggregate::Count => return Ok(None), Aggregate::Sum(col) | Aggregate::Min(col) | Aggregate::Max(col) | Aggregate::Avg(col) => { columns @@ -1224,7 +1229,7 @@ fn aggregate_col_idx(agg: &Aggregate, columns: &[String]) -> Result String { @@ -1368,12 +1373,13 @@ fn indexed_pks_for_predicate_with_trace( let Some((idx_name, prefix_cols)) = best else { return Ok(None); }; - let index = table - .indexes - .get(&idx_name) - .ok_or_else(|| QueryError::InvalidQuery { - reason: "index not found".into(), - })?; + let selected_index = + table + .indexes + .get(&idx_name) + .ok_or_else(|| 
QueryError::InvalidQuery { + reason: "index not found".into(), + })?; let idx_def = catalog .indexes .get(&(ns, table_name.to_string(), idx_name.clone())) @@ -1388,9 +1394,9 @@ fn indexed_pks_for_predicate_with_trace( .collect::>(); let encoded = EncodedKey::from_values(&prefix_values); let pks = if prefix_cols == idx_def.columns.len() { - index.scan_eq(&encoded) + selected_index.scan_eq(&encoded) } else { - index.scan_prefix(&encoded) + selected_index.scan_prefix(&encoded) }; return Ok(Some(IndexLookupResult { pks, @@ -1597,10 +1603,10 @@ fn like_prefix(pattern: &str) -> Option { fn next_prefix(prefix: &str) -> Option { let mut bytes = prefix.as_bytes().to_vec(); - for i in (0..bytes.len()).rev() { - if bytes[i] != u8::MAX { - bytes[i] += 1; - bytes.truncate(i + 1); + for byte_index in (0..bytes.len()).rev() { + if bytes[byte_index] != u8::MAX { + bytes[byte_index] += 1; + bytes.truncate(byte_index + 1); return String::from_utf8(bytes).ok(); } } @@ -1747,13 +1753,13 @@ mod tests { let table = keyspace .table_by_namespace_key_mut(&namespace_key("A", "app"), "users") .expect("table"); - let mut index = SecondaryIndex::default(); + let mut secondary_index = SecondaryIndex::default(); for (pk, row) in &table.rows { let age_key = extract_index_key_encoded(row, &schema, &["age".into()]).expect("age index key"); - index.insert(age_key, pk.clone()); + secondary_index.insert(age_key, pk.clone()); } - table.indexes.insert("by_age".into(), index); + table.indexes.insert("by_age".into(), secondary_index); let mut by_name = SecondaryIndex::default(); for (pk, row) in &table.rows { let key = diff --git a/src/query/operators.rs b/src/query/operators.rs index e38be44..f366e03 100644 --- a/src/query/operators.rs +++ b/src/query/operators.rs @@ -215,7 +215,7 @@ impl Operator for ProjectOperator { let values = self .selected .iter() - .map(|idx| row.values[*idx].clone()) + .map(|column_index| row.values[*column_index].clone()) .collect(); Some(Row { values }) } @@ -227,7 +227,7 @@ 
impl Operator for ProjectOperator { pub struct SortOperator { rows: Vec, - idx: usize, + row_index: usize, examined: usize, } @@ -259,7 +259,7 @@ impl SortOperator { if limit == 0 { return Self { rows: Vec::new(), - idx: 0, + row_index: 0, examined: 0, }; } @@ -305,7 +305,10 @@ impl SortOperator { impl Eq for TopKRow {} - let sort_key_columns: Vec = order_by.iter().map(|(idx, _)| *idx).collect(); + let sort_key_columns: Vec = order_by + .iter() + .map(|(column_index, _)| *column_index) + .collect(); let sort_orders = Arc::new(order_by.iter().map(|(_, order)| *order).collect::>()); let mut heap: BinaryHeap = BinaryHeap::with_capacity(limit); @@ -313,7 +316,7 @@ impl SortOperator { let candidate = TopKRow { sort_key: sort_key_columns .iter() - .map(|idx| row.values[*idx].clone()) + .map(|column_index| row.values[*column_index].clone()) .collect(), row, orders: Arc::clone(&sort_orders), @@ -336,7 +339,7 @@ impl SortOperator { rows.sort_by(compare_rows); return Self { rows, - idx: 0, + row_index: 0, examined, }; } @@ -349,7 +352,7 @@ impl SortOperator { rows.sort_by(compare_rows); Self { rows, - idx: 0, + row_index: 0, examined, } } @@ -357,11 +360,11 @@ impl SortOperator { impl Operator for SortOperator { fn next(&mut self) -> Option { - if self.idx >= self.rows.len() { + if self.row_index >= self.rows.len() { return None; } - let row = self.rows[self.idx].clone(); - self.idx += 1; + let row = self.rows[self.row_index].clone(); + self.row_index += 1; Some(row) } @@ -401,7 +404,7 @@ impl Operator for LimitOperator { pub struct AggregateOperator { rows: Vec, - idx: usize, + row_index: usize, examined: usize, } @@ -425,14 +428,14 @@ impl AggregateState { } } - fn update(&mut self, aggregate: &Aggregate, row: &Row, col_idx: Option) { + fn update(&mut self, aggregate: &Aggregate, row: &Row, aggregate_column_index: Option) { match (self, aggregate) { (AggregateState::Count(v), Aggregate::Count) => { *v = v.saturating_add(1); } (AggregateState::Sum(sum), Aggregate::Sum(_)) 
=> { - if let Some(idx) = col_idx { - let v = match &row.values[idx] { + if let Some(column_index) = aggregate_column_index { + let v = match &row.values[column_index] { Value::Integer(v) => *v, Value::U8(v) => *v as i64, _ => 0, @@ -441,24 +444,24 @@ impl AggregateState { } } (AggregateState::Min(state), Aggregate::Min(_)) => { - if let Some(idx) = col_idx { - let value = row.values[idx].clone(); + if let Some(column_index) = aggregate_column_index { + let value = row.values[column_index].clone(); if state.as_ref().is_none_or(|current| value < *current) { *state = Some(value); } } } (AggregateState::Max(state), Aggregate::Max(_)) => { - if let Some(idx) = col_idx { - let value = row.values[idx].clone(); + if let Some(column_index) = aggregate_column_index { + let value = row.values[column_index].clone(); if state.as_ref().is_none_or(|current| value > *current) { *state = Some(value); } } } (AggregateState::Avg { total, count }, Aggregate::Avg(_)) => { - if let Some(idx) = col_idx { - let maybe_v = match &row.values[idx] { + if let Some(column_index) = aggregate_column_index { + let maybe_v = match &row.values[column_index] { Value::Integer(v) => Some(*v), Value::U8(v) => Some(*v as i64), _ => None, @@ -515,8 +518,12 @@ impl AggregateOperator { .map(AggregateState::from_aggregate) .collect::>() }); - for (idx, state) in states.iter_mut().enumerate() { - state.update(&aggregates[idx], &row, aggregate_col_idx[idx]); + for (aggregate_index, state) in states.iter_mut().enumerate() { + state.update( + &aggregates[aggregate_index], + &row, + aggregate_col_idx[aggregate_index], + ); } } let examined = child.rows_examined(); @@ -532,7 +539,7 @@ impl AggregateOperator { Self { rows, - idx: 0, + row_index: 0, examined, } } @@ -540,11 +547,11 @@ impl AggregateOperator { impl Operator for AggregateOperator { fn next(&mut self) -> Option { - if self.idx >= self.rows.len() { + if self.row_index >= self.rows.len() { return None; } - let row = self.rows[self.idx].clone(); - 
self.idx += 1; + let row = self.rows[self.row_index].clone(); + self.row_index += 1; Some(row) } @@ -555,41 +562,41 @@ impl Operator for AggregateOperator { fn eval_compiled_expr(expr: &CompiledExpr, row: &Row) -> bool { match expr { - CompiledExpr::Eq(idx, v) => { - get_col(row, *idx).is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_eq())) - } - CompiledExpr::Ne(idx, v) => { - get_col(row, *idx).is_some_and(|rv| compare_values(rv, v).is_some_and(|o| !o.is_eq())) - } - CompiledExpr::Lt(idx, v) => { - get_col(row, *idx).is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_lt())) - } - CompiledExpr::Lte(idx, v) => { - get_col(row, *idx).is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_le())) - } - CompiledExpr::Gt(idx, v) => { - get_col(row, *idx).is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_gt())) - } - CompiledExpr::Gte(idx, v) => { - get_col(row, *idx).is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_ge())) - } - CompiledExpr::In(idx, values) => get_col(row, *idx).is_some_and(|rv| { + CompiledExpr::Eq(column_index, v) => get_col(row, *column_index) + .is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_eq())), + CompiledExpr::Ne(column_index, v) => get_col(row, *column_index) + .is_some_and(|rv| compare_values(rv, v).is_some_and(|o| !o.is_eq())), + CompiledExpr::Lt(column_index, v) => get_col(row, *column_index) + .is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_lt())), + CompiledExpr::Lte(column_index, v) => get_col(row, *column_index) + .is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_le())), + CompiledExpr::Gt(column_index, v) => get_col(row, *column_index) + .is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_gt())), + CompiledExpr::Gte(column_index, v) => get_col(row, *column_index) + .is_some_and(|rv| compare_values(rv, v).is_some_and(|o| o.is_ge())), + CompiledExpr::In(column_index, values) => get_col(row, *column_index).is_some_and(|rv| { values .iter() .any(|v| 
compare_values(rv, v).is_some_and(|o| o.is_eq())) }), - CompiledExpr::Between(idx, lo, hi) => get_col(row, *idx).is_some_and(|rv| { - compare_values(rv, lo).is_some_and(|o| o.is_ge()) - && compare_values(rv, hi).is_some_and(|o| o.is_le()) - }), - CompiledExpr::IsNull(idx) => get_col(row, *idx).is_some_and(|rv| matches!(rv, Value::Null)), - CompiledExpr::IsNotNull(idx) => { - get_col(row, *idx).is_some_and(|rv| !matches!(rv, Value::Null)) + CompiledExpr::Between(column_index, lo, hi) => { + get_col(row, *column_index).is_some_and(|rv| { + compare_values(rv, lo).is_some_and(|o| o.is_ge()) + && compare_values(rv, hi).is_some_and(|o| o.is_le()) + }) + } + CompiledExpr::IsNull(column_index) => { + get_col(row, *column_index).is_some_and(|rv| matches!(rv, Value::Null)) + } + CompiledExpr::IsNotNull(column_index) => { + get_col(row, *column_index).is_some_and(|rv| !matches!(rv, Value::Null)) + } + CompiledExpr::Like(column_index, pattern) => { + get_col(row, *column_index).is_some_and(|rv| match rv { + Value::Text(s) => like_match(s, pattern), + _ => false, + }) } - CompiledExpr::Like(idx, pattern) => get_col(row, *idx).is_some_and(|rv| match rv { - Value::Text(s) => like_match(s, pattern), - _ => false, - }), CompiledExpr::And(a, b) => eval_compiled_expr(a, row) && eval_compiled_expr(b, row), CompiledExpr::Or(a, b) => eval_compiled_expr(a, row) || eval_compiled_expr(b, row), CompiledExpr::Not(inner) => !eval_compiled_expr(inner, row), @@ -610,8 +617,8 @@ fn find_col_idx(columns: &[String], col: &str, table: &str) -> Result Option<&Value> { - row.values.get(idx) +fn get_col(row: &Row, column_index: usize) -> Option<&Value> { + row.values.get(column_index) } fn like_match(value: &str, pattern: &str) -> bool { diff --git a/src/recovery/mod.rs b/src/recovery/mod.rs index a960ce4..46dc4a3 100644 --- a/src/recovery/mod.rs +++ b/src/recovery/mod.rs @@ -157,7 +157,7 @@ fn segment_paths_for_replay( let mut segments = manifest.segments.clone(); segments.sort_by_key(|segment| 
segment.segment_seq); - let mut paths = Vec::with_capacity(segments.len()); + let mut segment_paths = Vec::with_capacity(segments.len()); let mut replay_limits = HashMap::new(); let active_segment_seq = manifest.active_segment_seq; for segment in segments { @@ -254,13 +254,13 @@ fn segment_paths_for_replay( ); } } - let limit_path = path.clone(); - paths.push(path); + let replay_limit_path = path.clone(); + segment_paths.push(path); if segment.size_bytes > 0 { - replay_limits.insert(limit_path, segment.size_bytes); + replay_limits.insert(replay_limit_path, segment.size_bytes); } } - Ok((paths, replay_limits)) + Ok((segment_paths, replay_limits)) } fn load_latest_valid_checkpoint( @@ -344,18 +344,20 @@ fn sha256_prefix_hex(path: &Path, bytes_to_hash: u64) -> Result 0 { - let read_len = usize::try_from(remaining.min(buf.len() as u64)).unwrap_or(buf.len()); - let n = reader.read(&mut buf[..read_len])?; - if n == 0 { + let mut remaining_size_bytes = bytes_to_hash; + let mut buffer = [0u8; 16 * 1024]; + while remaining_size_bytes > 0 { + let read_size_bytes = + usize::try_from(remaining_size_bytes.min(buffer.len() as u64)).unwrap_or(buffer.len()); + debug_assert!(read_size_bytes <= buffer.len()); + let read_count = reader.read(&mut buffer[..read_size_bytes])?; + if read_count == 0 { break; } - hasher.update(&buf[..n]); - remaining = remaining.saturating_sub(n as u64); + hasher.update(&buffer[..read_count]); + remaining_size_bytes = remaining_size_bytes.saturating_sub(read_count as u64); } - if remaining > 0 { + if remaining_size_bytes > 0 { return Err(AedbError::Validation( "segment shorter than expected".into(), )); diff --git a/src/recovery/replay.rs b/src/recovery/replay.rs index fd6b81d..11e3017 100644 --- a/src/recovery/replay.rs +++ b/src/recovery/replay.rs @@ -23,27 +23,28 @@ pub fn replay_segments( catalog: &mut Catalog, idempotency: &mut HashMap, ) -> Result { - let valid_prefix_len = + let valid_segment_count = validated_hash_chain_prefix_len(segments, 
hash_chain_required, strict_recovery)?; - let replay_segments = &segments[..valid_prefix_len]; + let replay_segments = &segments[..valid_segment_count]; let mut max_seq = from_seq_exclusive; let mut last_applied_seq = from_seq_exclusive; for segment in replay_segments { let file = File::open(segment)?; - let file_size = file.metadata()?.len(); - let replay_bytes = segment_replay_byte_limits + let segment_size_bytes = file.metadata()?.len(); + let replay_size_bytes = segment_replay_byte_limits .and_then(|limits| limits.get(segment).copied()) - .unwrap_or(file_size) - .min(file_size); - if replay_bytes <= SEGMENT_HEADER_SIZE as u64 { + .unwrap_or(segment_size_bytes) + .min(segment_size_bytes); + debug_assert!(replay_size_bytes <= segment_size_bytes); + if replay_size_bytes <= SEGMENT_HEADER_SIZE as u64 { continue; } let mut reader = BufReader::with_capacity(64 * 1024, file); let mut header = [0u8; SEGMENT_HEADER_SIZE]; reader.read_exact(&mut header)?; - let payload_bytes = replay_bytes.saturating_sub(SEGMENT_HEADER_SIZE as u64); - let mut frame_reader = FrameReader::new(reader.take(payload_bytes)); + let payload_size_bytes = replay_size_bytes.saturating_sub(SEGMENT_HEADER_SIZE as u64); + let mut frame_reader = FrameReader::new(reader.take(payload_size_bytes)); loop { match frame_reader.next_frame() { Ok(Some(frame)) => { diff --git a/src/recovery/scanner.rs b/src/recovery/scanner.rs index 3876461..905c79a 100644 --- a/src/recovery/scanner.rs +++ b/src/recovery/scanner.rs @@ -48,10 +48,9 @@ pub fn verify_hash_chain_if_required(paths: &[PathBuf], required: bool) -> Resul if !required { return Ok(()); } - if paths - .iter() - .any(|path| fs::metadata(path).map_or(true, |m| m.len() < SEGMENT_HEADER_SIZE as u64)) - { + if paths.iter().any(|path| { + fs::metadata(path).map_or(true, |metadata| metadata.len() < SEGMENT_HEADER_SIZE as u64) + }) { return Err(AedbError::Decode("segment too small".into())); } verify_hash_chain(paths) @@ -73,11 +72,11 @@ pub fn 
validated_hash_chain_prefix_len( } let mut prev_hash = [0u8; 32]; - let mut valid = 0usize; + let mut valid_segment_count = 0usize; for path in paths { - let meta = match fs::metadata(path) { - Ok(m) => m, + let segment_metadata = match fs::metadata(path) { + Ok(metadata) => metadata, Err(e) => { if strict { return Err(AedbError::Io(e)); @@ -85,7 +84,8 @@ pub fn validated_hash_chain_prefix_len( break; } }; - if meta.len() < SEGMENT_HEADER_SIZE as u64 { + let segment_size_bytes = segment_metadata.len(); + if segment_size_bytes < SEGMENT_HEADER_SIZE as u64 { if strict { return Err(AedbError::Decode("segment too small".into())); } @@ -137,15 +137,16 @@ pub fn validated_hash_chain_prefix_len( if strict { return Err(AedbError::Io(e)); } - return Ok(valid); + return Ok(valid_segment_count); } } } prev_hash = *hasher.finalize().as_bytes(); - valid += 1; + valid_segment_count += 1; } - Ok(valid) + debug_assert!(valid_segment_count <= paths.len()); + Ok(valid_segment_count) } fn parse_seq(name: &str) -> Option { diff --git a/src/storage/encoded_key.rs b/src/storage/encoded_key.rs index 005b38e..0471c1f 100644 --- a/src/storage/encoded_key.rs +++ b/src/storage/encoded_key.rs @@ -33,10 +33,10 @@ impl EncodedKey { pub fn prefix_successor(prefix: &EncodedKey) -> Option { let mut next = prefix.bytes.clone(); - for i in (0..next.len()).rev() { - if next[i] != 0xFF { - next[i] += 1; - next.truncate(i + 1); + for byte_index in (0..next.len()).rev() { + if next[byte_index] != 0xFF { + next[byte_index] += 1; + next.truncate(byte_index + 1); return Some(EncodedKey { bytes: next }); } } @@ -93,7 +93,8 @@ fn encode_value(v: &Value, out: &mut SmallVec<[u8; 64]>) { } Value::Blob(b) => { out.push(0x18); - out.extend_from_slice(&(b.len() as u32).to_be_bytes()); + let blob_size_bytes = b.len() as u32; + out.extend_from_slice(&blob_size_bytes.to_be_bytes()); out.extend_from_slice(b); } Value::Null => { diff --git a/src/storage/index.rs b/src/storage/index.rs index cacdf42..a0d9565 100644 --- 
a/src/storage/index.rs +++ b/src/storage/index.rs @@ -122,12 +122,12 @@ pub fn extract_index_key( ) -> Result, AedbError> { let mut out = Vec::with_capacity(indexed_columns.len()); for col in indexed_columns { - let idx = schema + let column_index = schema .columns .iter() .position(|c| c.name == *col) .ok_or_else(|| AedbError::Validation(format!("indexed column not found: {col}")))?; - out.push(row.values[idx].clone()); + out.push(row.values[column_index].clone()); } Ok(out) } @@ -152,35 +152,36 @@ mod tests { #[test] fn secondary_index_insert_remove_and_range() { - let mut idx = SecondaryIndex::default(); - idx.insert( + let mut secondary_index = SecondaryIndex::default(); + secondary_index.insert( EncodedKey::from_values(&[Value::Integer(10)]), EncodedKey::from_values(&[Value::Integer(1)]), ); - idx.insert( + secondary_index.insert( EncodedKey::from_values(&[Value::Integer(20)]), EncodedKey::from_values(&[Value::Integer(2)]), ); - idx.insert( + secondary_index.insert( EncodedKey::from_values(&[Value::Integer(30)]), EncodedKey::from_values(&[Value::Integer(3)]), ); - let eq = idx.scan_eq(&EncodedKey::from_values(&[Value::Integer(20)])); + let eq = secondary_index.scan_eq(&EncodedKey::from_values(&[Value::Integer(20)])); assert_eq!(eq, vec![EncodedKey::from_values(&[Value::Integer(2)])]); - let range = idx.scan_range( + let range = secondary_index.scan_range( Bound::Included(EncodedKey::from_values(&[Value::Integer(15)])), Bound::Included(EncodedKey::from_values(&[Value::Integer(30)])), ); assert_eq!(range.len(), 2); - idx.remove( + secondary_index.remove( &EncodedKey::from_values(&[Value::Integer(20)]), &EncodedKey::from_values(&[Value::Integer(2)]), ); assert!( - idx.scan_eq(&EncodedKey::from_values(&[Value::Integer(20)])) + secondary_index + .scan_eq(&EncodedKey::from_values(&[Value::Integer(20)])) .is_empty() ); } diff --git a/src/storage/keyspace.rs b/src/storage/keyspace.rs index f7f8e01..53a77ca 100644 --- a/src/storage/keyspace.rs +++ 
b/src/storage/keyspace.rs @@ -890,10 +890,10 @@ fn default_primary_index_backend() -> PrimaryIndexBackend { fn prefix_range_end(prefix: &[u8]) -> Option> { let mut end = prefix.to_vec(); - for idx in (0..end.len()).rev() { - if end[idx] != u8::MAX { - end[idx] = end[idx].saturating_add(1); - end.truncate(idx + 1); + for byte_index in (0..end.len()).rev() { + if end[byte_index] != u8::MAX { + end[byte_index] = end[byte_index].saturating_add(1); + end.truncate(byte_index + 1); return Some(end); } } @@ -907,7 +907,8 @@ fn encode_u256(v: U256) -> Vec { } fn decode_u256(bytes: &[u8]) -> Result { - if bytes.len() != 32 { + let value_size_bytes = bytes.len(); + if value_size_bytes != 32 { return Err(crate::error::AedbError::Validation( "invalid u256 bytes length".into(), )); diff --git a/src/version_store.rs b/src/version_store.rs index 6b6cffe..998ac64 100644 --- a/src/version_store.rs +++ b/src/version_store.rs @@ -224,10 +224,10 @@ impl VersionStore { let mut keyspace = snapshot_to_keyspace(base_keyspace); let mut catalog = (**base_catalog).clone(); - for idx in (base_idx + 1)..=target_idx { + for version_index in (base_idx + 1)..=target_idx { let version = self .versions - .get(idx) + .get(version_index) .ok_or_else(|| AedbError::Validation("delta version missing".into()))?; if let Some(delta) = &version.delta { for mutation in &delta.mutations { diff --git a/src/wal/frame.rs b/src/wal/frame.rs index e4a308c..f16dcba 100644 --- a/src/wal/frame.rs +++ b/src/wal/frame.rs @@ -3,6 +3,12 @@ use std::io::{self, Read, Write}; use thiserror::Error; pub const MAX_FRAME_BODY_BYTES: usize = 64 * 1024 * 1024; +const U32_SIZE_BYTES: usize = 4; +const U64_SIZE_BYTES: usize = 8; +const PAYLOAD_TYPE_SIZE_BYTES: usize = 1; +const CRC32C_SIZE_BYTES: usize = 4; +const MIN_FRAME_BODY_SIZE_BYTES: usize = + U64_SIZE_BYTES + U64_SIZE_BYTES + PAYLOAD_TYPE_SIZE_BYTES + CRC32C_SIZE_BYTES; #[derive(Debug, Clone, PartialEq, Eq)] pub struct Frame { @@ -46,14 +52,20 @@ impl FrameWriter { 
payload_type: u8, payload: &[u8], ) -> Result<(), FrameError> { - let body_len = 8 + 8 + 1 + payload.len() + 4; - let frame_length = u32::try_from(body_len).map_err(|_| FrameError::Corruption)?; + let frame_body_size_bytes = U64_SIZE_BYTES + .saturating_add(U64_SIZE_BYTES) + .saturating_add(PAYLOAD_TYPE_SIZE_BYTES) + .saturating_add(payload.len()) + .saturating_add(CRC32C_SIZE_BYTES); + let frame_length = + u32::try_from(frame_body_size_bytes).map_err(|_| FrameError::Corruption)?; let len_bytes = frame_length.to_be_bytes(); let seq_bytes = commit_seq.to_be_bytes(); let ts_bytes = timestamp_micros.to_be_bytes(); let type_bytes = [payload_type]; - let mut crc_input = Vec::with_capacity(4 + body_len - 4); + let mut crc_input = + Vec::with_capacity(U32_SIZE_BYTES + frame_body_size_bytes - CRC32C_SIZE_BYTES); crc_input.extend_from_slice(&len_bytes); crc_input.extend_from_slice(&seq_bytes); crc_input.extend_from_slice(&ts_bytes); @@ -98,15 +110,15 @@ impl FrameReader { Err(e) => return Err(FrameError::Io(e.to_string())), } let frame_length = u32::from_be_bytes(len_buf); - let body_len = frame_length as usize; - if body_len < 8 + 8 + 1 + 4 { + let frame_body_size_bytes = frame_length as usize; + if frame_body_size_bytes < MIN_FRAME_BODY_SIZE_BYTES { return Err(FrameError::Corruption); } - if body_len > MAX_FRAME_BODY_BYTES { + if frame_body_size_bytes > MAX_FRAME_BODY_BYTES { return Err(FrameError::Corruption); } - let mut body = vec![0u8; body_len]; + let mut body = vec![0u8; frame_body_size_bytes]; match self.inner.read_exact(&mut body) { Ok(_) => {} Err(e) if e.kind() == io::ErrorKind::UnexpectedEof => { @@ -115,26 +127,35 @@ impl FrameReader { Err(e) => return Err(FrameError::Io(e.to_string())), } - let crc_offset = body_len - 4; + let crc_offset_bytes = frame_body_size_bytes.saturating_sub(CRC32C_SIZE_BYTES); + debug_assert!(crc_offset_bytes < frame_body_size_bytes); let stored_crc = u32::from_be_bytes( - body[crc_offset..] + body[crc_offset_bytes..] 
.try_into() .map_err(|_| FrameError::Corruption)?, ); - let mut crc_input = Vec::with_capacity(4 + crc_offset); + let mut crc_input = Vec::with_capacity(U32_SIZE_BYTES + crc_offset_bytes); crc_input.extend_from_slice(&len_buf); - crc_input.extend_from_slice(&body[..crc_offset]); + crc_input.extend_from_slice(&body[..crc_offset_bytes]); let computed_crc = crc32c(&crc_input); if stored_crc != computed_crc { return Err(FrameError::Corruption); } - let commit_seq = - u64::from_be_bytes(body[0..8].try_into().map_err(|_| FrameError::Corruption)?); - let timestamp_micros = - u64::from_be_bytes(body[8..16].try_into().map_err(|_| FrameError::Corruption)?); - let payload_type = body[16]; - let payload = body[17..crc_offset].to_vec(); + let commit_seq = u64::from_be_bytes( + body[0..U64_SIZE_BYTES] + .try_into() + .map_err(|_| FrameError::Corruption)?, + ); + let timestamp_micros = u64::from_be_bytes( + body[U64_SIZE_BYTES..(2 * U64_SIZE_BYTES)] + .try_into() + .map_err(|_| FrameError::Corruption)?, + ); + let payload_type = body[2 * U64_SIZE_BYTES]; + let payload_offset_bytes = (2 * U64_SIZE_BYTES) + PAYLOAD_TYPE_SIZE_BYTES; + debug_assert!(payload_offset_bytes <= crc_offset_bytes); + let payload = body[payload_offset_bytes..crc_offset_bytes].to_vec(); Ok(Some(Frame { frame_length, @@ -149,7 +170,10 @@ impl FrameReader { #[cfg(test)] mod tests { - use super::{FrameError, FrameReader, FrameWriter}; + use super::{ + FrameError, FrameReader, FrameWriter, PAYLOAD_TYPE_SIZE_BYTES, U32_SIZE_BYTES, + U64_SIZE_BYTES, + }; use std::io::Cursor; #[test] @@ -182,16 +206,27 @@ mod tests { .expect("append"); } let mut bytes = writer.into_inner(); - let mut offset = 0usize; - for frame_idx in 1..=10 { - let len = - u32::from_be_bytes(bytes[offset..offset + 4].try_into().expect("len")) as usize; - if frame_idx == 5 { - let payload_start = offset + 4 + 8 + 8 + 1; - bytes[payload_start] ^= 0xFF; + let mut frame_offset_bytes = 0usize; + let frame_count = 10usize; + for frame_index in 
1..=frame_count { + assert!(frame_index <= frame_count); + let frame_body_size_bytes = u32::from_be_bytes( + bytes[frame_offset_bytes..frame_offset_bytes + U32_SIZE_BYTES] + .try_into() + .expect("frame size bytes"), + ) as usize; + let frame_size_bytes = U32_SIZE_BYTES + frame_body_size_bytes; + assert!(frame_offset_bytes + frame_size_bytes <= bytes.len()); + if frame_index == 5 { + let payload_offset_bytes = frame_offset_bytes + + U32_SIZE_BYTES + + U64_SIZE_BYTES + + U64_SIZE_BYTES + + PAYLOAD_TYPE_SIZE_BYTES; + bytes[payload_offset_bytes] ^= 0xFF; break; } - offset += 4 + len; + frame_offset_bytes += frame_size_bytes; } let mut reader = FrameReader::new(Cursor::new(bytes)); diff --git a/src/wal/segment.rs b/src/wal/segment.rs index b4beb1d..fd26ce0 100644 --- a/src/wal/segment.rs +++ b/src/wal/segment.rs @@ -194,9 +194,10 @@ impl SegmentManager { .append(seq, timestamp_micros, payload_type, payload) .map_err(|e| SegmentError::Io(std::io::Error::other(e.to_string())))?; let frame = writer.into_inner(); + let frame_size_bytes = frame.len() as u64; active.file.write_all(&frame)?; active.hasher.update(&frame); - active.size_bytes = active.size_bytes.saturating_add(frame.len() as u64); + active.size_bytes = active.size_bytes.saturating_add(frame_size_bytes); } if sync { active.file.flush()?; diff --git a/tests/backup_restore.rs b/tests/backup_restore.rs index 6e9fe42..208838e 100644 --- a/tests/backup_restore.rs +++ b/tests/backup_restore.rs @@ -411,6 +411,14 @@ async fn namespace_restore_replaces_only_target_namespace() { .expect("app new"); db.shutdown().await.expect("shutdown"); + let live_before_restore = + recover_with_config(live_dir.path(), &config).expect("recover live before restore"); + let other_before_restore = live_before_restore + .keyspace + .kv_get("p", "other", b"k") + .expect("other value exists before restore"); + assert_eq!(other_before_restore.value, b"other-new".to_vec()); + let merged_seq = AedbInstance::restore_namespace_from_backup_chain( 
&[full_dir.path().to_path_buf()], live_dir.path(), diff --git a/tests/benchmark_gate.rs b/tests/benchmark_gate.rs index 801849b..a8bfa5b 100644 --- a/tests/benchmark_gate.rs +++ b/tests/benchmark_gate.rs @@ -41,8 +41,8 @@ fn percentile(sorted: &[u128], p: f64) -> u128 { if sorted.is_empty() { return 0; } - let idx = ((sorted.len().saturating_sub(1)) as f64 * p).round() as usize; - sorted[idx.min(sorted.len() - 1)] + let percentile_index = ((sorted.len().saturating_sub(1)) as f64 * p).round() as usize; + sorted[percentile_index.min(sorted.len() - 1)] } async fn setup(config: AedbConfig, rows: i64) -> (tempfile::TempDir, AedbInstance) { diff --git a/tests/crash_matrix.rs b/tests/crash_matrix.rs index cee9134..bc604b5 100644 --- a/tests/crash_matrix.rs +++ b/tests/crash_matrix.rs @@ -336,11 +336,12 @@ async fn crash_matrix_after_checkpoint_before_manifest_respects_manifest_lower_b let _unreferenced = write_checkpoint_with_key( &recovered_now.keyspace.snapshot(), &recovered_now.catalog, - recovered_now.current_seq, + recovered_now.current_seq.saturating_add(1), dir.path(), config.checkpoint_key(), config.checkpoint_key_id.clone(), HashMap::new(), + config.checkpoint_compression_level, ) .expect("write unreferenced checkpoint"); diff --git a/tests/naming_conventions.rs b/tests/naming_conventions.rs new file mode 100644 index 0000000..4886b56 --- /dev/null +++ b/tests/naming_conventions.rs @@ -0,0 +1,126 @@ +use std::fs; +use std::path::{Path, PathBuf}; + +fn assert_no_ambiguous_locals(file: &str, source: &str) { + let banned = [ + "let len =", + "let mut len =", + "let idx =", + "let mut idx =", + "let index =", + "let mut index =", + "let size =", + "let mut size =", + "let offset =", + "let mut offset =", + "for idx in", + "for index in", + "for size in", + "for offset in", + ]; + for (line_number, line) in source.lines().enumerate() { + for pattern in banned { + assert!( + !line.contains(pattern), + "{file}:{} contains banned pattern `{pattern}`: {line}", + 
line_number + 1 + ); + } + } +} + +fn collect_rust_files(root: &Path, out: &mut Vec) { + let Ok(entries) = fs::read_dir(root) else { + return; + }; + for entry in entries.flatten() { + let path = entry.path(); + if path.is_dir() { + collect_rust_files(&path, out); + continue; + } + if path.extension().is_some_and(|ext| ext == "rs") { + out.push(path); + } + } +} + +#[test] +fn critical_modules_avoid_ambiguous_index_size_offset_len_bindings() { + let files: [(&str, &str); 16] = [ + ("src/wal/frame.rs", include_str!("../src/wal/frame.rs")), + ("src/wal/segment.rs", include_str!("../src/wal/segment.rs")), + ( + "src/recovery/replay.rs", + include_str!("../src/recovery/replay.rs"), + ), + ( + "src/recovery/scanner.rs", + include_str!("../src/recovery/scanner.rs"), + ), + ( + "src/recovery/mod.rs", + include_str!("../src/recovery/mod.rs"), + ), + ( + "src/checkpoint/loader.rs", + include_str!("../src/checkpoint/loader.rs"), + ), + ( + "src/commit/executor/mod.rs", + include_str!("../src/commit/executor/mod.rs"), + ), + ( + "src/commit/executor/internals.rs", + include_str!("../src/commit/executor/internals.rs"), + ), + ( + "src/query/executor.rs", + include_str!("../src/query/executor.rs"), + ), + ( + "src/query/operators.rs", + include_str!("../src/query/operators.rs"), + ), + ( + "src/commit/validation.rs", + include_str!("../src/commit/validation.rs"), + ), + ( + "src/commit/apply.rs", + include_str!("../src/commit/apply.rs"), + ), + ( + "src/commit/executor/global_index.rs", + include_str!("../src/commit/executor/global_index.rs"), + ), + ( + "src/storage/encoded_key.rs", + include_str!("../src/storage/encoded_key.rs"), + ), + ( + "src/storage/keyspace.rs", + include_str!("../src/storage/keyspace.rs"), + ), + ( + "src/storage/index.rs", + include_str!("../src/storage/index.rs"), + ), + ]; + for (file, source) in files { + assert_no_ambiguous_locals(file, source); + } +} + +#[test] +fn all_src_files_avoid_ambiguous_index_size_offset_len_bindings() { + let src_root = 
Path::new(env!("CARGO_MANIFEST_DIR")).join("src"); + let mut files = Vec::new(); + collect_rust_files(&src_root, &mut files); + files.sort(); + for file in files { + let source = fs::read_to_string(&file).expect("read source file"); + let display = file.to_string_lossy().to_string(); + assert_no_ambiguous_locals(&display, &source); + } +} diff --git a/tests/order_book_simulation.rs b/tests/order_book_simulation.rs index 8e6387e..7258927 100644 --- a/tests/order_book_simulation.rs +++ b/tests/order_book_simulation.rs @@ -1,4 +1,5 @@ use aedb::AedbInstance; +use aedb::config::AedbConfig; use aedb::error::AedbError; use aedb::order_book::{ ExecInstruction, InstrumentConfig, OrderRequest, OrderSide, OrderStatus, OrderType, @@ -7,7 +8,7 @@ use aedb::order_book::{ use aedb::query::plan::ConsistencyMode; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; -use std::collections::BTreeMap; +use std::collections::{BTreeMap, HashMap}; use std::sync::Arc; use tempfile::tempdir; @@ -141,6 +142,8 @@ struct ChaosMetrics { lifecycle_accepted: usize, lifecycle_rejected: usize, reader_checks: usize, + attempted_tps: u64, + accepted_tps: u64, } async fn validate_asset_read_consistency(db: &AedbInstance, asset: &str) -> Result<(), AedbError> { @@ -192,12 +195,14 @@ async fn validate_asset_read_consistency(db: &AedbInstance, asset: &str) -> Resu } async fn run_simulation( + config: AedbConfig, assets: Vec, traders: usize, ops_per_trader: usize, ) -> Result<(Arc, ChaosMetrics), AedbError> { + let started = std::time::Instant::now(); let dir = tempdir().map_err(AedbError::Io)?; - let db = Arc::new(AedbInstance::open(Default::default(), dir.path())?); + let db = Arc::new(AedbInstance::open(config, dir.path())?); setup_books(&db, &assets).await?; let mut tasks = Vec::with_capacity(traders); @@ -318,6 +323,7 @@ async fn run_simulation( lifecycle_accepted, lifecycle_rejected, reader_checks: 0, + ..Default::default() }) })); } @@ -368,6 +374,29 @@ async fn run_simulation( )); } + let 
elapsed = started.elapsed().as_secs_f64().max(0.001); + let attempted_tps = + ((metrics.primary_attempted + metrics.lifecycle_attempted) as f64 / elapsed) as u64; + let accepted_tps = + ((metrics.primary_accepted + metrics.lifecycle_accepted) as f64 / elapsed) as u64; + metrics.attempted_tps = attempted_tps; + metrics.accepted_tps = accepted_tps; + eprintln!( + "order_book_simulation: assets={} traders={} ops_per_trader={} primary_attempted={} primary_accepted={} primary_rejected={} lifecycle_attempted={} lifecycle_accepted={} lifecycle_rejected={} reader_checks={} attempted_tps={} accepted_tps={}", + assets.len(), + traders, + ops_per_trader, + metrics.primary_attempted, + metrics.primary_accepted, + metrics.primary_rejected, + metrics.lifecycle_attempted, + metrics.lifecycle_accepted, + metrics.lifecycle_rejected, + metrics.reader_checks, + attempted_tps, + accepted_tps + ); + Ok((db, metrics)) } @@ -441,16 +470,134 @@ async fn assert_book_invariants(db: &AedbInstance, assets: &[String]) -> Result< Ok(()) } +async fn assert_trade_and_report_parity( + db: &AedbInstance, + assets: &[String], +) -> Result<(), AedbError> { + for asset in assets { + let mut orders_by_id: HashMap = HashMap::new(); + let order_rows = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{asset}:ord:").as_bytes(), + 2_000_000, + ConsistencyMode::AtLatest, + ) + .await + .map_err(|e| AedbError::Validation(e.to_string()))?; + for (_, entry) in order_rows { + let order: aedb::order_book::OrderRecord = rmp_serde::from_slice(&entry.value) + .map_err(|e| AedbError::Decode(e.to_string()))?; + orders_by_id.insert(order.order_id, order); + } + + let trade_rows = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{asset}:trade:").as_bytes(), + 2_000_000, + ConsistencyMode::AtLatest, + ) + .await + .map_err(|e| AedbError::Validation(e.to_string()))?; + let mut trades_by_fill_id: HashMap = HashMap::new(); + let mut filled_from_trades: HashMap = HashMap::new(); + for (_, entry) in 
trade_rows { + let fill: aedb::order_book::FillRecord = rmp_serde::from_slice(&entry.value) + .map_err(|e| AedbError::Decode(e.to_string()))?; + let fill_qty = decode_u256_u64(fill.qty_be); + *filled_from_trades + .entry(fill.aggressor_order_id) + .or_insert(0) += fill_qty; + *filled_from_trades.entry(fill.passive_order_id).or_insert(0) += fill_qty; + trades_by_fill_id.insert(fill.fill_id, fill); + } + + for (order_id, order) in &orders_by_id { + let expected = decode_u256_u64(order.filled_qty_be); + let observed = *filled_from_trades.get(order_id).unwrap_or(&0); + if observed != expected { + return Err(AedbError::Validation(format!( + "trade parity mismatch for {asset} order_id={order_id}: observed_filled={observed} expected_filled={expected}" + ))); + } + } + + let last_key = aedb::order_book::key_execution_report_last(asset); + let report_rows = db + .kv_scan_prefix_no_auth( + "p", + "app", + format!("ob:{asset}:report:").as_bytes(), + 2_000_000, + ConsistencyMode::AtLatest, + ) + .await + .map_err(|e| AedbError::Validation(e.to_string()))?; + for (key, entry) in report_rows { + if key == last_key { + continue; + } + let report: aedb::order_book::ExecutionReport = rmp_serde::from_slice(&entry.value) + .map_err(|e| AedbError::Decode(e.to_string()))?; + for fill in &report.fills { + let Some(persisted) = trades_by_fill_id.get(&fill.fill_id) else { + return Err(AedbError::Validation(format!( + "execution report references missing fill: asset={asset} order_id={} fill_id={}", + report.order_id, fill.fill_id + ))); + }; + if persisted != fill { + return Err(AedbError::Validation(format!( + "execution report fill mismatch: asset={asset} order_id={} fill_id={}", + report.order_id, fill.fill_id + ))); + } + } + if report.order_id != 0 && !orders_by_id.contains_key(&report.order_id) { + return Err(AedbError::Validation(format!( + "execution report references unknown order_id: asset={asset} order_id={}", + report.order_id + ))); + } + } + } + Ok(()) +} + #[tokio::test] 
async fn order_book_simulation_smoke() { let assets = vec!["BTC-USD".to_string(), "ETH-USD".to_string()]; - let (db, metrics) = run_simulation(assets.clone(), 6, 250) + let (db, metrics) = run_simulation(AedbConfig::default(), assets.clone(), 6, 250) .await .expect("run simulation"); assert!(metrics.reader_checks > 0, "reader workers should execute"); assert_book_invariants(&db, &assets) .await .expect("final invariants"); + assert_trade_and_report_parity(&db, &assets) + .await + .expect("trade/report parity invariants"); + let op = db.operational_metrics().await; + assert_eq!( + op.queue_full_rejections, 0, + "smoke load should not hit queue-full rejections" + ); + assert_eq!( + op.timeout_rejections, 0, + "smoke load should not hit timeout rejections" + ); + assert_eq!( + op.validation_rejections as usize, + metrics.primary_rejected + metrics.lifecycle_rejected, + "validation rejection accounting should match simulation-level rejects" + ); + assert!( + op.wal_sync_ops > 0, + "full-durability smoke run should execute WAL sync operations" + ); } #[tokio::test] @@ -461,7 +608,7 @@ async fn order_book_chaos_read_write_accuracy() { "SOL-USD".to_string(), "DOGE-USD".to_string(), ]; - let (db, metrics) = run_simulation(assets.clone(), 16, 800) + let (db, metrics) = run_simulation(AedbConfig::default(), assets.clone(), 16, 800) .await .expect("chaos run"); assert!( @@ -475,6 +622,45 @@ async fn order_book_chaos_read_write_accuracy() { assert_book_invariants(&db, &assets) .await .expect("final invariants"); + assert_trade_and_report_parity(&db, &assets) + .await + .expect("trade/report parity invariants"); + let op = db.operational_metrics().await; + eprintln!( + "order_book_chaos_metrics: commits_total={} commit_errors={} permission_rejections={} validation_rejections={} queue_full_rejections={} timeout_rejections={} conflict_rejections={} wal_append_ops={} wal_append_bytes={} avg_wal_append_micros={} wal_sync_ops={} avg_wal_sync_micros={} queue_depth={} 
inflight_commits={} durable_head_lag={}", + op.commits_total, + op.commit_errors, + op.permission_rejections, + op.validation_rejections, + op.queue_full_rejections, + op.timeout_rejections, + op.conflict_rejections, + op.wal_append_ops, + op.wal_append_bytes, + op.avg_wal_append_micros, + op.wal_sync_ops, + op.avg_wal_sync_micros, + op.queue_depth, + op.inflight_commits, + op.durable_head_lag + ); + assert_eq!( + op.queue_full_rejections, 0, + "chaos baseline should not trigger queue-full rejections" + ); + assert_eq!( + op.timeout_rejections, 0, + "chaos baseline should not trigger timeout rejections" + ); + assert_eq!( + op.validation_rejections as usize, + metrics.primary_rejected + metrics.lifecycle_rejected, + "validation rejection accounting should match simulation-level rejects" + ); + assert!( + op.wal_sync_ops > 0, + "full-durability chaos run should execute WAL sync operations" + ); } #[tokio::test] @@ -486,10 +672,53 @@ async fn order_book_simulation_hft_soak() { "SOL-USD".to_string(), "DOGE-USD".to_string(), ]; - let (db, _metrics) = run_simulation(assets.clone(), 24, 2_000) + let (db, _metrics) = run_simulation(AedbConfig::default(), assets.clone(), 24, 2_000) .await .expect("hft soak"); assert_book_invariants(&db, &assets) .await .expect("final invariants"); + assert_trade_and_report_parity(&db, &assets) + .await + .expect("trade/report parity invariants"); +} + +#[tokio::test] +#[ignore = "profiling comparison: full durability vs low-latency durability"] +async fn order_book_durability_profile_compare() { + let assets = vec![ + "BTC-USD".to_string(), + "ETH-USD".to_string(), + "SOL-USD".to_string(), + "DOGE-USD".to_string(), + ]; + let (full_db, full_metrics) = run_simulation(AedbConfig::default(), assets.clone(), 12, 600) + .await + .expect("full profile run"); + let full_op = full_db.operational_metrics().await; + + let (low_db, low_metrics) = + run_simulation(AedbConfig::low_latency([7u8; 32]), assets.clone(), 12, 600) + .await + 
.expect("low-latency profile run"); + let low_op = low_db.operational_metrics().await; + + eprintln!( + "order_book_durability_compare: full_attempted_tps={} low_attempted_tps={} full_wal_append_ops={} low_wal_append_ops={} full_avg_wal_append_us={} low_avg_wal_append_us={} full_wal_sync_ops={} low_wal_sync_ops={} full_avg_wal_sync_us={} low_avg_wal_sync_us={}", + full_metrics.attempted_tps, + low_metrics.attempted_tps, + full_op.wal_append_ops, + low_op.wal_append_ops, + full_op.avg_wal_append_micros, + low_op.avg_wal_append_micros, + full_op.wal_sync_ops, + low_op.wal_sync_ops, + full_op.avg_wal_sync_micros, + low_op.avg_wal_sync_micros + ); + + assert!( + low_op.wal_sync_ops < full_op.wal_sync_ops, + "low-latency durability should reduce WAL sync operation count under identical workload" + ); } diff --git a/tests/security_properties_proptest.rs b/tests/security_properties_proptest.rs index 54dcd1d..57bc3f4 100644 --- a/tests/security_properties_proptest.rs +++ b/tests/security_properties_proptest.rs @@ -5,6 +5,9 @@ use aedb::commit::validation::Mutation; use aedb::config::AedbConfig; use aedb::error::AedbError; use aedb::offline; +use aedb::order_book::{ + ExecInstruction, OrderRequest, OrderSide, OrderType, SelfTradePrevention, TimeInForce, +}; use aedb::query::plan::ConsistencyMode; use proptest::prelude::*; use proptest::test_runner::TestCaseError; @@ -193,4 +196,62 @@ proptest! 
{ }); outcome?; } + + #[test] + fn prop_order_book_new_rejects_malformed_inputs( + instrument in prop_oneof![ + Just(String::new()), + Just(" ".to_string()), + prop::collection::vec(any::(), 1025..1200) + .prop_map(|v| String::from_utf8_lossy(&v).into_owned()), + ], + owner in prop_oneof![ + Just(String::new()), + Just("\n\t ".to_string()), + prop::collection::vec(any::(), 1025..1200) + .prop_map(|v| String::from_utf8_lossy(&v).into_owned()), + ], + client_order_id in prop_oneof![ + Just(String::new()), + Just(" ".to_string()), + prop::collection::vec(any::(), 1025..1200) + .prop_map(|v| String::from_utf8_lossy(&v).into_owned()), + ], + ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("runtime"); + let outcome: Result<(), TestCaseError> = rt.block_on(async move { + let dir = tempdir().expect("temp dir"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + + let err = db + .order_book_new( + "p", + "app", + OrderRequest { + instrument, + client_order_id, + side: OrderSide::Bid, + order_type: OrderType::Limit, + time_in_force: TimeInForce::Gtc, + exec_instructions: ExecInstruction(0), + self_trade_prevention: SelfTradePrevention::None, + price_ticks: 100, + qty_be: one_u256(), + owner, + account: None, + nonce: 1, + price_limit_ticks: None, + }, + ) + .await + .expect_err("malformed orderbook request must fail validation"); + prop_assert!(matches!(err, AedbError::Validation(_))); + Ok(()) + }); + outcome?; + } } diff --git a/tests/wal_frame_robustness.rs b/tests/wal_frame_robustness.rs new file mode 100644 index 0000000..4fc3103 --- /dev/null +++ b/tests/wal_frame_robustness.rs @@ -0,0 +1,35 @@ +use aedb::wal::frame::{FrameError, FrameReader}; +use proptest::prelude::*; +use std::io::Cursor; + +proptest! { + #![proptest_config(ProptestConfig { + cases: 64, + .. 
ProptestConfig::default() + })] + + #[test] + fn prop_random_bytes_never_panic(bytes in prop::collection::vec(any::(), 0..4096)) { + // Exercise prefix truncations to emulate torn writes and random tails. + let mut lengths = vec![bytes.len()]; + if !bytes.is_empty() { + lengths.push(bytes.len() / 2); + lengths.push(bytes.len().saturating_sub(1)); + } + lengths.sort_unstable(); + lengths.dedup(); + + for len in lengths { + let mut candidate = bytes.clone(); + candidate.truncate(len); + let mut reader = FrameReader::new(Cursor::new(candidate)); + loop { + match reader.next_frame() { + Ok(Some(_)) => {} + Ok(None) => break, + Err(FrameError::Truncation) | Err(FrameError::Corruption) | Err(FrameError::Io(_)) => break, + } + } + } + } +} From a33ac9ddc453bdc7b1d1eb7259b088a8cf53baa8 Mon Sep 17 00:00:00 2001 From: johnny Date: Fri, 27 Feb 2026 19:20:16 -0500 Subject: [PATCH 4/4] clippy --- src/checkpoint/writer.rs | 1 + src/commit/executor/internals.rs | 5 ----- src/lib.rs | 21 +++++++++++++++------ src/order_book.rs | 25 ++++++++++++++----------- 4 files changed, 30 insertions(+), 22 deletions(-) diff --git a/src/checkpoint/writer.rs b/src/checkpoint/writer.rs index 2b377d9..08cebe1 100644 --- a/src/checkpoint/writer.rs +++ b/src/checkpoint/writer.rs @@ -40,6 +40,7 @@ pub fn write_checkpoint( write_checkpoint_with_key(snapshot, catalog, seq, dir, None, None, HashMap::new(), 3) } +#[allow(clippy::too_many_arguments)] pub fn write_checkpoint_with_key( snapshot: &KeyspaceSnapshot, catalog: &Catalog, diff --git a/src/commit/executor/internals.rs b/src/commit/executor/internals.rs index d7deb91..f5c263b 100644 --- a/src/commit/executor/internals.rs +++ b/src/commit/executor/internals.rs @@ -634,7 +634,6 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, - ..EpochProcessResult::default() }; } @@ -674,7 +673,6 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, - ..EpochProcessResult::default() }; } @@ 
-738,7 +736,6 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, - ..EpochProcessResult::default() }; } wal_append_ops = wal_append_ops.saturating_add(1); @@ -776,7 +773,6 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, - ..EpochProcessResult::default() }; } wal_sync_ops = wal_sync_ops.saturating_add(1); @@ -905,7 +901,6 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, - ..EpochProcessResult::default() } } diff --git a/src/lib.rs b/src/lib.rs index 82c9df5..a440b2b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -3270,6 +3270,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] pub async fn order_book_cancel_as_with_finality( &self, caller: CallerContext, @@ -3287,6 +3288,7 @@ impl AedbInstance { Ok(result) } + #[allow(clippy::too_many_arguments)] pub async fn order_book_cancel_strict_as( &self, caller: CallerContext, @@ -3309,6 +3311,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] async fn order_book_cancel_strict_as_internal( &self, caller: Option, @@ -3321,8 +3324,7 @@ impl AedbInstance { ) -> Result { let lease = self .acquire_snapshot(ConsistencyMode::AtLatest) - .await - .map_err(AedbError::from)?; + .await?; let order_key = key_order(instrument, order_id); let Some(entry) = lease.view.keyspace.kv_get(project_id, scope_id, &order_key) else { return Err(AedbError::Validation(format!( @@ -3443,6 +3445,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] pub async fn order_book_cancel_by_client_id_strict_as( &self, caller: CallerContext, @@ -3465,6 +3468,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] async fn order_book_cancel_by_client_id_strict_as_internal( &self, caller: Option, @@ -3477,8 +3481,7 @@ impl AedbInstance { ) -> Result { let lease = self .acquire_snapshot(ConsistencyMode::AtLatest) - .await - .map_err(AedbError::from)?; + .await?; let cid_key = 
key_client_id(instrument, owner, client_order_id); let Some(cid_entry) = lease.view.keyspace.kv_get(project_id, scope_id, &cid_key) else { return Err(AedbError::Validation(format!( @@ -3672,6 +3675,7 @@ impl AedbInstance { self.commit_envelope_with_finality(envelope, finality).await } + #[allow(clippy::too_many_arguments)] pub async fn order_book_reduce_strict( &self, project_id: &str, @@ -3695,6 +3699,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] pub async fn order_book_reduce_strict_as( &self, caller: CallerContext, @@ -3719,6 +3724,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] async fn order_book_reduce_strict_as_internal( &self, caller: Option, @@ -3782,8 +3788,7 @@ impl AedbInstance { ) -> Result<(Vec, u64, u64), AedbError> { let lease = self .acquire_snapshot(ConsistencyMode::AtLatest) - .await - .map_err(AedbError::from)?; + .await?; let order_key = key_order(instrument, order_id); let Some(entry) = lease.view.keyspace.kv_get(project_id, scope_id, &order_key) else { return Err(AedbError::Validation(format!( @@ -3810,6 +3815,7 @@ impl AedbInstance { Ok((order_key, entry.version, lease.view.seq)) } + #[allow(clippy::too_many_arguments)] pub async fn order_book_cancel_replace( &self, project_id: &str, @@ -3870,6 +3876,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] pub async fn order_book_mass_cancel( &self, project_id: &str, @@ -3895,6 +3902,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] pub async fn order_book_mass_cancel_as( &self, caller: CallerContext, @@ -3944,6 +3952,7 @@ impl AedbInstance { .await } + #[allow(clippy::too_many_arguments)] pub async fn order_book_reduce_as( &self, caller: CallerContext, diff --git a/src/order_book.rs b/src/order_book.rs index 3a45ad8..f0b76f6 100644 --- a/src/order_book.rs +++ b/src/order_book.rs @@ -637,20 +637,21 @@ pub fn apply_order_book_new( return Err(AedbError::Validation("duplicate client_order_id".into())); } 
- if effective_request.exec_instructions.post_only() { - if let Some(best_price) = best_price_for_side( + if effective_request.exec_instructions.post_only() + && let Some(best_price) = best_price_for_side( keyspace, project_id, scope_id, &effective_request.instrument, effective_request.side.opposite(), - ) && crosses( + ) + && crosses( effective_request.side, effective_request.price_ticks, best_price, - ) { - return Err(AedbError::Validation("post_only would cross".into())); - } + ) + { + return Err(AedbError::Validation("post_only would cross".into())); } if matches!(effective_request.time_in_force, TimeInForce::Fok) @@ -832,12 +833,10 @@ pub fn apply_order_book_new( } else { OrderStatus::PartiallyFilled } + } else if filled.is_zero() { + OrderStatus::Cancelled } else { - if filled.is_zero() { - OrderStatus::Cancelled - } else { - OrderStatus::PartiallyFilled - } + OrderStatus::PartiallyFilled }; let record = OrderRecord { @@ -938,6 +937,7 @@ pub fn apply_order_book_new( ) } +#[allow(clippy::too_many_arguments)] pub fn apply_order_book_cancel( keyspace: &mut Keyspace, project_id: &str, @@ -1237,6 +1237,7 @@ pub fn apply_order_book_mass_cancel( Ok(()) } +#[allow(clippy::too_many_arguments)] pub fn apply_order_book_reduce( keyspace: &mut Keyspace, project_id: &str, @@ -2054,6 +2055,7 @@ fn clear_open_order( Ok(()) } +#[allow(clippy::too_many_arguments)] fn dec_price_level_qty( keyspace: &mut Keyspace, project_id: &str, @@ -2072,6 +2074,7 @@ fn dec_price_level_qty( Ok(()) } +#[allow(clippy::too_many_arguments)] fn write_fill( keyspace: &mut Keyspace, project_id: &str,