From f1d8a005e2161d3151058946f8e6c370ba610b63 Mon Sep 17 00:00:00 2001 From: johnny Date: Thu, 26 Feb 2026 15:14:17 -0500 Subject: [PATCH 1/3] fuzzy matching --- src/commit/executor/internals.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/commit/executor/internals.rs b/src/commit/executor/internals.rs index f5c263b..d7deb91 100644 --- a/src/commit/executor/internals.rs +++ b/src/commit/executor/internals.rs @@ -634,6 +634,7 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, + ..EpochProcessResult::default() }; } @@ -673,6 +674,7 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, + ..EpochProcessResult::default() }; } @@ -736,6 +738,7 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, + ..EpochProcessResult::default() }; } wal_append_ops = wal_append_ops.saturating_add(1); @@ -773,6 +776,7 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, + ..EpochProcessResult::default() }; } wal_sync_ops = wal_sync_ops.saturating_add(1); @@ -901,6 +905,7 @@ pub(super) fn process_commit_epoch( wal_sync_micros, sync_executed, catalog_changed, + ..EpochProcessResult::default() } } From 91e118d067c8e3d7ee5a681cd5e58a1d169dd7e2 Mon Sep 17 00:00:00 2001 From: johnny Date: Fri, 27 Feb 2026 23:13:56 -0500 Subject: [PATCH 2/3] improve join performance --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/lib.rs | 53 +- src/lib_tests.rs | 92 +++ src/query/executor/aggregate.rs | 31 + src/query/executor/cursor.rs | 103 +++ src/query/executor/indexing.rs | 291 +++++++ src/query/executor/join.rs | 379 +++++++++ src/query/executor/mod.rs | 468 +++++++++++ src/query/executor/predicate.rs | 42 + src/query/executor/tests.rs | 1340 +++++++++++++++++++++++++++++++ src/query/executor/validate.rs | 128 +++ tests/naming_conventions.rs | 4 +- 13 files changed, 2929 insertions(+), 6 deletions(-) create mode 100644 src/query/executor/aggregate.rs create
mode 100644 src/query/executor/cursor.rs create mode 100644 src/query/executor/indexing.rs create mode 100644 src/query/executor/join.rs create mode 100644 src/query/executor/mod.rs create mode 100644 src/query/executor/predicate.rs create mode 100644 src/query/executor/tests.rs create mode 100644 src/query/executor/validate.rs diff --git a/Cargo.lock b/Cargo.lock index c1e433e..eecc11a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -14,7 +14,7 @@ dependencies = [ [[package]] name = "aedb" -version = "0.1.3" +version = "0.1.4" dependencies = [ "aes-gcm", "blake3", diff --git a/Cargo.toml b/Cargo.toml index 2cebd57..5dfd526 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "aedb" -version = "0.1.3" +version = "0.1.4" edition = "2024" description = "Embedded Rust storage engine with transactional commits, WAL durability, and snapshot-consistent reads" license = "MIT OR Apache-2.0" diff --git a/src/lib.rs b/src/lib.rs index a440b2b..7ea4d33 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -6350,7 +6350,11 @@ impl ReadTx<'_> { hydrate_query: Query, hydrate_key_column: &str, ) -> Result<(QueryResult, QueryResult), QueryError> { - let source = self.query(project_id, scope_id, source_query).await?; + let mut source = self.query(project_id, scope_id, source_query).await?; + // This helper treats the source query's limit as the caller's complete key set. + // Do not surface pagination state from the underlying query engine here. 
+ source.cursor = None; + source.truncated = false; let keys = source .rows .iter() @@ -6363,6 +6367,7 @@ impl ReadTx<'_> { rows: Vec::new(), rows_examined: 0, cursor: None, + truncated: false, snapshot_seq: self.lease.view.seq, materialized_seq: None, }, @@ -6376,7 +6381,51 @@ impl ReadTx<'_> { }), ..hydrate_query }; - let hydrated = self.query(project_id, scope_id, hydrate_query).await?; + let page_size = self.db._config.max_scan_rows.min(100).max(1); + let mut hydrate_query = ensure_stable_order_from_catalog( + project_id, + scope_id, + &self.lease.view.catalog, + hydrate_query, + ); + hydrate_query.limit = Some(page_size); + + let mut all_rows = Vec::new(); + let mut total_rows_examined = 0usize; + let mut next_cursor: Option = None; + let mut materialized_seq = None; + loop { + let page = self + .query_with_options( + project_id, + scope_id, + hydrate_query.clone(), + QueryOptions { + consistency: ConsistencyMode::AtSeq(self.lease.view.seq), + cursor: next_cursor.clone(), + ..QueryOptions::default() + }, + ) + .await?; + total_rows_examined = total_rows_examined.saturating_add(page.rows_examined); + if materialized_seq.is_none() { + materialized_seq = page.materialized_seq; + } + all_rows.extend(page.rows); + if let Some(cursor) = page.cursor { + next_cursor = Some(cursor); + continue; + } + break; + } + let hydrated = QueryResult { + rows: all_rows, + rows_examined: total_rows_examined, + cursor: None, + truncated: false, + snapshot_seq: self.lease.view.seq, + materialized_seq, + }; Ok((source, hydrated)) } } diff --git a/src/lib_tests.rs b/src/lib_tests.rs index 69306b9..0795ba1 100644 --- a/src/lib_tests.rs +++ b/src/lib_tests.rs @@ -6616,7 +6616,99 @@ async fn list_batch_and_lookup_helpers_work() { .await .expect("lookup/hydrate"); assert_eq!(source.rows.len(), 3); + assert!(!source.truncated); assert_eq!(hydrated.rows.len(), 3); + assert!(!hydrated.truncated); +} + +#[tokio::test] +async fn lookup_then_hydrate_fetches_all_pages_for_large_key_sets() { + 
let dir = tempdir().expect("temp"); + let db = AedbInstance::open(AedbConfig::default(), dir.path()).expect("open"); + db.create_project("p").await.expect("project"); + db.create_scope("p", "app").await.expect("scope"); + + create_table( + &db, + "p", + "app", + "items", + vec![ + ColumnDef { + name: "id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "user_id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ], + vec!["id"], + ) + .await; + create_table( + &db, + "p", + "app", + "users", + vec![ + ColumnDef { + name: "id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "username".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["id"], + ) + .await; + + for id in 1_i64..=1_000_i64 { + db.commit(Mutation::Upsert { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "items".into(), + primary_key: vec![Value::Integer(id)], + row: Row::from_values(vec![Value::Integer(id), Value::Integer(id)]), + }) + .await + .expect("insert item"); + db.commit(Mutation::Upsert { + project_id: "p".into(), + scope_id: "app".into(), + table_name: "users".into(), + primary_key: vec![Value::Integer(id)], + row: Row::from_values(vec![ + Value::Integer(id), + Value::Text(format!("u{id}").into()), + ]), + }) + .await + .expect("insert user"); + } + + let (source, hydrated) = db + .lookup_then_hydrate( + "p", + "app", + Query::select(&["user_id"]).from("items").limit(1_000), + 0, + Query::select(&["id", "username"]).from("users"), + "id", + ConsistencyMode::AtLatest, + ) + .await + .expect("lookup/hydrate"); + assert_eq!(source.rows.len(), 1_000); + assert!(!source.truncated); + assert_eq!(hydrated.rows.len(), 1_000); + assert!(!hydrated.truncated); } #[tokio::test] diff --git a/src/query/executor/aggregate.rs b/src/query/executor/aggregate.rs new file mode 100644 index 0000000..19c2282 --- /dev/null +++ b/src/query/executor/aggregate.rs @@ -0,0 +1,31 @@ +use 
crate::query::error::QueryError; +use crate::query::plan::Aggregate; + +pub(super) fn aggregate_col_idx( + agg: &Aggregate, + columns: &[String], +) -> Result, QueryError> { + let column_index = match agg { + Aggregate::Count => return Ok(None), + Aggregate::Sum(col) | Aggregate::Min(col) | Aggregate::Max(col) | Aggregate::Avg(col) => { + columns + .iter() + .position(|c| c == col) + .ok_or_else(|| QueryError::ColumnNotFound { + table: "".to_string(), + column: col.clone(), + })? + } + }; + Ok(Some(column_index)) +} + +pub(super) fn aggregate_output_name(agg: &Aggregate) -> String { + match agg { + Aggregate::Count => "count_star".to_string(), + Aggregate::Sum(col) => format!("sum_{col}"), + Aggregate::Min(col) => format!("min_{col}"), + Aggregate::Max(col) => format!("max_{col}"), + Aggregate::Avg(col) => format!("avg_{col}"), + } +} diff --git a/src/query/executor/cursor.rs b/src/query/executor/cursor.rs new file mode 100644 index 0000000..4b5e699 --- /dev/null +++ b/src/query/executor/cursor.rs @@ -0,0 +1,103 @@ +use crate::catalog::types::{Row, Value}; +use crate::query::error::QueryError; +use serde::{Deserialize, Serialize}; + +pub(super) fn extract_sort_key( + row: &Row, + sort_indices: &[(usize, crate::query::plan::Order)], +) -> Vec { + if sort_indices.is_empty() { + return row.values.clone(); + } + sort_indices + .iter() + .map(|(idx, _)| row.values[*idx].clone()) + .collect() +} + +pub(super) fn extract_pk_key(row: &Row, pk_indices: &[usize]) -> Vec { + if pk_indices.is_empty() { + return row.values.clone(); + } + pk_indices + .iter() + .map(|idx| row.values[*idx].clone()) + .collect() +} + +pub(super) fn row_after_cursor( + row: &Row, + cursor: &CursorToken, + sort_indices: &[(usize, crate::query::plan::Order)], + pk_indices: &[usize], +) -> bool { + let row_sort = extract_sort_key(row, sort_indices); + let row_pk = extract_pk_key(row, pk_indices); + if sort_indices.is_empty() { + return row_pk > cursor.last_pk; + } + for ((_, order), (lhs, rhs)) in 
sort_indices + .iter() + .zip(row_sort.iter().zip(cursor.last_sort_key.iter())) + { + let cmp = lhs.cmp(rhs); + if cmp.is_eq() { + continue; + } + return match order { + crate::query::plan::Order::Asc => cmp.is_gt(), + crate::query::plan::Order::Desc => cmp.is_lt(), + }; + } + row_pk > cursor.last_pk +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub(super) struct CursorToken { + pub(super) snapshot_seq: u64, + pub(super) last_sort_key: Vec, + pub(super) last_pk: Vec, + pub(super) page_size: usize, + pub(super) remaining_limit: Option, +} + +pub(super) fn encode_cursor(cursor: &CursorToken) -> Result { + let bytes = rmp_serde::to_vec(cursor).map_err(|e| QueryError::InternalError(e.to_string()))?; + Ok(bytes.iter().map(|b| format!("{b:02x}")).collect()) +} + +pub(super) fn decode_cursor(encoded: &str) -> Result { + let encoded_size_bytes = encoded.len(); + if !encoded_size_bytes.is_multiple_of(2) { + return Err(QueryError::InvalidQuery { + reason: "invalid cursor".into(), + }); + } + let mut decoded_bytes = Vec::with_capacity(encoded_size_bytes / 2); + let encoded_bytes = encoded.as_bytes(); + for byte_offset in (0..encoded_bytes.len()).step_by(2) { + let hi = decode_hex_nibble(encoded_bytes[byte_offset]).ok_or_else(|| { + QueryError::InvalidQuery { + reason: "invalid cursor".into(), + } + })?; + let lo = decode_hex_nibble(encoded_bytes[byte_offset + 1]).ok_or_else(|| { + QueryError::InvalidQuery { + reason: "invalid cursor".into(), + } + })?; + decoded_bytes.push((hi << 4) | lo); + } + rmp_serde::from_slice(&decoded_bytes).map_err(|e| QueryError::InvalidQuery { + reason: e.to_string(), + }) +} + +fn decode_hex_nibble(byte: u8) -> Option { + match byte { + b'0'..=b'9' => Some(byte - b'0'), + b'a'..=b'f' => Some(byte - b'a' + 10), + b'A'..=b'F' => Some(byte - b'A' + 10), + _ => None, + } +} diff --git a/src/query/executor/indexing.rs b/src/query/executor/indexing.rs new file mode 100644 index 0000000..d2be4b8 --- /dev/null +++ 
b/src/query/executor/indexing.rs @@ -0,0 +1,291 @@ +use crate::catalog::Catalog; +use crate::catalog::namespace_key; +use crate::catalog::types::Value; +use crate::query::error::QueryError; +use crate::storage::encoded_key::EncodedKey; +use std::collections::{HashMap, HashSet}; +use std::ops::Bound; + +use super::predicate::collect_eq_constraints; + +type IndexBounds = (Bound, Bound); + +enum IndexLookup { + Range { column: String, bounds: IndexBounds }, + MultiEq { column: String, values: Vec }, +} + +pub(super) fn indexed_pks_for_predicate( + catalog: &Catalog, + project_id: &str, + scope_id: &str, + table_name: &str, + table: &crate::storage::keyspace::TableData, + predicate: &crate::query::plan::Expr, +) -> Result>, QueryError> { + use crate::query::plan::Expr; + + match predicate { + Expr::And(lhs, rhs) => { + let left = + indexed_pks_for_predicate(catalog, project_id, scope_id, table_name, table, lhs)?; + let right = + indexed_pks_for_predicate(catalog, project_id, scope_id, table_name, table, rhs)?; + return Ok(match (left, right) { + (Some(left), Some(right)) => Some(intersect_pks(left, right)), + (Some(left), None) => Some(left), + (None, Some(right)) => Some(right), + (None, None) => None, + }); + } + Expr::Or(lhs, rhs) => { + let left = + indexed_pks_for_predicate(catalog, project_id, scope_id, table_name, table, lhs)?; + let right = + indexed_pks_for_predicate(catalog, project_id, scope_id, table_name, table, rhs)?; + return Ok(match (left, right) { + (Some(left), Some(right)) => Some(union_pks(left, right)), + _ => None, + }); + } + _ => {} + } + + let mut equalities = HashMap::new(); + let eq_only = collect_eq_constraints(predicate, &mut equalities); + let Some(lookup) = extract_indexable_predicate(predicate) else { + if !eq_only { + return Ok(None); + } + // Composite + leftmost-prefix support for conjunctions of equality predicates. 
+ let ns = namespace_key(project_id, scope_id); + let mut best: Option<(String, usize)> = None; + for ((p, t, idx_name), idx_def) in &catalog.indexes { + if p != &ns || t != table_name || !table.indexes.contains_key(idx_name) { + continue; + } + if let Some(filter) = &idx_def.partial_filter + && !expr_implied_by_eq_constraints(filter, &equalities) + { + continue; + } + let mut prefix_cols = 0usize; + for col in &idx_def.columns { + if equalities.contains_key(col) { + prefix_cols += 1; + } else { + break; + } + } + if prefix_cols == 0 { + continue; + } + if best.as_ref().map(|(_, c)| *c).unwrap_or(0) < prefix_cols { + best = Some((idx_name.clone(), prefix_cols)); + } + } + let Some((idx_name, prefix_cols)) = best else { + return Ok(None); + }; + let selected_index = + table + .indexes + .get(&idx_name) + .ok_or_else(|| QueryError::InvalidQuery { + reason: "index not found".into(), + })?; + let idx_def = catalog + .indexes + .get(&(ns, table_name.to_string(), idx_name.clone())) + .ok_or_else(|| QueryError::InvalidQuery { + reason: "index definition not found".into(), + })?; + let prefix_values = idx_def + .columns + .iter() + .take(prefix_cols) + .filter_map(|c| equalities.get(c).cloned()) + .collect::>(); + let encoded = EncodedKey::from_values(&prefix_values); + let pks = if prefix_cols == idx_def.columns.len() { + selected_index.scan_eq(&encoded) + } else { + selected_index.scan_prefix(&encoded) + }; + return Ok(Some(pks)); + }; + let column = match &lookup { + IndexLookup::Range { column, .. } => column, + IndexLookup::MultiEq { column, .. 
} => column, + }; + + let mut selected_index_name: Option = None; + let ns = namespace_key(project_id, scope_id); + for ((p, t, idx_name), idx_def) in &catalog.indexes { + if p == &ns + && t == table_name + && idx_def.columns.len() == 1 + && idx_def.columns[0] == *column + && idx_def + .partial_filter + .as_ref() + .map(|f| expr_implied_by_eq_constraints(f, &equalities)) + .unwrap_or(true) + && table.indexes.contains_key(idx_name) + { + selected_index_name = Some(idx_name.clone()); + break; + } + } + + let Some(index_name) = selected_index_name else { + return Ok(None); + }; + let Some(index) = table.indexes.get(&index_name) else { + return Ok(None); + }; + + let pks = match lookup { + IndexLookup::Range { bounds, .. } => index.scan_range(bounds.0, bounds.1), + IndexLookup::MultiEq { values, .. } => values + .into_iter() + .flat_map(|v| index.scan_eq(&EncodedKey::from_values(&[v]))) + .collect(), + }; + Ok(Some(pks)) +} + +fn intersect_pks(left: Vec, right: Vec) -> Vec { + let mut right_set: HashSet = HashSet::with_capacity(right.len()); + right_set.extend(right); + let mut out = Vec::with_capacity(left.len().min(right_set.len())); + for pk in left { + if right_set.contains(&pk) { + out.push(pk); + } + } + out +} + +fn union_pks(left: Vec, right: Vec) -> Vec { + let mut seen: HashSet = HashSet::with_capacity(left.len() + right.len()); + let mut out = Vec::with_capacity(left.len() + right.len()); + for pk in left.into_iter().chain(right) { + if seen.insert(pk.clone()) { + out.push(pk); + } + } + out +} + +fn expr_implied_by_eq_constraints( + expr: &crate::query::plan::Expr, + equalities: &HashMap, +) -> bool { + use crate::query::plan::Expr; + match expr { + Expr::Eq(col, val) => equalities.get(col) == Some(val), + Expr::And(lhs, rhs) => { + expr_implied_by_eq_constraints(lhs, equalities) + && expr_implied_by_eq_constraints(rhs, equalities) + } + _ => false, + } +} + +fn extract_indexable_predicate(predicate: &crate::query::plan::Expr) -> Option { + use 
crate::query::plan::Expr; + + match predicate { + Expr::Eq(c, v) => Some(IndexLookup::Range { + column: c.clone(), + bounds: ( + Bound::Included(EncodedKey::from_values(std::slice::from_ref(v))), + Bound::Included(EncodedKey::from_values(std::slice::from_ref(v))), + ), + }), + Expr::In(c, values) => Some(IndexLookup::MultiEq { + column: c.clone(), + values: values.clone(), + }), + Expr::Lt(c, v) => Some(IndexLookup::Range { + column: c.clone(), + bounds: ( + Bound::Unbounded, + Bound::Excluded(EncodedKey::from_values(std::slice::from_ref(v))), + ), + }), + Expr::Lte(c, v) => Some(IndexLookup::Range { + column: c.clone(), + bounds: ( + Bound::Unbounded, + Bound::Included(EncodedKey::from_values(std::slice::from_ref(v))), + ), + }), + Expr::Gt(c, v) => Some(IndexLookup::Range { + column: c.clone(), + bounds: ( + Bound::Excluded(EncodedKey::from_values(std::slice::from_ref(v))), + Bound::Unbounded, + ), + }), + Expr::Gte(c, v) => Some(IndexLookup::Range { + column: c.clone(), + bounds: ( + Bound::Included(EncodedKey::from_values(std::slice::from_ref(v))), + Bound::Unbounded, + ), + }), + Expr::Between(c, lo, hi) => Some(IndexLookup::Range { + column: c.clone(), + bounds: ( + Bound::Included(EncodedKey::from_values(std::slice::from_ref(lo))), + Bound::Included(EncodedKey::from_values(std::slice::from_ref(hi))), + ), + }), + Expr::Like(c, pattern) => { + let prefix = like_prefix(pattern)?; + let start = Bound::Included(EncodedKey::from_values(&[Value::Text( + prefix.clone().into(), + )])); + let end = match next_prefix(&prefix) { + Some(next) => Bound::Excluded(EncodedKey::from_values(&[Value::Text(next.into())])), + None => Bound::Unbounded, + }; + Some(IndexLookup::Range { + column: c.clone(), + bounds: (start, end), + }) + } + _ => None, + } +} + +fn like_prefix(pattern: &str) -> Option { + if !pattern.ends_with('%') { + return None; + } + let mut prefix = String::new(); + for ch in pattern.chars() { + if ch == '%' || ch == '_' { + break; + } + prefix.push(ch); + } + 
if prefix.is_empty() { + return None; + } + Some(prefix) +} + +fn next_prefix(prefix: &str) -> Option { + let mut bytes = prefix.as_bytes().to_vec(); + for byte_index in (0..bytes.len()).rev() { + if bytes[byte_index] != u8::MAX { + bytes[byte_index] += 1; + bytes.truncate(byte_index + 1); + return String::from_utf8(bytes).ok(); + } + } + None +} diff --git a/src/query/executor/join.rs b/src/query/executor/join.rs new file mode 100644 index 0000000..653fd3a --- /dev/null +++ b/src/query/executor/join.rs @@ -0,0 +1,379 @@ +use super::{ + CursorToken, QueryResult, aggregate_col_idx, aggregate_output_name, encode_cursor, + extract_pk_key, extract_sort_key, row_after_cursor, +}; +use crate::catalog::Catalog; +use crate::catalog::namespace_key; +use crate::catalog::types::{Row, Value}; +use crate::query::error::QueryError; +use crate::query::operators::{AggregateOperator, Operator, ScanOperator, compile_expr}; +use crate::query::plan::{JoinType, Query, QueryOptions}; +use crate::storage::keyspace::KeyspaceSnapshot; +use std::collections::HashMap; + +#[allow(clippy::too_many_arguments)] +pub(super) fn execute_join_query( + snapshot: &KeyspaceSnapshot, + catalog: &Catalog, + project_id: &str, + scope_id: &str, + query: Query, + options: QueryOptions, + snapshot_seq: u64, + max_scan_rows: usize, + cursor_state: Option, +) -> Result { + let (base_ns_project, base_ns_scope, base_table) = + resolve_table_ref(project_id, scope_id, &query.table); + let base_schema = catalog + .tables + .get(&( + namespace_key(&base_ns_project, &base_ns_scope), + base_table.clone(), + )) + .ok_or_else(|| QueryError::TableNotFound { + project_id: base_ns_project.clone(), + table: base_table.clone(), + })?; + let base_alias = query.table_alias.clone().unwrap_or(base_table.clone()); + let mut columns: Vec = base_schema + .columns + .iter() + .map(|c| format!("{base_alias}.{}", c.name)) + .collect(); + let base_count = snapshot + .table(&base_ns_project, &base_ns_scope, &base_table) + .map(|t| 
t.rows.len()) + .unwrap_or(0); + if !options.allow_full_scan && base_count > max_scan_rows { + return Err(QueryError::ScanBoundExceeded { + estimated_rows: base_count as u64, + max_scan_rows: max_scan_rows as u64, + }); + } + let mut rows: Vec = snapshot + .table(&base_ns_project, &base_ns_scope, &base_table) + .map(|t| t.rows.values().cloned().collect()) + .unwrap_or_default(); + + for join in &query.joins { + let (jp, js, jt) = resolve_table_ref(project_id, scope_id, &join.table); + let join_schema = catalog + .tables + .get(&(namespace_key(&jp, &js), jt.clone())) + .ok_or_else(|| QueryError::TableNotFound { + project_id: jp.clone(), + table: jt.clone(), + })?; + let join_alias = join.alias.clone().unwrap_or(jt.clone()); + let join_rows: Vec<&Row> = snapshot + .table(&jp, &js, &jt) + .map(|t| t.rows.values().collect()) + .unwrap_or_default(); + if !options.allow_full_scan + && rows.len().saturating_mul(join_rows.len().max(1)) > max_scan_rows + { + return Err(QueryError::ScanBoundExceeded { + estimated_rows: rows.len().saturating_mul(join_rows.len().max(1)) as u64, + max_scan_rows: max_scan_rows as u64, + }); + } + let join_col_offset = columns.len(); + let mut next_columns = columns.clone(); + next_columns.extend( + join_schema + .columns + .iter() + .map(|c| format!("{join_alias}.{}", c.name)), + ); + let (left_idx, right_idx) = match join.join_type { + JoinType::Cross => (None, None), + _ => { + let left = join + .left_column + .as_ref() + .ok_or_else(|| QueryError::InvalidQuery { + reason: "join requires left_column".into(), + })?; + let right = join + .right_column + .as_ref() + .ok_or_else(|| QueryError::InvalidQuery { + reason: "join requires right_column".into(), + })?; + let left_idx = columns.iter().position(|c| c == left).ok_or_else(|| { + QueryError::ColumnNotFound { + table: query.table.clone(), + column: left.clone(), + } + })?; + let right_idx = join_schema + .columns + .iter() + .position(|c| format!("{join_alias}.{}", c.name) == *right || c.name 
== *right) + .ok_or_else(|| QueryError::ColumnNotFound { + table: join.table.clone(), + column: right.clone(), + })?; + (Some(left_idx), Some(right_idx)) + } + }; + + let mut joined = Vec::new(); + match join.join_type { + JoinType::Cross => { + for left in &rows { + for right in &join_rows { + let mut values = left.values.clone(); + values.extend(right.values.clone()); + joined.push(Row { values }); + } + } + } + JoinType::Inner | JoinType::Left => { + let right_idx = right_idx.ok_or_else(|| QueryError::InvalidQuery { + reason: "join requires right join key".into(), + })?; + let left_idx = left_idx.ok_or_else(|| QueryError::InvalidQuery { + reason: "join requires left join key".into(), + })?; + // Hash join for equality predicates. + let mut right_map: HashMap> = HashMap::new(); + for right in &join_rows { + right_map + .entry(right.values[right_idx].clone()) + .or_default() + .push(right); + } + for left in &rows { + let key = left.values[left_idx].clone(); + if let Some(matches) = right_map.get(&key) { + for right in matches { + let mut values = left.values.clone(); + values.extend(right.values.clone()); + joined.push(Row { values }); + } + } else if matches!(join.join_type, JoinType::Left) { + let mut values = left.values.clone(); + values.extend(std::iter::repeat_n(Value::Null, join_schema.columns.len())); + joined.push(Row { values }); + } + } + } + JoinType::Right => { + let left_idx = left_idx.ok_or_else(|| QueryError::InvalidQuery { + reason: "join requires left join key".into(), + })?; + let right_idx = right_idx.ok_or_else(|| QueryError::InvalidQuery { + reason: "join requires right join key".into(), + })?; + let mut left_map: HashMap> = HashMap::new(); + for left in &rows { + left_map + .entry(left.values[left_idx].clone()) + .or_default() + .push(left); + } + for right in &join_rows { + let key = right.values[right_idx].clone(); + if let Some(matches) = left_map.get(&key) { + for left in matches { + let mut values = left.values.clone(); + 
values.extend(right.values.clone()); + joined.push(Row { values }); + } + } else { + let mut values = + std::iter::repeat_n(Value::Null, join_col_offset).collect::>(); + values.extend(right.values.clone()); + joined.push(Row { values }); + } + } + } + } + rows = joined; + if !options.allow_full_scan && rows.len() > max_scan_rows { + return Err(QueryError::ScanBoundExceeded { + estimated_rows: rows.len() as u64, + max_scan_rows: max_scan_rows as u64, + }); + } + columns = next_columns; + } + + if let Some(predicate) = &query.predicate { + let compiled = compile_expr(predicate, &columns, "join")?; + rows.retain(|r| crate::query::operators::eval_compiled_expr_public(&compiled, r)); + } + + if !query.aggregates.is_empty() { + let group_by_idx = query + .group_by + .iter() + .map(|name| { + columns + .iter() + .position(|c| c == name) + .ok_or_else(|| QueryError::ColumnNotFound { + table: "join".into(), + column: name.clone(), + }) + }) + .collect::, _>>()?; + let agg_col_idx = query + .aggregates + .iter() + .map(|agg| aggregate_col_idx(agg, &columns)) + .collect::, _>>()?; + + let mut aggregate = AggregateOperator::new( + Box::new(ScanOperator::new(rows)), + query.aggregates.clone(), + group_by_idx, + agg_col_idx, + ); + let mut aggregated_rows = Vec::new(); + while let Some(row) = aggregate.next() { + aggregated_rows.push(row); + } + rows = aggregated_rows; + columns = query.group_by.clone(); + columns.extend(query.aggregates.iter().map(aggregate_output_name)); + } + + if let Some(having) = &query.having { + if query.aggregates.is_empty() { + return Err(QueryError::InvalidQuery { + reason: "having requires aggregate or group_by".into(), + }); + } + let compiled = compile_expr(having, &columns, "join")?; + rows.retain(|r| crate::query::operators::eval_compiled_expr_public(&compiled, r)); + } + + if !query.order_by.is_empty() { + let order_pairs: Vec<(usize, crate::query::plan::Order)> = query + .order_by + .iter() + .map(|(col, ord)| { + columns + .iter() + 
.position(|c| c == col) + .map(|idx| (idx, *ord)) + .ok_or_else(|| QueryError::ColumnNotFound { + table: "join".into(), + column: col.clone(), + }) + }) + .collect::>()?; + rows.sort_by(|a, b| { + for (idx, ord) in &order_pairs { + let cmp = a.values[*idx].cmp(&b.values[*idx]); + let ord_cmp = match ord { + crate::query::plan::Order::Asc => cmp, + crate::query::plan::Order::Desc => cmp.reverse(), + }; + if !ord_cmp.is_eq() { + return ord_cmp; + } + } + std::cmp::Ordering::Equal + }); + } + + let rows_examined = rows.len(); + let page_size = query.limit.unwrap_or_else(|| { + cursor_state + .as_ref() + .map(|c| c.page_size) + .unwrap_or(max_scan_rows.min(100)) + }); + let effective_page_size = page_size.min(max_scan_rows); + let sort_indices: Vec<(usize, crate::query::plan::Order)> = if !query.order_by.is_empty() { + query + .order_by + .iter() + .filter_map(|(name, ord)| columns.iter().position(|c| c == name).map(|i| (i, *ord))) + .collect() + } else { + Vec::new() + }; + let pk_indices: Vec = (0..columns.len()).collect(); + let mut sliced = Vec::new(); + for row in rows { + if let Some(cursor) = &cursor_state + && !row_after_cursor(&row, cursor, &sort_indices, &pk_indices) + { + continue; + } + sliced.push(row); + if sliced.len() > effective_page_size { + break; + } + } + let has_more = sliced.len() > effective_page_size; + if has_more { + sliced.truncate(effective_page_size); + } + let cursor_last_row = sliced.last().cloned(); + + if !query.select.is_empty() && query.select[0] != "*" { + let idxs: Vec = query + .select + .iter() + .map(|col| { + columns + .iter() + .position(|c| c == col) + .ok_or_else(|| QueryError::ColumnNotFound { + table: "join".into(), + column: col.clone(), + }) + }) + .collect::>()?; + sliced = sliced + .into_iter() + .map(|r| Row { + values: idxs.iter().map(|i| r.values[*i].clone()).collect(), + }) + .collect(); + } + + let cursor = if has_more { + let last_row = cursor_last_row.ok_or_else(|| QueryError::InvalidQuery { + reason: "invalid 
cursor state".into(), + })?; + Some(encode_cursor(&CursorToken { + snapshot_seq, + last_sort_key: extract_sort_key(&last_row, &sort_indices), + last_pk: extract_pk_key(&last_row, &pk_indices), + page_size, + remaining_limit: None, + })?) + } else { + None + }; + Ok(QueryResult { + rows_examined, + rows: sliced, + truncated: cursor.is_some(), + cursor, + snapshot_seq, + materialized_seq: None, + }) +} + +pub(super) fn resolve_table_ref( + project_id: &str, + scope_id: &str, + table_ref: &str, +) -> (String, String, String) { + if let Some(name) = table_ref.strip_prefix("_global.") { + return ("_global".to_string(), "app".to_string(), name.to_string()); + } + ( + project_id.to_string(), + scope_id.to_string(), + table_ref.to_string(), + ) +} diff --git a/src/query/executor/mod.rs b/src/query/executor/mod.rs new file mode 100644 index 0000000..05affcd --- /dev/null +++ b/src/query/executor/mod.rs @@ -0,0 +1,468 @@ +use crate::catalog::Catalog; +use crate::catalog::namespace_key; +use crate::catalog::schema::TableSchema; +use crate::catalog::types::Row; +use crate::query::error::QueryError; +use crate::query::operators::{ + AggregateOperator, FilterOperator, LimitOperator, Operator, ScanOperator, SortOperator, + compile_expr, +}; +use crate::query::plan::{Query, QueryOptions}; +use crate::query::planner::{ExecutionStage, build_physical_plan}; +use crate::storage::encoded_key::EncodedKey; +use crate::storage::keyspace::KeyspaceSnapshot; + +mod aggregate; +mod cursor; +mod indexing; +mod join; +mod predicate; +mod validate; + +use aggregate::{aggregate_col_idx, aggregate_output_name}; +use cursor::{ + CursorToken, decode_cursor, encode_cursor, extract_pk_key, extract_sort_key, row_after_cursor, +}; +use indexing::indexed_pks_for_predicate; +use predicate::extract_primary_key_values; +use validate::validate_query; + +#[derive(Debug, Clone)] +pub struct QueryResult { + pub rows: Vec, + pub rows_examined: usize, + pub cursor: Option, + pub truncated: bool, + pub 
snapshot_seq: u64, + pub materialized_seq: Option, +} + +pub fn execute_query( + snapshot: &KeyspaceSnapshot, + catalog: &Catalog, + project_id: &str, + scope_id: &str, + query: Query, +) -> Result { + execute_query_with_options( + snapshot, + catalog, + project_id, + scope_id, + query, + &QueryOptions::default(), + 0, + 10_000, + ) +} + +#[allow(clippy::too_many_arguments)] +pub fn execute_query_with_options( + snapshot: &KeyspaceSnapshot, + catalog: &Catalog, + project_id: &str, + scope_id: &str, + query: Query, + options: &QueryOptions, + snapshot_seq: u64, + max_scan_rows: usize, +) -> Result { + let mut options = options.clone(); + if options.async_index.is_none() { + options.async_index = query.use_index.clone(); + } + + // Validate expression depth to prevent stack overflow from deeply nested expressions + if let Some(pred) = &query.predicate { + pred.validate_depth() + .map_err(|e| QueryError::InvalidQuery { + reason: e.to_string(), + })?; + } + if let Some(having) = &query.having { + having + .validate_depth() + .map_err(|e| QueryError::InvalidQuery { + reason: e.to_string(), + })?; + } + + if !options.allow_full_scan && query.limit.is_none() && query.predicate.is_none() { + return Err(QueryError::InvalidQuery { + reason: "full scan requires limit/cursor or allow_full_scan".into(), + }); + } + + let cursor_state = match &options.cursor { + Some(encoded) => Some(decode_cursor(encoded)?), + None => None, + }; + if let Some(cursor) = &cursor_state + && cursor.snapshot_seq != snapshot_seq + { + return Err(QueryError::InvalidQuery { + reason: "cursor snapshot_seq mismatch".into(), + }); + } + + if !query.joins.is_empty() { + return join::execute_join_query( + snapshot, + catalog, + project_id, + scope_id, + query, + options, + snapshot_seq, + max_scan_rows, + cursor_state, + ); + } + + let table_key = (namespace_key(project_id, scope_id), query.table.clone()); + let schema = catalog + .tables + .get(&table_key) + .ok_or_else(|| QueryError::TableNotFound { + 
project_id: project_id.to_string(), + table: query.table.clone(), + })?; + let table = snapshot.table(project_id, scope_id, &query.table); + let mut materialized_seq = None; + validate_query(schema, &query)?; + + let columns: Vec = schema.columns.iter().map(|c| c.name.clone()).collect(); + let page_size = query.limit.unwrap_or_else(|| { + cursor_state + .as_ref() + .map(|c| c.page_size) + .unwrap_or(max_scan_rows.min(100)) + }); + let effective_page_size = page_size.min(max_scan_rows); + if let Some(result) = + try_primary_key_point_query(schema, table, &query, &cursor_state, snapshot_seq)? + { + return Ok(result); + } + + let estimated_rows: usize; + let row_source: Box + Send> = + if let Some(async_index) = &options.async_index { + let projection = snapshot + .async_index(project_id, scope_id, &query.table, async_index) + .ok_or_else(|| QueryError::InvalidQuery { + reason: "async index not found".into(), + })?; + materialized_seq = Some(projection.materialized_seq); + estimated_rows = projection.rows.len(); + let rows = projection.rows.clone(); + Box::new(rows.into_iter().map(|(_, row)| row)) + } else if let (Some(predicate), Some(table)) = (&query.predicate, table) { + let table_rows = table.rows.clone(); + let indexed_pks = indexed_pks_for_predicate( + catalog, + project_id, + scope_id, + &query.table, + table, + predicate, + )?; + match indexed_pks { + Some(pks) => { + estimated_rows = pks.len(); + Box::new( + pks.into_iter() + .filter_map(move |pk| table_rows.get(&pk).cloned()), + ) + } + None => { + estimated_rows = table.rows.len(); + let rows = table.rows.clone(); + Box::new(rows.into_iter().map(|(_, row)| row)) + } + } + } else { + let rows = table.map(|t| t.rows.clone()).unwrap_or_default(); + estimated_rows = rows.len(); + Box::new(rows.into_iter().map(|(_, row)| row)) + }; + + if estimated_rows > max_scan_rows && query.limit.is_none() && options.cursor.is_none() { + return Err(QueryError::ScanBoundExceeded { + estimated_rows: estimated_rows as u64, + 
max_scan_rows: max_scan_rows as u64, + }); + } + let physical_plan = build_physical_plan( + schema, + &query, + options.async_index.clone(), + estimated_rows as u64, + query.predicate.is_some(), + )?; + + let mut root: Box = Box::new(ScanOperator::new(row_source)); + let mut selected_indices: Option> = None; + let mut row_columns = columns.clone(); + for stage in &physical_plan.stages { + match stage { + ExecutionStage::Scan => {} + ExecutionStage::Limit => { + if cursor_state.is_none() + && query.order_by.is_empty() + && query.aggregates.is_empty() + && query.having.is_none() + { + root = Box::new(LimitOperator::new( + root, + effective_page_size.saturating_add(1), + )); + } + } + ExecutionStage::Filter => { + if let Some(predicate) = query.predicate.clone() { + let compiled = compile_expr(&predicate, &columns, &query.table)?; + root = Box::new(FilterOperator::new(root, compiled)); + } + } + ExecutionStage::Sort => { + let order_by = query + .order_by + .iter() + .map(|(order_col, order)| { + row_columns + .iter() + .position(|c| c == order_col) + .map(|idx| (idx, *order)) + .ok_or_else(|| QueryError::ColumnNotFound { + table: query.table.clone(), + column: order_col.clone(), + }) + }) + .collect::, _>>()?; + let top_k_limit = if cursor_state.is_none() { + Some(effective_page_size.saturating_add(1)) + } else { + None + }; + root = Box::new(SortOperator::new_with_limit(root, order_by, top_k_limit)); + } + ExecutionStage::Aggregate => { + let group_by_idx = query + .group_by + .iter() + .map(|name| { + columns.iter().position(|c| c == name).ok_or_else(|| { + QueryError::ColumnNotFound { + table: query.table.clone(), + column: name.clone(), + } + }) + }) + .collect::, _>>()?; + let agg_col_idx = query + .aggregates + .iter() + .map(|agg| aggregate_col_idx(agg, &columns)) + .collect::, _>>()?; + root = Box::new(AggregateOperator::new( + root, + query.aggregates.clone(), + group_by_idx, + agg_col_idx, + )); + row_columns = query.group_by.clone(); + 
row_columns.extend(query.aggregates.iter().map(aggregate_output_name)); + } + ExecutionStage::Having => { + if query.aggregates.is_empty() { + return Err(QueryError::InvalidQuery { + reason: "having requires aggregate or group_by".into(), + }); + } + if let Some(having) = query.having.clone() { + let compiled = compile_expr(&having, &row_columns, &query.table)?; + root = Box::new(FilterOperator::new(root, compiled)); + } + } + ExecutionStage::Project => { + selected_indices = Some( + query + .select + .iter() + .map(|col| { + row_columns.iter().position(|c| c == col).ok_or_else(|| { + QueryError::ColumnNotFound { + table: query.table.clone(), + column: col.clone(), + } + }) + }) + .collect::, _>>()?, + ); + } + } + } + + let sort_indices: Vec<(usize, crate::query::plan::Order)> = if !query.order_by.is_empty() { + query + .order_by + .iter() + .filter_map(|(name, ord)| { + row_columns + .iter() + .position(|c| c == name) + .map(|i| (i, *ord)) + }) + .collect() + } else { + Vec::new() + }; + let pk_indices: Vec = if !query.aggregates.is_empty() { + (0..row_columns.len()).collect() + } else { + schema + .primary_key + .iter() + .filter_map(|pk| row_columns.iter().position(|c| c == pk)) + .collect() + }; + let mut sliced: Vec = Vec::new(); + while let Some(row) = root.next() { + if let Some(cursor) = &cursor_state + && !row_after_cursor(&row, cursor, &sort_indices, &pk_indices) + { + continue; + } + sliced.push(row); + if sliced.len() > effective_page_size { + break; + } + } + let has_more = sliced.len() > effective_page_size; + if has_more { + sliced.truncate(effective_page_size); + } + let cursor_last_row = sliced.last().cloned(); + let sliced: Vec = if let Some(selected) = &selected_indices { + sliced + .into_iter() + .map(|row| Row { + values: selected + .iter() + .map(|idx| row.values[*idx].clone()) + .collect(), + }) + .collect() + } else { + sliced + }; + let cursor = if has_more { + let last_row = cursor_last_row.ok_or_else(|| QueryError::InvalidQuery { + 
reason: "invalid cursor state".into(), + })?; + Some(encode_cursor(&CursorToken { + snapshot_seq, + last_sort_key: extract_sort_key(&last_row, &sort_indices), + last_pk: extract_pk_key(&last_row, &pk_indices), + page_size, + remaining_limit: None, + })?) + } else { + None + }; + + Ok(QueryResult { + rows: sliced, + rows_examined: root.rows_examined(), + truncated: cursor.is_some(), + cursor, + snapshot_seq, + materialized_seq, + }) +} + +fn try_primary_key_point_query( + schema: &TableSchema, + table: Option<&crate::storage::keyspace::TableData>, + query: &Query, + cursor_state: &Option, + snapshot_seq: u64, +) -> Result, QueryError> { + if cursor_state.is_some() + || query.predicate.is_none() + || !query.group_by.is_empty() + || !query.aggregates.is_empty() + || query.having.is_some() + || !query.order_by.is_empty() + { + return Ok(None); + } + if query.limit == Some(0) { + return Ok(Some(QueryResult { + rows: Vec::new(), + rows_examined: 0, + cursor: None, + truncated: false, + snapshot_seq, + materialized_seq: None, + })); + } + + let Some(predicate) = query.predicate.as_ref() else { + return Ok(None); + }; + let Some(primary_key) = extract_primary_key_values(predicate, &schema.primary_key) else { + return Ok(None); + }; + + let selected_indices = resolve_selected_indices(schema, query)?; + let encoded_pk = EncodedKey::from_values(&primary_key); + let maybe_row = table.and_then(|t| t.rows.get(&encoded_pk)); + let rows = match maybe_row { + Some(row) => vec![project_selected_row(row, selected_indices.as_deref())], + None => Vec::new(), + }; + + Ok(Some(QueryResult { + rows, + rows_examined: 1, + cursor: None, + truncated: false, + snapshot_seq, + materialized_seq: None, + })) +} + +fn resolve_selected_indices( + schema: &TableSchema, + query: &Query, +) -> Result>, QueryError> { + if query.select.len() == 1 && query.select[0] == "*" { + return Ok(None); + } + let mut indices = Vec::with_capacity(query.select.len()); + for col in &query.select { + let column_index 
= schema + .columns + .iter() + .position(|c| c.name == *col) + .ok_or_else(|| QueryError::ColumnNotFound { + table: query.table.clone(), + column: col.clone(), + })?; + indices.push(column_index); + } + Ok(Some(indices)) +} + +fn project_selected_row(row: &Row, selected_indices: Option<&[usize]>) -> Row { + match selected_indices { + Some(indices) => Row { + values: indices.iter().map(|idx| row.values[*idx].clone()).collect(), + }, + None => row.clone(), + } +} +#[cfg(test)] +mod tests; diff --git a/src/query/executor/predicate.rs b/src/query/executor/predicate.rs new file mode 100644 index 0000000..5dd3e2d --- /dev/null +++ b/src/query/executor/predicate.rs @@ -0,0 +1,42 @@ +use crate::catalog::types::Value; +use crate::query::plan::Expr; +use std::collections::HashMap; + +pub(super) fn extract_primary_key_values( + predicate: &Expr, + primary_key: &[String], +) -> Option> { + if primary_key.is_empty() { + return None; + } + let mut equalities: HashMap = HashMap::new(); + if !collect_eq_constraints(predicate, &mut equalities) { + return None; + } + if equalities.len() != primary_key.len() { + return None; + } + let mut values = Vec::with_capacity(primary_key.len()); + for key_col in primary_key { + let value = equalities.get(key_col)?; + values.push(value.clone()); + } + Some(values) +} + +pub(super) fn collect_eq_constraints(expr: &Expr, equalities: &mut HashMap) -> bool { + match expr { + Expr::Eq(column, value) => { + if let Some(existing) = equalities.get(column) { + existing == value + } else { + equalities.insert(column.clone(), value.clone()); + true + } + } + Expr::And(lhs, rhs) => { + collect_eq_constraints(lhs, equalities) && collect_eq_constraints(rhs, equalities) + } + _ => false, + } +} diff --git a/src/query/executor/tests.rs b/src/query/executor/tests.rs new file mode 100644 index 0000000..9eec260 --- /dev/null +++ b/src/query/executor/tests.rs @@ -0,0 +1,1340 @@ +use super::execute_query_with_options; +use crate::catalog::Catalog; +use 
crate::catalog::namespace_key; +use crate::catalog::schema::{ColumnDef, IndexType}; +use crate::catalog::types::{ColumnType, Row, Value}; +use crate::query::error::QueryError; +use crate::query::plan::{Aggregate, Expr, Order, Query, QueryOptions, col, lit}; +use crate::storage::encoded_key::EncodedKey; +use crate::storage::index::extract_index_key_encoded; +use crate::storage::keyspace::{Keyspace, NamespaceId, SecondaryIndex}; + +fn execute_query( + snapshot: &crate::storage::keyspace::KeyspaceSnapshot, + catalog: &Catalog, + project_id: &str, + scope_id: &str, + query: Query, +) -> Result { + execute_query_with_options( + snapshot, + catalog, + project_id, + scope_id, + query, + &QueryOptions { + allow_full_scan: true, + ..QueryOptions::default() + }, + 0, + usize::MAX, + ) +} + +fn setup() -> (Keyspace, Catalog) { + let mut keyspace = Keyspace::default(); + let mut catalog = Catalog::default(); + catalog.create_project("A").expect("project A"); + catalog.create_project("B").expect("project B"); + for p in ["A", "B"] { + catalog + .create_table( + p, + "app", + "users", + vec![ + ColumnDef { + name: "id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "name".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ColumnDef { + name: "age".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "email".into(), + col_type: ColumnType::Text, + nullable: true, + }, + ], + vec!["id".into()], + ) + .expect("table"); + } + for i in 0..100 { + keyspace.upsert_row( + "A", + "app", + "users", + vec![Value::Integer(i)], + Row { + values: vec![ + Value::Integer(i), + Value::Text(format!("u{i}").into()), + Value::Integer(18 + (i % 50)), + if i == 0 { + Value::Null + } else if i % 2 == 0 { + Value::Text(format!("u{i}@gmail.com").into()) + } else { + Value::Text(format!("u{i}@example.com").into()) + }, + ], + }, + i as u64 + 1, + ); + keyspace.upsert_row( + "B", + "app", + "users", + 
vec![Value::Integer(i)], + Row { + values: vec![ + Value::Integer(i), + Value::Text(format!("b{i}").into()), + Value::Integer(99), + Value::Text(format!("b{i}@other.com").into()), + ], + }, + i as u64 + 10_000, + ); + } + catalog + .create_index( + "A", + "app", + "users", + "by_age", + vec!["age".into()], + IndexType::BTree, + None, + ) + .expect("create index"); + catalog + .create_index( + "A", + "app", + "users", + "by_name", + vec!["name".into()], + IndexType::BTree, + None, + ) + .expect("create name index"); + let schema = catalog + .tables + .get(&(namespace_key("A", "app"), "users".to_string())) + .expect("schema") + .clone(); + let table = keyspace + .table_by_namespace_key_mut(&namespace_key("A", "app"), "users") + .expect("table"); + let mut secondary_index = SecondaryIndex::default(); + for (pk, row) in &table.rows { + let age_key = + extract_index_key_encoded(row, &schema, &["age".into()]).expect("age index key"); + secondary_index.insert(age_key, pk.clone()); + } + table.indexes.insert("by_age".into(), secondary_index); + let mut by_name = SecondaryIndex::default(); + for (pk, row) in &table.rows { + let key = + extract_index_key_encoded(row, &schema, &["name".into()]).expect("name index key"); + by_name.insert(key, pk.clone()); + } + table.indexes.insert("by_name".into(), by_name); + (keyspace, catalog) +} + +#[test] +fn query_correctness_suite() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + + let all = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users"), + ) + .expect("all"); + assert_eq!(all.rows.len(), 100); + + let filtered = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .where_(Expr::Gt("age".into(), Value::Integer(30))), + ) + .expect("filtered"); + assert!( + filtered + .rows + .iter() + .all(|r| matches!(r.values[2], Value::Integer(v) if v > 30)) + ); + + let ordered = execute_query( + &snapshot, + &catalog, + "A", 
+ "app", + Query::select(&["*"]) + .from("users") + .order_by("age", Order::Desc) + .order_by("id", Order::Asc), + ) + .expect("ordered"); + for w in ordered.rows.windows(2) { + assert!(w[0].values[2] >= w[1].values[2]); + } + + let limited = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users").limit(5), + ) + .expect("limit"); + assert_eq!(limited.rows.len(), 5); + + let counted = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .aggregate(Aggregate::Count), + ) + .expect("count"); + assert_eq!(counted.rows[0].values[0], Value::Integer(100)); + + let grouped = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .group_by(&["age"]) + .aggregate(Aggregate::Count), + ) + .expect("grouped"); + assert!(!grouped.rows.is_empty()); + + let compound = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users").where_( + Expr::Gt("age".into(), Value::Integer(30)) + .and(Expr::Like("email".into(), "%@gmail.com".into())), + ), + ) + .expect("compound"); + assert!(compound.rows.iter().all(|r| { + matches!(&r.values[2], Value::Integer(v) if *v > 30) + && matches!(&r.values[3], Value::Text(s) if s.ends_with("@gmail.com")) + })); + + let project_b = execute_query( + &snapshot, + &catalog, + "B", + "app", + Query::select(&["*"]) + .from("users") + .where_(Expr::Eq("age".into(), Value::Integer(99))), + ) + .expect("project B"); + assert_eq!(project_b.rows.len(), 100); +} + +#[test] +fn builder_supports_not_is_not_null_and_like_underscore() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + + let query = Query::select(&["id", "email"]).from("users").where_( + col("email") + .is_not_null() + .and(col("name").like(lit("u_"))) + .and(col("age").gt(lit(20)).not().not()), + ); + let result = execute_query(&snapshot, &catalog, "A", "app", query).expect("query"); + 
assert!(!result.rows.is_empty()); + assert!( + result + .rows + .iter() + .all(|r| matches!(&r.values[1], Value::Text(_))) + ); +} + +#[test] +fn having_filters_post_aggregation() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["age", "count_star"]) + .from("users") + .group_by(&["age"]) + .aggregate(Aggregate::Count) + .having(Expr::Gt("count_star".into(), Value::Integer(1))), + ) + .expect("having"); + + assert!( + result + .rows + .iter() + .all(|r| matches!(r.values[1], Value::Integer(v) if v > 1)) + ); +} + +#[test] +fn index_backed_range_scan_reduces_examined_rows() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + + let full = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users"), + ) + .expect("full"); + let ranged = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users").where_(Expr::Between( + "age".into(), + Value::Integer(40), + Value::Integer(41), + )), + ) + .expect("range"); + assert!(ranged.rows.len() < full.rows.len()); + assert!(ranged.rows_examined < full.rows_examined); +} + +#[test] +fn primary_key_eq_uses_point_lookup_path() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["id", "name"]) + .from("users") + .where_(Expr::Eq("id".into(), Value::Integer(42))) + .limit(1), + ) + .expect("pk point query"); + + assert_eq!(result.rows.len(), 1); + assert_eq!(result.rows[0].values[0], Value::Integer(42)); + assert_eq!(result.rows_examined, 1); +} + +#[test] +fn primary_key_with_non_pk_eq_falls_back_to_general_path() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["id", "name"]) + .from("users") 
+ .where_( + Expr::Eq("id".into(), Value::Integer(42)) + .and(Expr::Eq("age".into(), Value::Integer(60))), + ) + .limit(1), + ) + .expect("mixed eq query"); + + assert_eq!(result.rows.len(), 1); + assert!(result.rows_examined > 1); +} + +#[test] +fn use_index_hint_selects_async_projection() { + let mut keyspace = Keyspace::default(); + let mut catalog = Catalog::default(); + catalog.create_project("A").expect("project A"); + catalog + .create_table( + "A", + "app", + "users", + vec![ + ColumnDef { + name: "id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "name".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["id".into()], + ) + .expect("table"); + keyspace.upsert_row( + "A", + "app", + "users", + vec![Value::Integer(1)], + Row { + values: vec![Value::Integer(1), Value::Text("alice".into())], + }, + 1, + ); + keyspace.insert_async_projection( + NamespaceId::Project(namespace_key("A", "app")), + "users".into(), + "users_view".into(), + crate::storage::keyspace::AsyncProjectionData { + rows: { + let mut rows = im::OrdMap::new(); + rows.insert( + EncodedKey::from_values(&[Value::Integer(9)]), + Row { + values: vec![Value::Integer(9), Value::Text("projection".into())], + }, + ); + rows + }, + materialized_seq: 123, + }, + ); + let snapshot = keyspace.snapshot(); + + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users").use_index("users_view"), + ) + .expect("hint query"); + + assert_eq!(result.materialized_seq, Some(123)); + assert_eq!(result.rows[0].values[0], Value::Integer(9)); +} + +#[test] +fn in_and_like_prefix_can_use_index_path() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + + let by_in = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users").where_(Expr::In( + "age".into(), + vec![Value::Integer(40), Value::Integer(41)], + )), + ) + .expect("in"); + assert!( + by_in + 
.rows + .iter() + .all(|r| { matches!(r.values[2], Value::Integer(40) | Value::Integer(41)) }) + ); + + let by_prefix = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .where_(Expr::Like("name".into(), "u1%".into())), + ) + .expect("prefix like"); + assert!( + by_prefix + .rows + .iter() + .all(|r| matches!(&r.values[1], Value::Text(s) if s.starts_with("u1"))) + ); +} + +#[test] +fn and_or_predicates_compose_index_row_sets() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + + let and_query = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["id", "name", "age"]).from("users").where_( + Expr::Eq("age".into(), Value::Integer(40)).and(Expr::Like("name".into(), "u2%".into())), + ), + ) + .expect("and query"); + assert!(and_query.rows.iter().all(|r| { + matches!(r.values[2], Value::Integer(40)) + && matches!(&r.values[1], Value::Text(name) if name.starts_with("u2")) + })); + + let or_query = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["id", "name"]) + .from("users") + .where_( + Expr::Eq("name".into(), Value::Text("u1".into())) + .or(Expr::Like("name".into(), "u2%".into())), + ) + .order_by("id", Order::Asc), + ) + .expect("or query"); + assert!(or_query.rows.iter().all(|r| match &r.values[1] { + Value::Text(name) => name == "u1" || name.starts_with("u2"), + _ => false, + })); + assert!(!or_query.rows.is_empty()); +} + +#[test] +fn composite_index_respects_leftmost_prefix_rule() { + let (mut keyspace, mut catalog) = setup(); + catalog + .create_index( + "A", + "app", + "users", + "by_age_name", + vec!["age".into(), "name".into()], + IndexType::BTree, + None, + ) + .expect("composite index"); + let schema = catalog + .tables + .get(&(namespace_key("A", "app"), "users".to_string())) + .expect("schema") + .clone(); + let table = keyspace + .table_by_namespace_key_mut(&namespace_key("A", "app"), "users") + .expect("table"); + let mut 
by_age_name = SecondaryIndex::default(); + for (pk, row) in &table.rows { + let key = extract_index_key_encoded(row, &schema, &["age".into(), "name".into()]) + .expect("composite key"); + by_age_name.insert(key, pk.clone()); + } + table.indexes.insert("by_age_name".into(), by_age_name); + + let snapshot = keyspace.snapshot(); + + let good = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["id", "name", "age"]) + .from("users") + .where_(Expr::Eq("age".into(), Value::Integer(40))), + ) + .expect("leftmost predicate should use composite index"); + assert!( + good.rows_examined < 100, + "leftmost-prefix query should avoid full scan" + ); + + let bad = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["id", "name", "age"]) + .from("users") + .where_(Expr::Eq( + "email".into(), + Value::Text("u1@example.com".into()), + )), + ) + .expect("non-leftmost predicate falls back"); + assert!( + bad.rows_examined >= good.rows_examined, + "non-leftmost query should not be better than leftmost" + ); +} + +#[test] +fn partial_index_only_indexes_matching_rows() { + let (mut keyspace, mut catalog) = setup(); + catalog + .create_index( + "A", + "app", + "users", + "adults_only", + vec!["age".into()], + IndexType::BTree, + Some(Expr::Gte("age".into(), Value::Integer(50))), + ) + .expect("partial index"); + let schema = catalog + .tables + .get(&(namespace_key("A", "app"), "users".to_string())) + .expect("schema") + .clone(); + let table = keyspace + .table_by_namespace_key_mut(&namespace_key("A", "app"), "users") + .expect("table"); + let mut adults_only = SecondaryIndex { + partial_filter: Some(Expr::Gte("age".into(), Value::Integer(50))), + ..SecondaryIndex::default() + }; + for (pk, row) in &table.rows { + if adults_only + .should_include_row(row, &schema, "users") + .expect("partial eval") + { + let key = extract_index_key_encoded(row, &schema, &["age".into()]).expect("index key"); + adults_only.insert(key, pk.clone()); + } + } + 
table.indexes.insert("adults_only".into(), adults_only); + + let snapshot = keyspace.snapshot(); + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["id", "age"]) + .from("users") + .where_(Expr::Gte("age".into(), Value::Integer(50))), + ) + .expect("partial query"); + assert!(!result.rows.is_empty()); + assert!( + result + .rows + .iter() + .all(|r| matches!(r.values[1], Value::Integer(v) if v >= 50)) + ); +} + +#[test] +fn bounded_scan_is_enforced_when_full_scan_not_allowed() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let err = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users"), + &QueryOptions::default(), + 1, + 10_000, + ) + .expect_err("should reject full scan"); + assert!(matches!(err, QueryError::InvalidQuery { .. })); +} + +#[test] +fn default_execute_query_rejects_unbounded_full_scan() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let err = super::execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]).from("users"), + ) + .expect_err("default execute_query should reject unbounded full scan"); + assert!(matches!(err, QueryError::InvalidQuery { .. 
})); +} + +#[test] +fn non_join_page_size_is_capped_by_max_scan_rows() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let result = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .order_by("id", Order::Asc) + .limit(50), + &QueryOptions::default(), + 9, + 10, + ) + .expect("bounded page"); + assert_eq!(result.rows.len(), 10); + assert!(result.cursor.is_some()); + assert!(result.rows_examined <= 100); +} + +#[test] +fn join_scan_bound_is_enforced_when_full_scan_not_allowed() { + let (keyspace, mut catalog) = setup(); + catalog + .create_table( + "A", + "app", + "profiles", + vec![ + ColumnDef { + name: "user_id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "country".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["user_id".into()], + ) + .expect("profiles table"); + let mut keyspace = keyspace; + for i in 0..50 { + keyspace.upsert_row( + "A", + "app", + "profiles", + vec![Value::Integer(i)], + Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), + 1, + ); + } + let snapshot = keyspace.snapshot(); + let err = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["u.id", "p.country"]) + .from("users") + .alias("u") + .inner_join("profiles", "u.id", "user_id") + .with_last_join_alias("p") + .limit(10), + &QueryOptions::default(), + 1, + 1_000, + ) + .expect_err("join scan bound"); + assert!(matches!(err, QueryError::ScanBoundExceeded { .. })); +} + +#[test] +fn type_mismatch_rejected_at_plan_time() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let err = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .where_(Expr::Gt("age".into(), Value::Text("oops".into()))), + ) + .expect_err("type mismatch"); + assert!(matches!(err, QueryError::TypeMismatch { .. 
})); +} + +#[test] +fn cursor_pagination_returns_stable_pages() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let mut options = QueryOptions::default(); + let mut all = Vec::new(); + loop { + let page = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .order_by("id", Order::Asc) + .limit(10), + &options, + 42, + 10_000, + ) + .expect("page"); + all.extend(page.rows.clone()); + if let Some(cursor) = page.cursor { + options.cursor = Some(cursor); + } else { + break; + } + } + assert_eq!(all.len(), 100); + for (i, row) in all.iter().enumerate().take(100) { + assert_eq!(row.values[0], Value::Integer(i as i64)); + } +} + +#[test] +fn inner_join_returns_matching_rows() { + let (keyspace, mut catalog) = setup(); + catalog + .create_table( + "A", + "app", + "profiles", + vec![ + ColumnDef { + name: "user_id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "country".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["user_id".into()], + ) + .expect("profiles table"); + let mut keyspace = keyspace; + for i in 0..50 { + keyspace.upsert_row( + "A", + "app", + "profiles", + vec![Value::Integer(i)], + Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), + 1, + ); + } + let snapshot = keyspace.snapshot(); + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["u.id", "p.country"]) + .from("users") + .alias("u") + .inner_join("profiles", "u.id", "user_id") + .with_last_join_alias("p") + .limit(100), + ) + .expect("join query"); + assert_eq!(result.rows.len(), 50); +} + +#[test] +fn join_aggregate_count_and_having_are_applied() { + let (mut keyspace, mut catalog) = setup(); + catalog + .create_table( + "A", + "app", + "profiles", + vec![ + ColumnDef { + name: "user_id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "country".into(), + 
col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["user_id".into()], + ) + .expect("profiles table"); + for i in 0..50 { + keyspace.upsert_row( + "A", + "app", + "profiles", + vec![Value::Integer(i)], + Row::from_values(vec![ + Value::Integer(i), + Value::Text(if i % 2 == 0 { "US" } else { "CA" }.into()), + ]), + 1, + ); + } + let snapshot = keyspace.snapshot(); + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["p.country", "count_star"]) + .from("users") + .alias("u") + .inner_join("profiles", "u.id", "user_id") + .with_last_join_alias("p") + .group_by(&["p.country"]) + .aggregate(Aggregate::Count) + .having(Expr::Gt("count_star".into(), Value::Integer(20))) + .order_by("count_star", Order::Desc) + .limit(10), + ) + .expect("join aggregate query"); + + assert_eq!(result.rows.len(), 2); + for row in result.rows { + assert!(matches!(row.values[1], Value::Integer(25))); + } +} + +#[test] +fn left_join_supports_global_table_reference() { + let (mut keyspace, mut catalog) = setup(); + catalog.create_project("_global").expect("global project"); + catalog + .create_table( + "_global", + "app", + "users", + vec![ + ColumnDef { + name: "id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "name".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["id".into()], + ) + .expect("global users"); + for i in 0..20 { + keyspace.upsert_row( + "_global", + "app", + "users", + vec![Value::Integer(i)], + Row::from_values(vec![Value::Integer(i), Value::Text(format!("g{i}").into())]), + 1, + ); + } + let snapshot = keyspace.snapshot(); + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["u.id", "g.name"]) + .from("users") + .alias("u") + .left_join("_global.users", "u.id", "id") + .with_last_join_alias("g") + .limit(5), + ) + .expect("left join"); + assert_eq!(result.rows.len(), 5); +} + +#[test] +fn invalid_cursor_is_rejected() { + let 
(keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let options = QueryOptions { + cursor: Some("xyz".into()), + ..QueryOptions::default() + }; + let err = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .order_by("id", Order::Asc), + &options, + 42, + 10_000, + ) + .expect_err("invalid cursor should fail"); + assert!(matches!(err, QueryError::InvalidQuery { .. })); +} + +#[test] +fn uppercase_hex_cursor_is_accepted() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let first = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .order_by("id", Order::Asc) + .limit(10), + &QueryOptions::default(), + 42, + 10_000, + ) + .expect("first page"); + let cursor = first + .cursor + .expect("first page should include cursor") + .to_ascii_uppercase(); + let options = QueryOptions { + cursor: Some(cursor), + ..QueryOptions::default() + }; + let second = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .order_by("id", Order::Asc) + .limit(10), + &options, + 42, + 10_000, + ) + .expect("uppercase cursor should decode"); + assert_eq!(second.rows.len(), 10); +} + +#[test] +fn cursor_snapshot_mismatch_is_rejected() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let first = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .order_by("id", Order::Asc) + .limit(10), + &QueryOptions::default(), + 42, + 10_000, + ) + .expect("first page"); + let options = QueryOptions { + cursor: first.cursor, + ..QueryOptions::default() + }; + let err = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .order_by("id", Order::Asc) + .limit(10), + &options, + 43, + 10_000, + ) + .expect_err("snapshot mismatch"); + 
assert!(matches!(err, QueryError::InvalidQuery { .. })); +} + +#[test] +fn join_query_supports_cursor_pagination() { + let (mut keyspace, mut catalog) = setup(); + catalog + .create_table( + "A", + "app", + "profiles", + vec![ + ColumnDef { + name: "user_id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "country".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["user_id".into()], + ) + .expect("profiles table"); + for i in 0..50 { + keyspace.upsert_row( + "A", + "app", + "profiles", + vec![Value::Integer(i)], + Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), + 1, + ); + } + let snapshot = keyspace.snapshot(); + + let first = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .alias("u") + .inner_join("profiles", "u.id", "user_id") + .with_last_join_alias("p") + .order_by("u.id", Order::Asc) + .limit(5), + &QueryOptions::default(), + 7, + 10_000, + ) + .expect("first page"); + assert_eq!(first.rows.len(), 5); + assert!(first.cursor.is_some()); + + let options = QueryOptions { + cursor: first.cursor, + ..QueryOptions::default() + }; + let second = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["u.id", "p.country"]) + .from("users") + .alias("u") + .inner_join("profiles", "u.id", "user_id") + .with_last_join_alias("p") + .limit(5), + &options, + 7, + 10_000, + ) + .expect("join cursor page"); + assert_eq!(second.rows.len(), 5); + assert!(second.cursor.is_some()); + + let first_ids: Vec = first.rows.iter().map(|r| r.values[0].clone()).collect(); + let second_ids: Vec = second.rows.iter().map(|r| r.values[0].clone()).collect(); + assert!( + first_ids + .iter() + .all(|id| !second_ids.iter().any(|other| other == id)) + ); +} + +#[test] +fn right_join_includes_unmatched_right_rows_with_nulls() { + let (mut keyspace, mut catalog) = setup(); + catalog + .create_table( + "A", + "app", + 
"profiles", + vec![ + ColumnDef { + name: "user_id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "country".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["user_id".into()], + ) + .expect("profiles table"); + for i in 90..110 { + keyspace.upsert_row( + "A", + "app", + "profiles", + vec![Value::Integer(i)], + Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), + 1, + ); + } + let snapshot = keyspace.snapshot(); + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["u.id", "p.user_id"]) + .from("users") + .alias("u") + .right_join("profiles", "u.id", "user_id") + .with_last_join_alias("p") + .order_by("p.user_id", Order::Asc) + .limit(200), + ) + .expect("right join"); + + assert_eq!(result.rows.len(), 20); + let unmatched = result + .rows + .iter() + .filter(|r| matches!(r.values[0], Value::Null)) + .count(); + assert_eq!(unmatched, 10); +} + +#[test] +fn cross_join_cardinality_and_limit_are_correct() { + let (mut keyspace, mut catalog) = setup(); + catalog + .create_table( + "A", + "app", + "profiles", + vec![ + ColumnDef { + name: "user_id".into(), + col_type: ColumnType::Integer, + nullable: false, + }, + ColumnDef { + name: "country".into(), + col_type: ColumnType::Text, + nullable: false, + }, + ], + vec!["user_id".into()], + ) + .expect("profiles table"); + for i in 0..5 { + keyspace.upsert_row( + "A", + "app", + "profiles", + vec![Value::Integer(i)], + Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), + 1, + ); + } + let snapshot = keyspace.snapshot(); + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["u.id", "p.user_id"]) + .from("users") + .alias("u") + .cross_join("profiles") + .with_last_join_alias("p") + .limit(123), + ) + .expect("cross join"); + assert_eq!(result.rows.len(), 123); +} + +#[test] +fn descending_cursor_pagination_is_stable() { + let (keyspace, catalog) = setup(); 
+ let snapshot = keyspace.snapshot(); + let mut options = QueryOptions::default(); + let mut all = Vec::new(); + loop { + let page = execute_query_with_options( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["*"]) + .from("users") + .order_by("id", Order::Desc) + .limit(11), + &options, + 55, + 10_000, + ) + .expect("page"); + all.extend(page.rows.clone()); + if let Some(cursor) = page.cursor { + options.cursor = Some(cursor); + } else { + break; + } + } + assert_eq!(all.len(), 100); + for w in all.windows(2) { + assert!(w[0].values[0] > w[1].values[0]); + } +} + +#[test] +fn contradictory_pk_equalities_return_empty_result() { + let (keyspace, catalog) = setup(); + let snapshot = keyspace.snapshot(); + let result = execute_query( + &snapshot, + &catalog, + "A", + "app", + Query::select(&["id"]) + .from("users") + .where_( + Expr::Eq("id".into(), Value::Integer(1)) + .and(Expr::Eq("id".into(), Value::Integer(2))), + ) + .limit(10), + ) + .expect("query"); + assert!(result.rows.is_empty()); +} diff --git a/src/query/executor/validate.rs b/src/query/executor/validate.rs new file mode 100644 index 0000000..431689c --- /dev/null +++ b/src/query/executor/validate.rs @@ -0,0 +1,128 @@ +use crate::catalog::schema::TableSchema; +use crate::catalog::types::Value; +use crate::query::error::QueryError; +use crate::query::plan::Query; + +pub(super) fn validate_query(schema: &TableSchema, query: &Query) -> Result<(), QueryError> { + for (col, _) in &query.order_by { + if !schema.columns.iter().any(|c| c.name == *col) { + return Err(QueryError::ColumnNotFound { + table: query.table.clone(), + column: col.clone(), + }); + } + } + for col in &query.group_by { + if !schema.columns.iter().any(|c| c.name == *col) { + return Err(QueryError::ColumnNotFound { + table: query.table.clone(), + column: col.clone(), + }); + } + } + if let Some(expr) = &query.predicate { + validate_expr_types(schema, expr)?; + } + Ok(()) +} + +fn validate_expr_types( + schema: &TableSchema, + expr: 
&crate::query::plan::Expr, +) -> Result<(), QueryError> { + use crate::catalog::types::ColumnType; + use crate::query::plan::Expr; + + let find_col_type = |name: &str| -> Result { + schema + .columns + .iter() + .find(|c| c.name == name) + .map(|c| c.col_type.clone()) + .ok_or_else(|| QueryError::ColumnNotFound { + table: schema.table_name.clone(), + column: name.to_string(), + }) + }; + + let value_compatible = |col_type: &ColumnType, value: &Value| -> bool { + matches!(value, Value::Null) + || match col_type { + ColumnType::Integer => matches!( + value, + Value::Integer(_) | Value::Float(_) | Value::Timestamp(_) + ), + ColumnType::Float => matches!( + value, + Value::Integer(_) | Value::Float(_) | Value::Timestamp(_) + ), + ColumnType::Timestamp => matches!( + value, + Value::Integer(_) | Value::Float(_) | Value::Timestamp(_) + ), + ColumnType::Text => matches!(value, Value::Text(_)), + ColumnType::Boolean => matches!(value, Value::Boolean(_)), + ColumnType::U256 => matches!(value, Value::U256(_)), + ColumnType::I256 => matches!(value, Value::I256(_)), + ColumnType::Blob => matches!(value, Value::Blob(_)), + ColumnType::Json => matches!(value, Value::Json(_) | Value::Text(_)), + } + }; + + match expr { + Expr::Eq(c, v) + | Expr::Ne(c, v) + | Expr::Lt(c, v) + | Expr::Lte(c, v) + | Expr::Gt(c, v) + | Expr::Gte(c, v) => { + let t = find_col_type(c)?; + if !value_compatible(&t, v) { + return Err(QueryError::TypeMismatch { + column: c.clone(), + expected: format!("{t:?}"), + got: format!("{v:?}"), + }); + } + } + Expr::In(c, values) => { + let t = find_col_type(c)?; + if !values.iter().all(|v| value_compatible(&t, v)) { + return Err(QueryError::TypeMismatch { + column: c.clone(), + expected: format!("{t:?}"), + got: "IN literal".to_string(), + }); + } + } + Expr::Between(c, lo, hi) => { + let t = find_col_type(c)?; + if !value_compatible(&t, lo) || !value_compatible(&t, hi) { + return Err(QueryError::TypeMismatch { + column: c.clone(), + expected: format!("{t:?}"), + 
got: "BETWEEN literal".to_string(), + }); + } + } + Expr::Like(c, _) => { + let t = find_col_type(c)?; + if !matches!(t, ColumnType::Text) { + return Err(QueryError::TypeMismatch { + column: c.clone(), + expected: "Text".to_string(), + got: format!("{t:?}"), + }); + } + } + Expr::IsNull(c) | Expr::IsNotNull(c) => { + let _ = find_col_type(c)?; + } + Expr::And(a, b) | Expr::Or(a, b) => { + validate_expr_types(schema, a)?; + validate_expr_types(schema, b)?; + } + Expr::Not(a) => validate_expr_types(schema, a)?, + } + Ok(()) +} diff --git a/tests/naming_conventions.rs b/tests/naming_conventions.rs index 4886b56..1805d38 100644 --- a/tests/naming_conventions.rs +++ b/tests/naming_conventions.rs @@ -75,8 +75,8 @@ fn critical_modules_avoid_ambiguous_index_size_offset_len_bindings() { include_str!("../src/commit/executor/internals.rs"), ), ( - "src/query/executor.rs", - include_str!("../src/query/executor.rs"), + "src/query/executor/mod.rs", + include_str!("../src/query/executor/mod.rs"), ), ( "src/query/operators.rs", From 6055a2ad22b8785db1df7daf2a07bd0ab40a1188 Mon Sep 17 00:00:00 2001 From: johnny Date: Fri, 27 Feb 2026 23:31:13 -0500 Subject: [PATCH 3/3] improve query performance --- src/lib.rs | 22 +- src/query/executor.rs | 2960 -------------------------------- src/query/executor/indexing.rs | 127 +- src/query/executor/mod.rs | 137 +- src/query/executor/validate.rs | 1 + 5 files changed, 253 insertions(+), 2994 deletions(-) delete mode 100644 src/query/executor.rs diff --git a/src/lib.rs b/src/lib.rs index 7ea4d33..1010f88 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4517,9 +4517,9 @@ impl AedbInstance { for pk in &schema.primary_key { query = query.order_by(pk, Order::Asc); } - let query_result = if let Some(caller_ref) = caller.as_ref() { - self.query_with_options_as( - Some(caller_ref), + let query_result = self + .query_with_options_as( + caller.as_ref(), project_id, scope_id, query, @@ -4530,21 +4530,7 @@ impl AedbInstance { }, ) .await - 
.map_err(query_error_to_aedb)? - } else { - self.query_with_options( - project_id, - scope_id, - query, - QueryOptions { - consistency: ConsistencyMode::AtLatest, - allow_full_scan: true, - ..QueryOptions::default() - }, - ) - .await - .map_err(query_error_to_aedb)? - }; + .map_err(query_error_to_aedb)?; let Some(before) = query_result.rows.first().cloned() else { return Ok(None); diff --git a/src/query/executor.rs b/src/query/executor.rs deleted file mode 100644 index a386605..0000000 --- a/src/query/executor.rs +++ /dev/null @@ -1,2960 +0,0 @@ -use crate::catalog::Catalog; -use crate::catalog::namespace_key; -use crate::catalog::schema::TableSchema; -use crate::catalog::types::{Row, Value}; -use crate::query::error::QueryError; -use crate::query::operators::{ - AggregateOperator, FilterOperator, LimitOperator, Operator, ScanOperator, SortOperator, - compile_expr, -}; -use crate::query::plan::{Aggregate, Expr, JoinType, Query, QueryOptions}; -use crate::query::planner::{ExecutionStage, build_physical_plan}; -use crate::storage::encoded_key::EncodedKey; -use crate::storage::keyspace::KeyspaceSnapshot; -use serde::{Deserialize, Serialize}; -use std::collections::{HashMap, HashSet}; -use std::ops::Bound; - -type IndexBounds = (Bound, Bound); - -#[derive(Debug, Clone)] -enum IndexLookup { - Range { column: String, bounds: IndexBounds }, - MultiEq { column: String, values: Vec }, -} - -#[derive(Debug, Clone)] -pub struct QueryResult { - pub rows: Vec, - pub rows_examined: usize, - pub cursor: Option, - pub snapshot_seq: u64, - pub materialized_seq: Option, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) struct AccessPathDiagnostics { - pub selected_indexes: Vec, - pub predicate_evaluation_path: crate::PredicateEvaluationPath, - pub plan_trace: Vec, -} - -pub fn execute_query( - snapshot: &KeyspaceSnapshot, - catalog: &Catalog, - project_id: &str, - scope_id: &str, - query: Query, -) -> Result { - execute_query_with_options( - snapshot, - catalog, - project_id, 
- scope_id, - query, - &QueryOptions::default(), - 0, - 10_000, - ) -} - -#[allow(clippy::too_many_arguments)] -pub fn execute_query_with_options( - snapshot: &KeyspaceSnapshot, - catalog: &Catalog, - project_id: &str, - scope_id: &str, - query: Query, - options: &QueryOptions, - snapshot_seq: u64, - max_scan_rows: usize, -) -> Result { - let mut options = options.clone(); - if options.async_index.is_none() { - options.async_index = query.use_index.clone(); - } - - // Validate expression depth to prevent stack overflow from deeply nested expressions - if let Some(pred) = &query.predicate { - pred.validate_depth() - .map_err(|e| QueryError::InvalidQuery { - reason: e.to_string(), - })?; - } - if let Some(having) = &query.having { - having - .validate_depth() - .map_err(|e| QueryError::InvalidQuery { - reason: e.to_string(), - })?; - } - - if !options.allow_full_scan && query.limit.is_none() && query.predicate.is_none() { - return Err(QueryError::InvalidQuery { - reason: "full scan requires limit/cursor or allow_full_scan".into(), - }); - } - - let cursor_state = match &options.cursor { - Some(encoded) => Some(decode_cursor(encoded)?), - None => None, - }; - if let Some(cursor) = &cursor_state - && cursor.snapshot_seq != snapshot_seq - { - return Err(QueryError::InvalidQuery { - reason: "cursor snapshot_seq mismatch".into(), - }); - } - - if !query.joins.is_empty() { - return execute_join_query( - snapshot, - catalog, - project_id, - scope_id, - query, - options, - snapshot_seq, - max_scan_rows, - cursor_state, - ); - } - - let (q_project, q_scope, q_table) = resolve_table_ref(project_id, scope_id, &query.table); - let table_key = (namespace_key(&q_project, &q_scope), q_table.clone()); - let schema = catalog - .tables - .get(&table_key) - .ok_or_else(|| QueryError::TableNotFound { - project_id: q_project.clone(), - table: q_table.clone(), - })?; - let table = snapshot.table(&q_project, &q_scope, &q_table); - let mut materialized_seq = None; - validate_query(schema, 
&query)?; - - let columns: Vec = schema.columns.iter().map(|c| c.name.clone()).collect(); - let page_size = query.limit.unwrap_or_else(|| { - cursor_state - .as_ref() - .map(|c| c.page_size) - .unwrap_or(max_scan_rows.min(100)) - }); - let effective_page_size = page_size.min(max_scan_rows); - if let Some(result) = - try_primary_key_point_query(schema, table, &query, &cursor_state, snapshot_seq)? - { - return Ok(result); - } - - let estimated_rows: usize; - let row_source: Box + Send> = if let Some(async_index) = - &options.async_index - { - let projection = snapshot - .async_index(project_id, scope_id, &query.table, async_index) - .ok_or_else(|| QueryError::InvalidQuery { - reason: "async index not found".into(), - })?; - materialized_seq = Some(projection.materialized_seq); - estimated_rows = projection.rows.len(); - let rows = projection.rows.clone(); - Box::new(rows.into_iter().map(|(_, row)| row)) - } else if let (Some(predicate), Some(table)) = (&query.predicate, table) { - let table_rows = table.rows.clone(); - let indexed_pks = - indexed_pks_for_predicate(catalog, &q_project, &q_scope, &q_table, table, predicate)?; - match indexed_pks { - Some(pks) => { - estimated_rows = pks.len(); - Box::new( - pks.into_iter() - .filter_map(move |pk| table_rows.get(&pk).cloned()), - ) - } - None => { - estimated_rows = table.rows.len(); - let rows = table.rows.clone(); - Box::new(rows.into_iter().map(|(_, row)| row)) - } - } - } else { - let rows = table.map(|t| t.rows.clone()).unwrap_or_default(); - estimated_rows = rows.len(); - Box::new(rows.into_iter().map(|(_, row)| row)) - }; - - if estimated_rows > max_scan_rows && query.limit.is_none() && options.cursor.is_none() { - return Err(QueryError::ScanBoundExceeded { - estimated_rows: estimated_rows as u64, - max_scan_rows: max_scan_rows as u64, - }); - } - let physical_plan = build_physical_plan( - schema, - &query, - options.async_index.clone(), - estimated_rows as u64, - query.predicate.is_some(), - )?; - - let mut 
root: Box = Box::new(ScanOperator::new(row_source)); - let mut selected_indices: Option> = None; - let mut row_columns = columns.clone(); - for stage in &physical_plan.stages { - match stage { - ExecutionStage::Scan => {} - ExecutionStage::Limit => { - if cursor_state.is_none() - && query.order_by.is_empty() - && query.aggregates.is_empty() - && query.having.is_none() - { - root = Box::new(LimitOperator::new( - root, - effective_page_size.saturating_add(1), - )); - } - } - ExecutionStage::Filter => { - if let Some(predicate) = query.predicate.clone() { - let compiled = compile_expr(&predicate, &columns, &query.table)?; - root = Box::new(FilterOperator::new(root, compiled)); - } - } - ExecutionStage::Sort => { - let order_by = query - .order_by - .iter() - .map(|(order_col, order)| { - row_columns - .iter() - .position(|c| c == order_col) - .map(|idx| (idx, *order)) - .ok_or_else(|| QueryError::ColumnNotFound { - table: query.table.clone(), - column: order_col.clone(), - }) - }) - .collect::, _>>()?; - let top_k_limit = if cursor_state.is_none() { - Some(effective_page_size.saturating_add(1)) - } else { - None - }; - root = Box::new(SortOperator::new_with_limit(root, order_by, top_k_limit)); - } - ExecutionStage::Aggregate => { - let group_by_idx = query - .group_by - .iter() - .map(|name| { - columns.iter().position(|c| c == name).ok_or_else(|| { - QueryError::ColumnNotFound { - table: query.table.clone(), - column: name.clone(), - } - }) - }) - .collect::, _>>()?; - let agg_col_idx = query - .aggregates - .iter() - .map(|agg| aggregate_col_idx(agg, &columns)) - .collect::, _>>()?; - root = Box::new(AggregateOperator::new( - root, - query.aggregates.clone(), - group_by_idx, - agg_col_idx, - )); - row_columns = query.group_by.clone(); - row_columns.extend(query.aggregates.iter().map(aggregate_output_name)); - } - ExecutionStage::Having => { - if query.aggregates.is_empty() { - return Err(QueryError::InvalidQuery { - reason: "having requires aggregate or 
group_by".into(), - }); - } - if let Some(having) = query.having.clone() { - let compiled = compile_expr(&having, &row_columns, &query.table)?; - root = Box::new(FilterOperator::new(root, compiled)); - } - } - ExecutionStage::Project => { - selected_indices = Some( - query - .select - .iter() - .map(|col| { - row_columns.iter().position(|c| c == col).ok_or_else(|| { - QueryError::ColumnNotFound { - table: query.table.clone(), - column: col.clone(), - } - }) - }) - .collect::, _>>()?, - ); - } - } - } - - let sort_indices: Vec<(usize, crate::query::plan::Order)> = if !query.order_by.is_empty() { - query - .order_by - .iter() - .filter_map(|(name, ord)| { - row_columns - .iter() - .position(|c| c == name) - .map(|i| (i, *ord)) - }) - .collect() - } else { - Vec::new() - }; - let pk_indices: Vec = if !query.aggregates.is_empty() { - (0..row_columns.len()).collect() - } else { - schema - .primary_key - .iter() - .filter_map(|pk| row_columns.iter().position(|c| c == pk)) - .collect() - }; - let mut sliced: Vec = Vec::new(); - while let Some(row) = root.next() { - if let Some(cursor) = &cursor_state - && !row_after_cursor(&row, cursor, &sort_indices, &pk_indices) - { - continue; - } - sliced.push(row); - if sliced.len() > effective_page_size { - break; - } - } - let has_more = sliced.len() > effective_page_size; - if has_more { - sliced.truncate(effective_page_size); - } - let cursor_last_row = sliced.last().cloned(); - let sliced: Vec = if let Some(selected) = &selected_indices { - sliced - .into_iter() - .map(|row| Row { - values: selected - .iter() - .map(|idx| row.values[*idx].clone()) - .collect(), - }) - .collect() - } else { - sliced - }; - let cursor = if has_more { - let last_row = cursor_last_row.ok_or_else(|| QueryError::InvalidQuery { - reason: "invalid cursor state".into(), - })?; - Some(encode_cursor(&CursorToken { - snapshot_seq, - last_sort_key: extract_sort_key(&last_row, &sort_indices), - last_pk: extract_pk_key(&last_row, &pk_indices), - page_size, - 
remaining_limit: None, - })?) - } else { - None - }; - - Ok(QueryResult { - rows: sliced, - rows_examined: root.rows_examined(), - cursor, - snapshot_seq, - materialized_seq, - }) -} - -pub(crate) fn explain_access_path_for_query( - snapshot: &KeyspaceSnapshot, - catalog: &Catalog, - project_id: &str, - scope_id: &str, - query: &Query, - options: &QueryOptions, -) -> Result { - if !query.joins.is_empty() { - let mut trace = Vec::new(); - trace.push("join query: predicate evaluation happens after join execution".to_string()); - if query.predicate.is_some() { - trace.push("post-join filter stage evaluates query predicate".to_string()); - } - return Ok(AccessPathDiagnostics { - selected_indexes: Vec::new(), - predicate_evaluation_path: crate::PredicateEvaluationPath::JoinExecution, - plan_trace: trace, - }); - } - - let mut selected_indexes = Vec::new(); - let mut trace = Vec::new(); - let mut predicate_evaluation_path = crate::PredicateEvaluationPath::None; - - let mut effective_options = options.clone(); - if effective_options.async_index.is_none() { - effective_options.async_index = query.use_index.clone(); - } - - if let Some(async_index) = &effective_options.async_index { - selected_indexes.push(async_index.clone()); - trace.push(format!( - "selected async index projection '{async_index}' as row source" - )); - predicate_evaluation_path = crate::PredicateEvaluationPath::AsyncIndexProjection; - if query.predicate.is_some() { - trace.push("query predicate is evaluated as filter on projected rows".to_string()); - } - return Ok(AccessPathDiagnostics { - selected_indexes, - predicate_evaluation_path, - plan_trace: trace, - }); - } - - let table_key = (namespace_key(project_id, scope_id), query.table.clone()); - let schema = catalog - .tables - .get(&table_key) - .ok_or_else(|| QueryError::TableNotFound { - project_id: project_id.to_string(), - table: query.table.clone(), - })?; - let table = snapshot.table(project_id, scope_id, &query.table); - - if let 
Some(predicate) = query.predicate.as_ref() { - if query.limit != Some(0) - && query.group_by.is_empty() - && query.aggregates.is_empty() - && query.having.is_none() - && query.order_by.is_empty() - && options.cursor.is_none() - && extract_primary_key_values(predicate, &schema.primary_key).is_some() - { - trace.push("primary-key equality predicate detected; using direct row lookup".into()); - return Ok(AccessPathDiagnostics { - selected_indexes, - predicate_evaluation_path: crate::PredicateEvaluationPath::PrimaryKeyEqLookup, - plan_trace: trace, - }); - } - - if let Some(table) = table { - let indexed = indexed_pks_for_predicate_with_trace( - catalog, - project_id, - scope_id, - &query.table, - table, - predicate, - )?; - if let Some(indexed) = indexed { - if !indexed.selected_indexes.is_empty() { - selected_indexes.extend(indexed.selected_indexes.clone()); - predicate_evaluation_path = - crate::PredicateEvaluationPath::SecondaryIndexLookup; - } else { - predicate_evaluation_path = crate::PredicateEvaluationPath::FullScanFilter; - } - trace.extend(indexed.plan_trace); - if matches!( - predicate_evaluation_path, - crate::PredicateEvaluationPath::FullScanFilter - ) { - trace.push( - "no matching secondary index; evaluating predicate during table scan" - .to_string(), - ); - } else { - trace.push( - "residual predicate is evaluated on rows returned by index lookup" - .to_string(), - ); - } - return Ok(AccessPathDiagnostics { - selected_indexes, - predicate_evaluation_path, - plan_trace: trace, - }); - } - } - - trace.push("predicate not indexable for current schema/index set".to_string()); - return Ok(AccessPathDiagnostics { - selected_indexes, - predicate_evaluation_path: crate::PredicateEvaluationPath::FullScanFilter, - plan_trace: trace, - }); - } - - trace.push("no predicate supplied; full table scan path".to_string()); - Ok(AccessPathDiagnostics { - selected_indexes, - predicate_evaluation_path, - plan_trace: trace, - }) -} - -#[allow(clippy::too_many_arguments)] 
-fn execute_join_query( - snapshot: &KeyspaceSnapshot, - catalog: &Catalog, - project_id: &str, - scope_id: &str, - query: Query, - options: QueryOptions, - snapshot_seq: u64, - max_scan_rows: usize, - cursor_state: Option, -) -> Result { - let (base_ns_project, base_ns_scope, base_table) = - resolve_table_ref(project_id, scope_id, &query.table); - let base_schema = catalog - .tables - .get(&( - namespace_key(&base_ns_project, &base_ns_scope), - base_table.clone(), - )) - .ok_or_else(|| QueryError::TableNotFound { - project_id: base_ns_project.clone(), - table: base_table.clone(), - })?; - let base_alias = query.table_alias.clone().unwrap_or(base_table.clone()); - let mut columns: Vec = base_schema - .columns - .iter() - .map(|c| format!("{base_alias}.{}", c.name)) - .collect(); - let base_count = snapshot - .table(&base_ns_project, &base_ns_scope, &base_table) - .map(|t| t.rows.len()) - .unwrap_or(0); - if !options.allow_full_scan && base_count > max_scan_rows { - return Err(QueryError::ScanBoundExceeded { - estimated_rows: base_count as u64, - max_scan_rows: max_scan_rows as u64, - }); - } - let mut rows: Vec = snapshot - .table(&base_ns_project, &base_ns_scope, &base_table) - .map(|t| t.rows.values().cloned().collect()) - .unwrap_or_default(); - - for join in &query.joins { - let (jp, js, jt) = resolve_table_ref(project_id, scope_id, &join.table); - let join_schema = catalog - .tables - .get(&(namespace_key(&jp, &js), jt.clone())) - .ok_or_else(|| QueryError::TableNotFound { - project_id: jp.clone(), - table: jt.clone(), - })?; - let join_alias = join.alias.clone().unwrap_or(jt.clone()); - let join_rows: Vec<&Row> = snapshot - .table(&jp, &js, &jt) - .map(|t| t.rows.values().collect()) - .unwrap_or_default(); - if !options.allow_full_scan - && rows.len().saturating_mul(join_rows.len().max(1)) > max_scan_rows - { - return Err(QueryError::ScanBoundExceeded { - estimated_rows: rows.len().saturating_mul(join_rows.len().max(1)) as u64, - max_scan_rows: max_scan_rows 
as u64, - }); - } - let join_col_offset = columns.len(); - let mut next_columns = columns.clone(); - next_columns.extend( - join_schema - .columns - .iter() - .map(|c| format!("{join_alias}.{}", c.name)), - ); - let (left_idx, right_idx) = match join.join_type { - JoinType::Cross => (None, None), - _ => { - let left = join - .left_column - .as_ref() - .ok_or_else(|| QueryError::InvalidQuery { - reason: "join requires left_column".into(), - })?; - let right = join - .right_column - .as_ref() - .ok_or_else(|| QueryError::InvalidQuery { - reason: "join requires right_column".into(), - })?; - let left_idx = columns.iter().position(|c| c == left).ok_or_else(|| { - QueryError::ColumnNotFound { - table: query.table.clone(), - column: left.clone(), - } - })?; - let right_idx = join_schema - .columns - .iter() - .position(|c| format!("{join_alias}.{}", c.name) == *right || c.name == *right) - .ok_or_else(|| QueryError::ColumnNotFound { - table: join.table.clone(), - column: right.clone(), - })?; - (Some(left_idx), Some(right_idx)) - } - }; - - let mut joined = Vec::new(); - match join.join_type { - JoinType::Cross => { - for left in &rows { - for right in &join_rows { - let mut values = left.values.clone(); - values.extend(right.values.clone()); - joined.push(Row { values }); - } - } - } - JoinType::Inner | JoinType::Left => { - let right_idx = right_idx.ok_or_else(|| QueryError::InvalidQuery { - reason: "join requires right join key".into(), - })?; - let left_idx = left_idx.ok_or_else(|| QueryError::InvalidQuery { - reason: "join requires left join key".into(), - })?; - // Hash join for equality predicates. 
- let mut right_map: HashMap> = HashMap::new(); - for right in &join_rows { - right_map - .entry(right.values[right_idx].clone()) - .or_default() - .push(right); - } - for left in &rows { - let key = left.values[left_idx].clone(); - if let Some(matches) = right_map.get(&key) { - for right in matches { - let mut values = left.values.clone(); - values.extend(right.values.clone()); - joined.push(Row { values }); - } - } else if matches!(join.join_type, JoinType::Left) { - let mut values = left.values.clone(); - values.extend(std::iter::repeat_n(Value::Null, join_schema.columns.len())); - joined.push(Row { values }); - } - } - } - JoinType::Right => { - let left_idx = left_idx.ok_or_else(|| QueryError::InvalidQuery { - reason: "join requires left join key".into(), - })?; - let right_idx = right_idx.ok_or_else(|| QueryError::InvalidQuery { - reason: "join requires right join key".into(), - })?; - let mut left_map: HashMap> = HashMap::new(); - for left in &rows { - left_map - .entry(left.values[left_idx].clone()) - .or_default() - .push(left); - } - for right in &join_rows { - let key = right.values[right_idx].clone(); - if let Some(matches) = left_map.get(&key) { - for left in matches { - let mut values = left.values.clone(); - values.extend(right.values.clone()); - joined.push(Row { values }); - } - } else { - let mut values = - std::iter::repeat_n(Value::Null, join_col_offset).collect::>(); - values.extend(right.values.clone()); - joined.push(Row { values }); - } - } - } - } - rows = joined; - if !options.allow_full_scan && rows.len() > max_scan_rows { - return Err(QueryError::ScanBoundExceeded { - estimated_rows: rows.len() as u64, - max_scan_rows: max_scan_rows as u64, - }); - } - columns = next_columns; - } - - if let Some(predicate) = &query.predicate { - let compiled = compile_expr(predicate, &columns, "join")?; - rows.retain(|r| crate::query::operators::eval_compiled_expr_public(&compiled, r)); - } - - if !query.aggregates.is_empty() { - let group_by_idx = query 
- .group_by - .iter() - .map(|name| { - columns - .iter() - .position(|c| c == name) - .ok_or_else(|| QueryError::ColumnNotFound { - table: "join".into(), - column: name.clone(), - }) - }) - .collect::, _>>()?; - let agg_col_idx = query - .aggregates - .iter() - .map(|agg| aggregate_col_idx(agg, &columns)) - .collect::, _>>()?; - - let mut aggregate = AggregateOperator::new( - Box::new(ScanOperator::new(rows)), - query.aggregates.clone(), - group_by_idx, - agg_col_idx, - ); - let mut aggregated_rows = Vec::new(); - while let Some(row) = aggregate.next() { - aggregated_rows.push(row); - } - rows = aggregated_rows; - columns = query.group_by.clone(); - columns.extend(query.aggregates.iter().map(aggregate_output_name)); - } - - if let Some(having) = &query.having { - if query.aggregates.is_empty() { - return Err(QueryError::InvalidQuery { - reason: "having requires aggregate or group_by".into(), - }); - } - let compiled = compile_expr(having, &columns, "join")?; - rows.retain(|r| crate::query::operators::eval_compiled_expr_public(&compiled, r)); - } - - if !query.order_by.is_empty() { - let order_pairs: Vec<(usize, crate::query::plan::Order)> = query - .order_by - .iter() - .map(|(col, ord)| { - columns - .iter() - .position(|c| c == col) - .map(|idx| (idx, *ord)) - .ok_or_else(|| QueryError::ColumnNotFound { - table: "join".into(), - column: col.clone(), - }) - }) - .collect::>()?; - rows.sort_by(|a, b| { - for (idx, ord) in &order_pairs { - let cmp = a.values[*idx].cmp(&b.values[*idx]); - let ord_cmp = match ord { - crate::query::plan::Order::Asc => cmp, - crate::query::plan::Order::Desc => cmp.reverse(), - }; - if !ord_cmp.is_eq() { - return ord_cmp; - } - } - std::cmp::Ordering::Equal - }); - } - - let rows_examined = rows.len(); - let page_size = query.limit.unwrap_or_else(|| { - cursor_state - .as_ref() - .map(|c| c.page_size) - .unwrap_or(max_scan_rows.min(100)) - }); - let effective_page_size = page_size.min(max_scan_rows); - let sort_indices: Vec<(usize, 
crate::query::plan::Order)> = if !query.order_by.is_empty() { - query - .order_by - .iter() - .filter_map(|(name, ord)| columns.iter().position(|c| c == name).map(|i| (i, *ord))) - .collect() - } else { - Vec::new() - }; - let pk_indices: Vec = (0..columns.len()).collect(); - let mut sliced = Vec::new(); - for row in rows { - if let Some(cursor) = &cursor_state - && !row_after_cursor(&row, cursor, &sort_indices, &pk_indices) - { - continue; - } - sliced.push(row); - if sliced.len() > effective_page_size { - break; - } - } - let has_more = sliced.len() > effective_page_size; - if has_more { - sliced.truncate(effective_page_size); - } - let cursor_last_row = sliced.last().cloned(); - - if !query.select.is_empty() && query.select[0] != "*" { - let idxs: Vec = query - .select - .iter() - .map(|col| { - columns - .iter() - .position(|c| c == col) - .ok_or_else(|| QueryError::ColumnNotFound { - table: "join".into(), - column: col.clone(), - }) - }) - .collect::>()?; - sliced = sliced - .into_iter() - .map(|r| Row { - values: idxs.iter().map(|i| r.values[*i].clone()).collect(), - }) - .collect(); - } - - let cursor = if has_more { - let last_row = cursor_last_row.ok_or_else(|| QueryError::InvalidQuery { - reason: "invalid cursor state".into(), - })?; - Some(encode_cursor(&CursorToken { - snapshot_seq, - last_sort_key: extract_sort_key(&last_row, &sort_indices), - last_pk: extract_pk_key(&last_row, &pk_indices), - page_size, - remaining_limit: None, - })?) 
- } else { - None - }; - Ok(QueryResult { - rows_examined, - rows: sliced, - cursor, - snapshot_seq, - materialized_seq: None, - }) -} - -fn resolve_table_ref( - project_id: &str, - scope_id: &str, - table_ref: &str, -) -> (String, String, String) { - if let Some(name) = table_ref.strip_prefix("_global.") { - return ("_global".to_string(), "app".to_string(), name.to_string()); - } - ( - project_id.to_string(), - scope_id.to_string(), - table_ref.to_string(), - ) -} - -fn try_primary_key_point_query( - schema: &TableSchema, - table: Option<&crate::storage::keyspace::TableData>, - query: &Query, - cursor_state: &Option, - snapshot_seq: u64, -) -> Result, QueryError> { - if cursor_state.is_some() - || query.predicate.is_none() - || !query.group_by.is_empty() - || !query.aggregates.is_empty() - || query.having.is_some() - || !query.order_by.is_empty() - { - return Ok(None); - } - if query.limit == Some(0) { - return Ok(Some(QueryResult { - rows: Vec::new(), - rows_examined: 0, - cursor: None, - snapshot_seq, - materialized_seq: None, - })); - } - - let Some(predicate) = query.predicate.as_ref() else { - return Ok(None); - }; - let Some(primary_key) = extract_primary_key_values(predicate, &schema.primary_key) else { - return Ok(None); - }; - - let selected_indices = resolve_selected_indices(schema, query)?; - let encoded_pk = EncodedKey::from_values(&primary_key); - let maybe_row = table.and_then(|t| t.rows.get(&encoded_pk)); - let rows = match maybe_row { - Some(row) => vec![project_selected_row(row, selected_indices.as_deref())], - None => Vec::new(), - }; - - Ok(Some(QueryResult { - rows, - rows_examined: 1, - cursor: None, - snapshot_seq, - materialized_seq: None, - })) -} - -fn resolve_selected_indices( - schema: &TableSchema, - query: &Query, -) -> Result>, QueryError> { - if query.select.len() == 1 && query.select[0] == "*" { - return Ok(None); - } - let mut indices = Vec::with_capacity(query.select.len()); - for col in &query.select { - let column_index = schema 
- .columns - .iter() - .position(|c| c.name == *col) - .ok_or_else(|| QueryError::ColumnNotFound { - table: query.table.clone(), - column: col.clone(), - })?; - indices.push(column_index); - } - Ok(Some(indices)) -} - -fn project_selected_row(row: &Row, selected_indices: Option<&[usize]>) -> Row { - match selected_indices { - Some(indices) => Row { - values: indices.iter().map(|idx| row.values[*idx].clone()).collect(), - }, - None => row.clone(), - } -} - -fn extract_primary_key_values(predicate: &Expr, primary_key: &[String]) -> Option> { - if primary_key.is_empty() { - return None; - } - let mut equalities: HashMap = HashMap::new(); - if !collect_eq_constraints(predicate, &mut equalities) { - return None; - } - if equalities.len() != primary_key.len() { - return None; - } - let mut values = Vec::with_capacity(primary_key.len()); - for key_col in primary_key { - let value = equalities.get(key_col)?; - values.push(value.clone()); - } - Some(values) -} - -fn collect_eq_constraints(expr: &Expr, equalities: &mut HashMap) -> bool { - match expr { - Expr::Eq(column, value) => { - if let Some(existing) = equalities.get(column) { - existing == value - } else { - equalities.insert(column.clone(), value.clone()); - true - } - } - Expr::And(lhs, rhs) => { - collect_eq_constraints(lhs, equalities) && collect_eq_constraints(rhs, equalities) - } - _ => false, - } -} - -fn validate_query(schema: &TableSchema, query: &Query) -> Result<(), QueryError> { - for (col, _) in &query.order_by { - if !schema.columns.iter().any(|c| c.name == *col) { - return Err(QueryError::ColumnNotFound { - table: query.table.clone(), - column: col.clone(), - }); - } - } - for col in &query.group_by { - if !schema.columns.iter().any(|c| c.name == *col) { - return Err(QueryError::ColumnNotFound { - table: query.table.clone(), - column: col.clone(), - }); - } - } - if let Some(expr) = &query.predicate { - validate_expr_types(schema, expr)?; - } - Ok(()) -} - -fn extract_sort_key(row: &Row, sort_indices: 
&[(usize, crate::query::plan::Order)]) -> Vec { - if sort_indices.is_empty() { - return row.values.clone(); - } - sort_indices - .iter() - .map(|(idx, _)| row.values[*idx].clone()) - .collect() -} - -fn extract_pk_key(row: &Row, pk_indices: &[usize]) -> Vec { - if pk_indices.is_empty() { - return row.values.clone(); - } - pk_indices - .iter() - .map(|idx| row.values[*idx].clone()) - .collect() -} - -fn row_after_cursor( - row: &Row, - cursor: &CursorToken, - sort_indices: &[(usize, crate::query::plan::Order)], - pk_indices: &[usize], -) -> bool { - let row_sort = extract_sort_key(row, sort_indices); - let row_pk = extract_pk_key(row, pk_indices); - if sort_indices.is_empty() { - return row_pk > cursor.last_pk; - } - for ((_, order), (lhs, rhs)) in sort_indices - .iter() - .zip(row_sort.iter().zip(cursor.last_sort_key.iter())) - { - let cmp = lhs.cmp(rhs); - if cmp.is_eq() { - continue; - } - return match order { - crate::query::plan::Order::Asc => cmp.is_gt(), - crate::query::plan::Order::Desc => cmp.is_lt(), - }; - } - row_pk > cursor.last_pk -} - -fn validate_expr_types( - schema: &TableSchema, - expr: &crate::query::plan::Expr, -) -> Result<(), QueryError> { - use crate::catalog::types::ColumnType; - use crate::query::plan::Expr; - - let find_col_type = |name: &str| -> Result { - schema - .columns - .iter() - .find(|c| c.name == name) - .map(|c| c.col_type.clone()) - .ok_or_else(|| QueryError::ColumnNotFound { - table: schema.table_name.clone(), - column: name.to_string(), - }) - }; - - let value_compatible = |col_type: &ColumnType, value: &Value| -> bool { - matches!(value, Value::Null) - || match col_type { - ColumnType::U8 => matches!( - value, - Value::U8(_) | Value::Integer(_) | Value::Float(_) | Value::Timestamp(_) - ), - ColumnType::Integer => matches!( - value, - Value::U8(_) | Value::Integer(_) | Value::Float(_) | Value::Timestamp(_) - ), - ColumnType::Float => matches!( - value, - Value::U8(_) | Value::Integer(_) | Value::Float(_) | Value::Timestamp(_) 
- ), - ColumnType::Timestamp => matches!( - value, - Value::U8(_) | Value::Integer(_) | Value::Float(_) | Value::Timestamp(_) - ), - ColumnType::Text => matches!(value, Value::Text(_)), - ColumnType::Boolean => matches!(value, Value::Boolean(_)), - ColumnType::U256 => matches!(value, Value::U256(_)), - ColumnType::I256 => matches!(value, Value::I256(_)), - ColumnType::Blob => matches!(value, Value::Blob(_)), - ColumnType::Json => matches!(value, Value::Json(_) | Value::Text(_)), - } - }; - - match expr { - Expr::Eq(c, v) - | Expr::Ne(c, v) - | Expr::Lt(c, v) - | Expr::Lte(c, v) - | Expr::Gt(c, v) - | Expr::Gte(c, v) => { - let t = find_col_type(c)?; - if !value_compatible(&t, v) { - return Err(QueryError::TypeMismatch { - column: c.clone(), - expected: format!("{t:?}"), - got: format!("{v:?}"), - }); - } - } - Expr::In(c, values) => { - let t = find_col_type(c)?; - if !values.iter().all(|v| value_compatible(&t, v)) { - return Err(QueryError::TypeMismatch { - column: c.clone(), - expected: format!("{t:?}"), - got: "IN literal".to_string(), - }); - } - } - Expr::Between(c, lo, hi) => { - let t = find_col_type(c)?; - if !value_compatible(&t, lo) || !value_compatible(&t, hi) { - return Err(QueryError::TypeMismatch { - column: c.clone(), - expected: format!("{t:?}"), - got: "BETWEEN literal".to_string(), - }); - } - } - Expr::Like(c, _) => { - let t = find_col_type(c)?; - if !matches!(t, ColumnType::Text) { - return Err(QueryError::TypeMismatch { - column: c.clone(), - expected: "Text".to_string(), - got: format!("{t:?}"), - }); - } - } - Expr::IsNull(c) | Expr::IsNotNull(c) => { - let _ = find_col_type(c)?; - } - Expr::And(a, b) | Expr::Or(a, b) => { - validate_expr_types(schema, a)?; - validate_expr_types(schema, b)?; - } - Expr::Not(a) => validate_expr_types(schema, a)?, - } - Ok(()) -} - -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -struct CursorToken { - snapshot_seq: u64, - last_sort_key: Vec, - last_pk: Vec, - page_size: usize, - 
remaining_limit: Option, -} - -fn encode_cursor(cursor: &CursorToken) -> Result { - let bytes = rmp_serde::to_vec(cursor).map_err(|e| QueryError::InternalError(e.to_string()))?; - Ok(bytes.iter().map(|b| format!("{b:02x}")).collect()) -} - -fn decode_cursor(encoded: &str) -> Result { - let encoded_size_bytes = encoded.len(); - if !encoded_size_bytes.is_multiple_of(2) { - return Err(QueryError::InvalidQuery { - reason: "invalid cursor".into(), - }); - } - let mut decoded_bytes = Vec::with_capacity(encoded_size_bytes / 2); - let encoded_bytes = encoded.as_bytes(); - for byte_offset in (0..encoded_bytes.len()).step_by(2) { - let hi = decode_hex_nibble(encoded_bytes[byte_offset]).ok_or_else(|| { - QueryError::InvalidQuery { - reason: "invalid cursor".into(), - } - })?; - let lo = decode_hex_nibble(encoded_bytes[byte_offset + 1]).ok_or_else(|| { - QueryError::InvalidQuery { - reason: "invalid cursor".into(), - } - })?; - decoded_bytes.push((hi << 4) | lo); - } - rmp_serde::from_slice(&decoded_bytes).map_err(|e| QueryError::InvalidQuery { - reason: e.to_string(), - }) -} - -fn decode_hex_nibble(byte: u8) -> Option { - match byte { - b'0'..=b'9' => Some(byte - b'0'), - b'a'..=b'f' => Some(byte - b'a' + 10), - b'A'..=b'F' => Some(byte - b'A' + 10), - _ => None, - } -} - -fn aggregate_col_idx(agg: &Aggregate, columns: &[String]) -> Result, QueryError> { - let column_index = match agg { - Aggregate::Count => return Ok(None), - Aggregate::Sum(col) | Aggregate::Min(col) | Aggregate::Max(col) | Aggregate::Avg(col) => { - columns - .iter() - .position(|c| c == col) - .ok_or_else(|| QueryError::ColumnNotFound { - table: "".to_string(), - column: col.clone(), - })? 
- } - }; - Ok(Some(column_index)) -} - -fn aggregate_output_name(agg: &Aggregate) -> String { - match agg { - Aggregate::Count => "count_star".to_string(), - Aggregate::Sum(col) => format!("sum_{col}"), - Aggregate::Min(col) => format!("min_{col}"), - Aggregate::Max(col) => format!("max_{col}"), - Aggregate::Avg(col) => format!("avg_{col}"), - } -} - -fn indexed_pks_for_predicate( - catalog: &Catalog, - project_id: &str, - scope_id: &str, - table_name: &str, - table: &crate::storage::keyspace::TableData, - predicate: &crate::query::plan::Expr, -) -> Result>, QueryError> { - Ok(indexed_pks_for_predicate_with_trace( - catalog, project_id, scope_id, table_name, table, predicate, - )? - .map(|result| result.pks)) -} - -#[derive(Debug, Clone)] -struct IndexLookupResult { - pks: Vec, - selected_indexes: Vec, - plan_trace: Vec, -} - -fn indexed_pks_for_predicate_with_trace( - catalog: &Catalog, - project_id: &str, - scope_id: &str, - table_name: &str, - table: &crate::storage::keyspace::TableData, - predicate: &crate::query::plan::Expr, -) -> Result, QueryError> { - use crate::query::plan::Expr; - - match predicate { - Expr::And(lhs, rhs) => { - let left = indexed_pks_for_predicate_with_trace( - catalog, project_id, scope_id, table_name, table, lhs, - )?; - let right = indexed_pks_for_predicate_with_trace( - catalog, project_id, scope_id, table_name, table, rhs, - )?; - return Ok(match (left, right) { - (Some(left), Some(right)) => Some(IndexLookupResult { - pks: intersect_pks(left.pks, right.pks), - selected_indexes: merge_selected_indexes( - left.selected_indexes, - right.selected_indexes, - ), - plan_trace: merge_trace( - "AND predicate combines indexed candidates with intersection", - left.plan_trace, - right.plan_trace, - ), - }), - (Some(left), None) => Some(IndexLookupResult { - plan_trace: merge_trace_single( - "AND predicate uses indexed left side; right side will be residual filter", - left.plan_trace, - ), - ..left - }), - (None, Some(right)) => 
Some(IndexLookupResult { - plan_trace: merge_trace_single( - "AND predicate uses indexed right side; left side will be residual filter", - right.plan_trace, - ), - ..right - }), - (None, None) => None, - }); - } - Expr::Or(lhs, rhs) => { - let left = indexed_pks_for_predicate_with_trace( - catalog, project_id, scope_id, table_name, table, lhs, - )?; - let right = indexed_pks_for_predicate_with_trace( - catalog, project_id, scope_id, table_name, table, rhs, - )?; - return Ok(match (left, right) { - (Some(left), Some(right)) => Some(IndexLookupResult { - pks: union_pks(left.pks, right.pks), - selected_indexes: merge_selected_indexes( - left.selected_indexes, - right.selected_indexes, - ), - plan_trace: merge_trace( - "OR predicate combines indexed candidates with union", - left.plan_trace, - right.plan_trace, - ), - }), - _ => None, - }); - } - _ => {} - } - - let mut equalities = HashMap::new(); - let eq_only = collect_eq_constraints(predicate, &mut equalities); - let Some(lookup) = extract_indexable_predicate(predicate) else { - if !eq_only { - return Ok(None); - } - // Composite + leftmost-prefix support for conjunctions of equality predicates. 
- let ns = namespace_key(project_id, scope_id); - let mut best: Option<(String, usize)> = None; - for ((p, t, idx_name), idx_def) in &catalog.indexes { - if p != &ns || t != table_name || !table.indexes.contains_key(idx_name) { - continue; - } - if let Some(filter) = &idx_def.partial_filter - && !expr_implied_by_eq_constraints(filter, &equalities) - { - continue; - } - let mut prefix_cols = 0usize; - for col in &idx_def.columns { - if equalities.contains_key(col) { - prefix_cols += 1; - } else { - break; - } - } - if prefix_cols == 0 { - continue; - } - if best.as_ref().map(|(_, c)| *c).unwrap_or(0) < prefix_cols { - best = Some((idx_name.clone(), prefix_cols)); - } - } - let Some((idx_name, prefix_cols)) = best else { - return Ok(None); - }; - let selected_index = - table - .indexes - .get(&idx_name) - .ok_or_else(|| QueryError::InvalidQuery { - reason: "index not found".into(), - })?; - let idx_def = catalog - .indexes - .get(&(ns, table_name.to_string(), idx_name.clone())) - .ok_or_else(|| QueryError::InvalidQuery { - reason: "index definition not found".into(), - })?; - let prefix_values = idx_def - .columns - .iter() - .take(prefix_cols) - .filter_map(|c| equalities.get(c).cloned()) - .collect::>(); - let encoded = EncodedKey::from_values(&prefix_values); - let pks = if prefix_cols == idx_def.columns.len() { - selected_index.scan_eq(&encoded) - } else { - selected_index.scan_prefix(&encoded) - }; - return Ok(Some(IndexLookupResult { - pks, - selected_indexes: vec![idx_name.clone()], - plan_trace: vec![format!( - "selected composite index '{idx_name}' with leftmost prefix columns={prefix_cols}" - )], - })); - }; - let column = match &lookup { - IndexLookup::Range { column, .. } => column, - IndexLookup::MultiEq { column, .. 
} => column, - }; - - let mut selected_index_name: Option = None; - let ns = namespace_key(project_id, scope_id); - for ((p, t, idx_name), idx_def) in &catalog.indexes { - if p == &ns - && t == table_name - && idx_def.columns.len() == 1 - && idx_def.columns[0] == *column - && idx_def - .partial_filter - .as_ref() - .map(|f| expr_implied_by_eq_constraints(f, &equalities)) - .unwrap_or(true) - && table.indexes.contains_key(idx_name) - { - selected_index_name = Some(idx_name.clone()); - break; - } - } - - let Some(index_name) = selected_index_name else { - return Ok(None); - }; - let Some(index) = table.indexes.get(&index_name) else { - return Ok(None); - }; - - let pks = match lookup.clone() { - IndexLookup::Range { bounds, .. } => index.scan_range(bounds.0, bounds.1), - IndexLookup::MultiEq { values, .. } => values - .into_iter() - .flat_map(|v| index.scan_eq(&EncodedKey::from_values(&[v]))) - .collect(), - }; - Ok(Some(IndexLookupResult { - pks, - selected_indexes: vec![index_name.clone()], - plan_trace: vec![format!( - "selected single-column index '{index_name}' for predicate on '{column}'" - )], - })) -} - -fn merge_selected_indexes(left: Vec, right: Vec) -> Vec { - let mut out = Vec::with_capacity(left.len() + right.len()); - for name in left.into_iter().chain(right) { - if !out.contains(&name) { - out.push(name); - } - } - out -} - -fn merge_trace(header: &str, mut left: Vec, right: Vec) -> Vec { - let mut out = Vec::with_capacity(1 + left.len() + right.len()); - out.push(header.to_string()); - out.append(&mut left); - out.extend(right); - out -} - -fn merge_trace_single(header: &str, mut trace: Vec) -> Vec { - let mut out = Vec::with_capacity(1 + trace.len()); - out.push(header.to_string()); - out.append(&mut trace); - out -} - -fn intersect_pks(left: Vec, right: Vec) -> Vec { - let mut right_set: HashSet = HashSet::with_capacity(right.len()); - right_set.extend(right); - let mut out = Vec::with_capacity(left.len().min(right_set.len())); - for pk in left { - 
if right_set.contains(&pk) { - out.push(pk); - } - } - out -} - -fn union_pks(left: Vec, right: Vec) -> Vec { - let mut seen: HashSet = HashSet::with_capacity(left.len() + right.len()); - let mut out = Vec::with_capacity(left.len() + right.len()); - for pk in left.into_iter().chain(right) { - if seen.insert(pk.clone()) { - out.push(pk); - } - } - out -} - -fn expr_implied_by_eq_constraints( - expr: &crate::query::plan::Expr, - equalities: &HashMap, -) -> bool { - use crate::query::plan::Expr; - match expr { - Expr::Eq(col, val) => equalities.get(col) == Some(val), - Expr::And(lhs, rhs) => { - expr_implied_by_eq_constraints(lhs, equalities) - && expr_implied_by_eq_constraints(rhs, equalities) - } - _ => false, - } -} - -fn extract_indexable_predicate(predicate: &crate::query::plan::Expr) -> Option { - use crate::query::plan::Expr; - - match predicate { - Expr::Eq(c, v) => Some(IndexLookup::Range { - column: c.clone(), - bounds: ( - Bound::Included(EncodedKey::from_values(std::slice::from_ref(v))), - Bound::Included(EncodedKey::from_values(std::slice::from_ref(v))), - ), - }), - Expr::In(c, values) => Some(IndexLookup::MultiEq { - column: c.clone(), - values: values.clone(), - }), - Expr::Lt(c, v) => Some(IndexLookup::Range { - column: c.clone(), - bounds: ( - Bound::Unbounded, - Bound::Excluded(EncodedKey::from_values(std::slice::from_ref(v))), - ), - }), - Expr::Lte(c, v) => Some(IndexLookup::Range { - column: c.clone(), - bounds: ( - Bound::Unbounded, - Bound::Included(EncodedKey::from_values(std::slice::from_ref(v))), - ), - }), - Expr::Gt(c, v) => Some(IndexLookup::Range { - column: c.clone(), - bounds: ( - Bound::Excluded(EncodedKey::from_values(std::slice::from_ref(v))), - Bound::Unbounded, - ), - }), - Expr::Gte(c, v) => Some(IndexLookup::Range { - column: c.clone(), - bounds: ( - Bound::Included(EncodedKey::from_values(std::slice::from_ref(v))), - Bound::Unbounded, - ), - }), - Expr::Between(c, lo, hi) => Some(IndexLookup::Range { - column: c.clone(), - 
bounds: ( - Bound::Included(EncodedKey::from_values(std::slice::from_ref(lo))), - Bound::Included(EncodedKey::from_values(std::slice::from_ref(hi))), - ), - }), - Expr::Like(c, pattern) => { - let prefix = like_prefix(pattern)?; - let start = Bound::Included(EncodedKey::from_values(&[Value::Text( - prefix.clone().into(), - )])); - let end = match next_prefix(&prefix) { - Some(next) => Bound::Excluded(EncodedKey::from_values(&[Value::Text(next.into())])), - None => Bound::Unbounded, - }; - Some(IndexLookup::Range { - column: c.clone(), - bounds: (start, end), - }) - } - _ => None, - } -} - -fn like_prefix(pattern: &str) -> Option { - if !pattern.ends_with('%') { - return None; - } - let mut prefix = String::new(); - for ch in pattern.chars() { - if ch == '%' || ch == '_' { - break; - } - prefix.push(ch); - } - if prefix.is_empty() { - return None; - } - Some(prefix) -} - -fn next_prefix(prefix: &str) -> Option { - let mut bytes = prefix.as_bytes().to_vec(); - for byte_index in (0..bytes.len()).rev() { - if bytes[byte_index] != u8::MAX { - bytes[byte_index] += 1; - bytes.truncate(byte_index + 1); - return String::from_utf8(bytes).ok(); - } - } - None -} - -#[cfg(test)] -mod tests { - use super::execute_query_with_options; - use crate::catalog::Catalog; - use crate::catalog::namespace_key; - use crate::catalog::schema::{ColumnDef, IndexType}; - use crate::catalog::types::{ColumnType, Row, Value}; - use crate::query::error::QueryError; - use crate::query::plan::{Aggregate, Expr, Order, Query, QueryOptions, col, lit}; - use crate::storage::encoded_key::EncodedKey; - use crate::storage::index::extract_index_key_encoded; - use crate::storage::keyspace::{Keyspace, NamespaceId, SecondaryIndex}; - - fn execute_query( - snapshot: &crate::storage::keyspace::KeyspaceSnapshot, - catalog: &Catalog, - project_id: &str, - scope_id: &str, - query: Query, - ) -> Result { - execute_query_with_options( - snapshot, - catalog, - project_id, - scope_id, - query, - &QueryOptions { - 
allow_full_scan: true, - ..QueryOptions::default() - }, - 0, - usize::MAX, - ) - } - - fn setup() -> (Keyspace, Catalog) { - let mut keyspace = Keyspace::default(); - let mut catalog = Catalog::default(); - catalog.create_project("A").expect("project A"); - catalog.create_project("B").expect("project B"); - for p in ["A", "B"] { - catalog - .create_table( - p, - "app", - "users", - vec![ - ColumnDef { - name: "id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "name".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ColumnDef { - name: "age".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "email".into(), - col_type: ColumnType::Text, - nullable: true, - }, - ], - vec!["id".into()], - ) - .expect("table"); - } - for i in 0..100 { - keyspace.upsert_row( - "A", - "app", - "users", - vec![Value::Integer(i)], - Row { - values: vec![ - Value::Integer(i), - Value::Text(format!("u{i}").into()), - Value::Integer(18 + (i % 50)), - if i == 0 { - Value::Null - } else if i % 2 == 0 { - Value::Text(format!("u{i}@gmail.com").into()) - } else { - Value::Text(format!("u{i}@example.com").into()) - }, - ], - }, - i as u64 + 1, - ); - keyspace.upsert_row( - "B", - "app", - "users", - vec![Value::Integer(i)], - Row { - values: vec![ - Value::Integer(i), - Value::Text(format!("b{i}").into()), - Value::Integer(99), - Value::Text(format!("b{i}@other.com").into()), - ], - }, - i as u64 + 10_000, - ); - } - catalog - .create_index( - "A", - "app", - "users", - "by_age", - vec!["age".into()], - IndexType::BTree, - None, - ) - .expect("create index"); - catalog - .create_index( - "A", - "app", - "users", - "by_name", - vec!["name".into()], - IndexType::BTree, - None, - ) - .expect("create name index"); - let schema = catalog - .tables - .get(&(namespace_key("A", "app"), "users".to_string())) - .expect("schema") - .clone(); - let table = keyspace - .table_by_namespace_key_mut(&namespace_key("A", 
"app"), "users") - .expect("table"); - let mut secondary_index = SecondaryIndex::default(); - for (pk, row) in &table.rows { - let age_key = - extract_index_key_encoded(row, &schema, &["age".into()]).expect("age index key"); - secondary_index.insert(age_key, pk.clone()); - } - table.indexes.insert("by_age".into(), secondary_index); - let mut by_name = SecondaryIndex::default(); - for (pk, row) in &table.rows { - let key = - extract_index_key_encoded(row, &schema, &["name".into()]).expect("name index key"); - by_name.insert(key, pk.clone()); - } - table.indexes.insert("by_name".into(), by_name); - (keyspace, catalog) - } - - #[test] - fn query_correctness_suite() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - - let all = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]).from("users"), - ) - .expect("all"); - assert_eq!(all.rows.len(), 100); - - let filtered = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .where_(Expr::Gt("age".into(), Value::Integer(30))), - ) - .expect("filtered"); - assert!( - filtered - .rows - .iter() - .all(|r| matches!(r.values[2], Value::Integer(v) if v > 30)) - ); - - let ordered = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("age", Order::Desc) - .order_by("id", Order::Asc), - ) - .expect("ordered"); - for w in ordered.rows.windows(2) { - assert!(w[0].values[2] >= w[1].values[2]); - } - - let limited = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]).from("users").limit(5), - ) - .expect("limit"); - assert_eq!(limited.rows.len(), 5); - - let counted = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .aggregate(Aggregate::Count), - ) - .expect("count"); - assert_eq!(counted.rows[0].values[0], Value::Integer(100)); - - let grouped = execute_query( - &snapshot, - &catalog, - "A", - "app", 
- Query::select(&["*"]) - .from("users") - .group_by(&["age"]) - .aggregate(Aggregate::Count), - ) - .expect("grouped"); - assert!(!grouped.rows.is_empty()); - - let compound = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]).from("users").where_( - Expr::Gt("age".into(), Value::Integer(30)) - .and(Expr::Like("email".into(), "%@gmail.com".into())), - ), - ) - .expect("compound"); - assert!(compound.rows.iter().all(|r| { - matches!(&r.values[2], Value::Integer(v) if *v > 30) - && matches!(&r.values[3], Value::Text(s) if s.ends_with("@gmail.com")) - })); - - let project_b = execute_query( - &snapshot, - &catalog, - "B", - "app", - Query::select(&["*"]) - .from("users") - .where_(Expr::Eq("age".into(), Value::Integer(99))), - ) - .expect("project B"); - assert_eq!(project_b.rows.len(), 100); - } - - #[test] - fn builder_supports_not_is_not_null_and_like_underscore() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - - let query = Query::select(&["id", "email"]).from("users").where_( - col("email") - .is_not_null() - .and(col("name").like(lit("u_"))) - .and(col("age").gt(lit(20)).not().not()), - ); - let result = execute_query(&snapshot, &catalog, "A", "app", query).expect("query"); - assert!(!result.rows.is_empty()); - assert!( - result - .rows - .iter() - .all(|r| matches!(&r.values[1], Value::Text(_))) - ); - } - - #[test] - fn having_filters_post_aggregation() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["age", "count_star"]) - .from("users") - .group_by(&["age"]) - .aggregate(Aggregate::Count) - .having(Expr::Gt("count_star".into(), Value::Integer(1))), - ) - .expect("having"); - - assert!( - result - .rows - .iter() - .all(|r| matches!(r.values[1], Value::Integer(v) if v > 1)) - ); - } - - #[test] - fn index_backed_range_scan_reduces_examined_rows() { - let (keyspace, catalog) = 
setup(); - let snapshot = keyspace.snapshot(); - - let full = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]).from("users"), - ) - .expect("full"); - let ranged = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]).from("users").where_(Expr::Between( - "age".into(), - Value::Integer(40), - Value::Integer(41), - )), - ) - .expect("range"); - assert!(ranged.rows.len() < full.rows.len()); - assert!(ranged.rows_examined < full.rows_examined); - } - - #[test] - fn primary_key_eq_uses_point_lookup_path() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["id", "name"]) - .from("users") - .where_(Expr::Eq("id".into(), Value::Integer(42))) - .limit(1), - ) - .expect("pk point query"); - - assert_eq!(result.rows.len(), 1); - assert_eq!(result.rows[0].values[0], Value::Integer(42)); - assert_eq!(result.rows_examined, 1); - } - - #[test] - fn primary_key_with_non_pk_eq_falls_back_to_general_path() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["id", "name"]) - .from("users") - .where_( - Expr::Eq("id".into(), Value::Integer(42)) - .and(Expr::Eq("age".into(), Value::Integer(60))), - ) - .limit(1), - ) - .expect("mixed eq query"); - - assert_eq!(result.rows.len(), 1); - assert!(result.rows_examined > 1); - } - - #[test] - fn use_index_hint_selects_async_projection() { - let mut keyspace = Keyspace::default(); - let mut catalog = Catalog::default(); - catalog.create_project("A").expect("project A"); - catalog - .create_table( - "A", - "app", - "users", - vec![ - ColumnDef { - name: "id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "name".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ], - vec!["id".into()], - ) - 
.expect("table"); - keyspace.upsert_row( - "A", - "app", - "users", - vec![Value::Integer(1)], - Row { - values: vec![Value::Integer(1), Value::Text("alice".into())], - }, - 1, - ); - keyspace.insert_async_projection( - NamespaceId::Project(namespace_key("A", "app")), - "users".into(), - "users_view".into(), - crate::storage::keyspace::AsyncProjectionData { - rows: { - let mut rows = im::OrdMap::new(); - rows.insert( - EncodedKey::from_values(&[Value::Integer(9)]), - Row { - values: vec![Value::Integer(9), Value::Text("projection".into())], - }, - ); - rows - }, - materialized_seq: 123, - }, - ); - let snapshot = keyspace.snapshot(); - - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]).from("users").use_index("users_view"), - ) - .expect("hint query"); - - assert_eq!(result.materialized_seq, Some(123)); - assert_eq!(result.rows[0].values[0], Value::Integer(9)); - } - - #[test] - fn in_and_like_prefix_can_use_index_path() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - - let by_in = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]).from("users").where_(Expr::In( - "age".into(), - vec![Value::Integer(40), Value::Integer(41)], - )), - ) - .expect("in"); - assert!( - by_in - .rows - .iter() - .all(|r| { matches!(r.values[2], Value::Integer(40) | Value::Integer(41)) }) - ); - - let by_prefix = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .where_(Expr::Like("name".into(), "u1%".into())), - ) - .expect("prefix like"); - assert!( - by_prefix - .rows - .iter() - .all(|r| matches!(&r.values[1], Value::Text(s) if s.starts_with("u1"))) - ); - } - - #[test] - fn and_or_predicates_compose_index_row_sets() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - - let and_query = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["id", "name", "age"]).from("users").where_( - 
Expr::Eq("age".into(), Value::Integer(40)) - .and(Expr::Like("name".into(), "u2%".into())), - ), - ) - .expect("and query"); - assert!(and_query.rows.iter().all(|r| { - matches!(r.values[2], Value::Integer(40)) - && matches!(&r.values[1], Value::Text(name) if name.starts_with("u2")) - })); - - let or_query = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["id", "name"]) - .from("users") - .where_( - Expr::Eq("name".into(), Value::Text("u1".into())) - .or(Expr::Like("name".into(), "u2%".into())), - ) - .order_by("id", Order::Asc), - ) - .expect("or query"); - assert!(or_query.rows.iter().all(|r| match &r.values[1] { - Value::Text(name) => name == "u1" || name.starts_with("u2"), - _ => false, - })); - assert!(!or_query.rows.is_empty()); - } - - #[test] - fn composite_index_respects_leftmost_prefix_rule() { - let (mut keyspace, mut catalog) = setup(); - catalog - .create_index( - "A", - "app", - "users", - "by_age_name", - vec!["age".into(), "name".into()], - IndexType::BTree, - None, - ) - .expect("composite index"); - let schema = catalog - .tables - .get(&(namespace_key("A", "app"), "users".to_string())) - .expect("schema") - .clone(); - let table = keyspace - .table_by_namespace_key_mut(&namespace_key("A", "app"), "users") - .expect("table"); - let mut by_age_name = SecondaryIndex::default(); - for (pk, row) in &table.rows { - let key = extract_index_key_encoded(row, &schema, &["age".into(), "name".into()]) - .expect("composite key"); - by_age_name.insert(key, pk.clone()); - } - table.indexes.insert("by_age_name".into(), by_age_name); - - let snapshot = keyspace.snapshot(); - - let good = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["id", "name", "age"]) - .from("users") - .where_(Expr::Eq("age".into(), Value::Integer(40))), - ) - .expect("leftmost predicate should use composite index"); - assert!( - good.rows_examined < 100, - "leftmost-prefix query should avoid full scan" - ); - - let bad = execute_query( - 
&snapshot, - &catalog, - "A", - "app", - Query::select(&["id", "name", "age"]) - .from("users") - .where_(Expr::Eq( - "email".into(), - Value::Text("u1@example.com".into()), - )), - ) - .expect("non-leftmost predicate falls back"); - assert!( - bad.rows_examined >= good.rows_examined, - "non-leftmost query should not be better than leftmost" - ); - } - - #[test] - fn partial_index_only_indexes_matching_rows() { - let (mut keyspace, mut catalog) = setup(); - catalog - .create_index( - "A", - "app", - "users", - "adults_only", - vec!["age".into()], - IndexType::BTree, - Some(Expr::Gte("age".into(), Value::Integer(50))), - ) - .expect("partial index"); - let schema = catalog - .tables - .get(&(namespace_key("A", "app"), "users".to_string())) - .expect("schema") - .clone(); - let table = keyspace - .table_by_namespace_key_mut(&namespace_key("A", "app"), "users") - .expect("table"); - let mut adults_only = SecondaryIndex { - partial_filter: Some(Expr::Gte("age".into(), Value::Integer(50))), - ..SecondaryIndex::default() - }; - for (pk, row) in &table.rows { - if adults_only - .should_include_row(row, &schema, "users") - .expect("partial eval") - { - let key = - extract_index_key_encoded(row, &schema, &["age".into()]).expect("index key"); - adults_only.insert(key, pk.clone()); - } - } - table.indexes.insert("adults_only".into(), adults_only); - - let snapshot = keyspace.snapshot(); - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["id", "age"]) - .from("users") - .where_(Expr::Gte("age".into(), Value::Integer(50))), - ) - .expect("partial query"); - assert!(!result.rows.is_empty()); - assert!( - result - .rows - .iter() - .all(|r| matches!(r.values[1], Value::Integer(v) if v >= 50)) - ); - } - - #[test] - fn bounded_scan_is_enforced_when_full_scan_not_allowed() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let err = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - 
Query::select(&["*"]).from("users"), - &QueryOptions::default(), - 1, - 10_000, - ) - .expect_err("should reject full scan"); - assert!(matches!(err, QueryError::InvalidQuery { .. })); - } - - #[test] - fn default_execute_query_rejects_unbounded_full_scan() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let err = super::execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]).from("users"), - ) - .expect_err("default execute_query should reject unbounded full scan"); - assert!(matches!(err, QueryError::InvalidQuery { .. })); - } - - #[test] - fn non_join_page_size_is_capped_by_max_scan_rows() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let result = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("id", Order::Asc) - .limit(50), - &QueryOptions::default(), - 9, - 10, - ) - .expect("bounded page"); - assert_eq!(result.rows.len(), 10); - assert!(result.cursor.is_some()); - assert!(result.rows_examined <= 100); - } - - #[test] - fn join_scan_bound_is_enforced_when_full_scan_not_allowed() { - let (keyspace, mut catalog) = setup(); - catalog - .create_table( - "A", - "app", - "profiles", - vec![ - ColumnDef { - name: "user_id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "country".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ], - vec!["user_id".into()], - ) - .expect("profiles table"); - let mut keyspace = keyspace; - for i in 0..50 { - keyspace.upsert_row( - "A", - "app", - "profiles", - vec![Value::Integer(i)], - Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), - 1, - ); - } - let snapshot = keyspace.snapshot(); - let err = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["u.id", "p.country"]) - .from("users") - .alias("u") - .inner_join("profiles", "u.id", "user_id") - 
.with_last_join_alias("p") - .limit(10), - &QueryOptions::default(), - 1, - 1_000, - ) - .expect_err("join scan bound"); - assert!(matches!(err, QueryError::ScanBoundExceeded { .. })); - } - - #[test] - fn type_mismatch_rejected_at_plan_time() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let err = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .where_(Expr::Gt("age".into(), Value::Text("oops".into()))), - ) - .expect_err("type mismatch"); - assert!(matches!(err, QueryError::TypeMismatch { .. })); - } - - #[test] - fn cursor_pagination_returns_stable_pages() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let mut options = QueryOptions::default(); - let mut all = Vec::new(); - loop { - let page = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("id", Order::Asc) - .limit(10), - &options, - 42, - 10_000, - ) - .expect("page"); - all.extend(page.rows.clone()); - if let Some(cursor) = page.cursor { - options.cursor = Some(cursor); - } else { - break; - } - } - assert_eq!(all.len(), 100); - for (i, row) in all.iter().enumerate().take(100) { - assert_eq!(row.values[0], Value::Integer(i as i64)); - } - } - - #[test] - fn inner_join_returns_matching_rows() { - let (keyspace, mut catalog) = setup(); - catalog - .create_table( - "A", - "app", - "profiles", - vec![ - ColumnDef { - name: "user_id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "country".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ], - vec!["user_id".into()], - ) - .expect("profiles table"); - let mut keyspace = keyspace; - for i in 0..50 { - keyspace.upsert_row( - "A", - "app", - "profiles", - vec![Value::Integer(i)], - Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), - 1, - ); - } - let snapshot = keyspace.snapshot(); - let result = 
execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["u.id", "p.country"]) - .from("users") - .alias("u") - .inner_join("profiles", "u.id", "user_id") - .with_last_join_alias("p") - .limit(100), - ) - .expect("join query"); - assert_eq!(result.rows.len(), 50); - } - - #[test] - fn join_aggregate_count_and_having_are_applied() { - let (mut keyspace, mut catalog) = setup(); - catalog - .create_table( - "A", - "app", - "profiles", - vec![ - ColumnDef { - name: "user_id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "country".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ], - vec!["user_id".into()], - ) - .expect("profiles table"); - for i in 0..50 { - keyspace.upsert_row( - "A", - "app", - "profiles", - vec![Value::Integer(i)], - Row::from_values(vec![ - Value::Integer(i), - Value::Text(if i % 2 == 0 { "US" } else { "CA" }.into()), - ]), - 1, - ); - } - let snapshot = keyspace.snapshot(); - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["p.country", "count_star"]) - .from("users") - .alias("u") - .inner_join("profiles", "u.id", "user_id") - .with_last_join_alias("p") - .group_by(&["p.country"]) - .aggregate(Aggregate::Count) - .having(Expr::Gt("count_star".into(), Value::Integer(20))) - .order_by("count_star", Order::Desc) - .limit(10), - ) - .expect("join aggregate query"); - - assert_eq!(result.rows.len(), 2); - for row in result.rows { - assert!(matches!(row.values[1], Value::Integer(25))); - } - } - - #[test] - fn left_join_supports_global_table_reference() { - let (mut keyspace, mut catalog) = setup(); - catalog.create_project("_global").expect("global project"); - catalog - .create_table( - "_global", - "app", - "users", - vec![ - ColumnDef { - name: "id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "name".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ], - vec!["id".into()], - ) - 
.expect("global users"); - for i in 0..20 { - keyspace.upsert_row( - "_global", - "app", - "users", - vec![Value::Integer(i)], - Row::from_values(vec![Value::Integer(i), Value::Text(format!("g{i}").into())]), - 1, - ); - } - let snapshot = keyspace.snapshot(); - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["u.id", "g.name"]) - .from("users") - .alias("u") - .left_join("_global.users", "u.id", "id") - .with_last_join_alias("g") - .limit(5), - ) - .expect("left join"); - assert_eq!(result.rows.len(), 5); - } - - #[test] - fn invalid_cursor_is_rejected() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let options = QueryOptions { - cursor: Some("xyz".into()), - ..QueryOptions::default() - }; - let err = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("id", Order::Asc), - &options, - 42, - 10_000, - ) - .expect_err("invalid cursor should fail"); - assert!(matches!(err, QueryError::InvalidQuery { .. 
})); - } - - #[test] - fn uppercase_hex_cursor_is_accepted() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let first = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("id", Order::Asc) - .limit(10), - &QueryOptions::default(), - 42, - 10_000, - ) - .expect("first page"); - let cursor = first - .cursor - .expect("first page should include cursor") - .to_ascii_uppercase(); - let options = QueryOptions { - cursor: Some(cursor), - ..QueryOptions::default() - }; - let second = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("id", Order::Asc) - .limit(10), - &options, - 42, - 10_000, - ) - .expect("uppercase cursor should decode"); - assert_eq!(second.rows.len(), 10); - } - - #[test] - fn cursor_snapshot_mismatch_is_rejected() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let first = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("id", Order::Asc) - .limit(10), - &QueryOptions::default(), - 42, - 10_000, - ) - .expect("first page"); - let options = QueryOptions { - cursor: first.cursor, - ..QueryOptions::default() - }; - let err = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("id", Order::Asc) - .limit(10), - &options, - 43, - 10_000, - ) - .expect_err("snapshot mismatch"); - assert!(matches!(err, QueryError::InvalidQuery { .. 
})); - } - - #[test] - fn join_query_supports_cursor_pagination() { - let (mut keyspace, mut catalog) = setup(); - catalog - .create_table( - "A", - "app", - "profiles", - vec![ - ColumnDef { - name: "user_id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "country".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ], - vec!["user_id".into()], - ) - .expect("profiles table"); - for i in 0..50 { - keyspace.upsert_row( - "A", - "app", - "profiles", - vec![Value::Integer(i)], - Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), - 1, - ); - } - let snapshot = keyspace.snapshot(); - - let first = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .alias("u") - .inner_join("profiles", "u.id", "user_id") - .with_last_join_alias("p") - .order_by("u.id", Order::Asc) - .limit(5), - &QueryOptions::default(), - 7, - 10_000, - ) - .expect("first page"); - assert_eq!(first.rows.len(), 5); - assert!(first.cursor.is_some()); - - let options = QueryOptions { - cursor: first.cursor, - ..QueryOptions::default() - }; - let second = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["u.id", "p.country"]) - .from("users") - .alias("u") - .inner_join("profiles", "u.id", "user_id") - .with_last_join_alias("p") - .limit(5), - &options, - 7, - 10_000, - ) - .expect("join cursor page"); - assert_eq!(second.rows.len(), 5); - assert!(second.cursor.is_some()); - - let first_ids: Vec = first.rows.iter().map(|r| r.values[0].clone()).collect(); - let second_ids: Vec = second.rows.iter().map(|r| r.values[0].clone()).collect(); - assert!( - first_ids - .iter() - .all(|id| !second_ids.iter().any(|other| other == id)) - ); - } - - #[test] - fn right_join_includes_unmatched_right_rows_with_nulls() { - let (mut keyspace, mut catalog) = setup(); - catalog - .create_table( - "A", - "app", - "profiles", - vec![ - ColumnDef { - name: 
"user_id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "country".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ], - vec!["user_id".into()], - ) - .expect("profiles table"); - for i in 90..110 { - keyspace.upsert_row( - "A", - "app", - "profiles", - vec![Value::Integer(i)], - Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), - 1, - ); - } - let snapshot = keyspace.snapshot(); - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["u.id", "p.user_id"]) - .from("users") - .alias("u") - .right_join("profiles", "u.id", "user_id") - .with_last_join_alias("p") - .order_by("p.user_id", Order::Asc) - .limit(200), - ) - .expect("right join"); - - assert_eq!(result.rows.len(), 20); - let unmatched = result - .rows - .iter() - .filter(|r| matches!(r.values[0], Value::Null)) - .count(); - assert_eq!(unmatched, 10); - } - - #[test] - fn cross_join_cardinality_and_limit_are_correct() { - let (mut keyspace, mut catalog) = setup(); - catalog - .create_table( - "A", - "app", - "profiles", - vec![ - ColumnDef { - name: "user_id".into(), - col_type: ColumnType::Integer, - nullable: false, - }, - ColumnDef { - name: "country".into(), - col_type: ColumnType::Text, - nullable: false, - }, - ], - vec!["user_id".into()], - ) - .expect("profiles table"); - for i in 0..5 { - keyspace.upsert_row( - "A", - "app", - "profiles", - vec![Value::Integer(i)], - Row::from_values(vec![Value::Integer(i), Value::Text("US".into())]), - 1, - ); - } - let snapshot = keyspace.snapshot(); - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["u.id", "p.user_id"]) - .from("users") - .alias("u") - .cross_join("profiles") - .with_last_join_alias("p") - .limit(123), - ) - .expect("cross join"); - assert_eq!(result.rows.len(), 123); - } - - #[test] - fn descending_cursor_pagination_is_stable() { - let (keyspace, catalog) = setup(); - let snapshot = 
keyspace.snapshot(); - let mut options = QueryOptions::default(); - let mut all = Vec::new(); - loop { - let page = execute_query_with_options( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["*"]) - .from("users") - .order_by("id", Order::Desc) - .limit(11), - &options, - 55, - 10_000, - ) - .expect("page"); - all.extend(page.rows.clone()); - if let Some(cursor) = page.cursor { - options.cursor = Some(cursor); - } else { - break; - } - } - assert_eq!(all.len(), 100); - for w in all.windows(2) { - assert!(w[0].values[0] > w[1].values[0]); - } - } - - #[test] - fn contradictory_pk_equalities_return_empty_result() { - let (keyspace, catalog) = setup(); - let snapshot = keyspace.snapshot(); - let result = execute_query( - &snapshot, - &catalog, - "A", - "app", - Query::select(&["id"]) - .from("users") - .where_( - Expr::Eq("id".into(), Value::Integer(1)) - .and(Expr::Eq("id".into(), Value::Integer(2))), - ) - .limit(10), - ) - .expect("query"); - assert!(result.rows.is_empty()); - } -} diff --git a/src/query/executor/indexing.rs b/src/query/executor/indexing.rs index d2be4b8..915304e 100644 --- a/src/query/executor/indexing.rs +++ b/src/query/executor/indexing.rs @@ -10,11 +10,19 @@ use super::predicate::collect_eq_constraints; type IndexBounds = (Bound, Bound); +#[derive(Clone)] enum IndexLookup { Range { column: String, bounds: IndexBounds }, MultiEq { column: String, values: Vec }, } +#[derive(Debug, Clone)] +pub(super) struct IndexLookupResult { + pub pks: Vec, + pub selected_indexes: Vec, + pub plan_trace: Vec, +} + pub(super) fn indexed_pks_for_predicate( catalog: &Catalog, project_id: &str, @@ -23,28 +31,80 @@ pub(super) fn indexed_pks_for_predicate( table: &crate::storage::keyspace::TableData, predicate: &crate::query::plan::Expr, ) -> Result>, QueryError> { + Ok(indexed_pks_for_predicate_with_trace( + catalog, project_id, scope_id, table_name, table, predicate, + )? 
+ .map(|result| result.pks)) +} + +pub(super) fn indexed_pks_for_predicate_with_trace( + catalog: &Catalog, + project_id: &str, + scope_id: &str, + table_name: &str, + table: &crate::storage::keyspace::TableData, + predicate: &crate::query::plan::Expr, +) -> Result, QueryError> { use crate::query::plan::Expr; match predicate { Expr::And(lhs, rhs) => { - let left = - indexed_pks_for_predicate(catalog, project_id, scope_id, table_name, table, lhs)?; - let right = - indexed_pks_for_predicate(catalog, project_id, scope_id, table_name, table, rhs)?; + let left = indexed_pks_for_predicate_with_trace( + catalog, project_id, scope_id, table_name, table, lhs, + )?; + let right = indexed_pks_for_predicate_with_trace( + catalog, project_id, scope_id, table_name, table, rhs, + )?; return Ok(match (left, right) { - (Some(left), Some(right)) => Some(intersect_pks(left, right)), - (Some(left), None) => Some(left), - (None, Some(right)) => Some(right), + (Some(left), Some(right)) => Some(IndexLookupResult { + pks: intersect_pks(left.pks, right.pks), + selected_indexes: merge_selected_indexes( + left.selected_indexes, + right.selected_indexes, + ), + plan_trace: merge_trace( + "AND predicate combines indexed candidates with intersection", + left.plan_trace, + right.plan_trace, + ), + }), + (Some(left), None) => Some(IndexLookupResult { + plan_trace: merge_trace_single( + "AND predicate uses indexed left side; right side will be residual filter", + left.plan_trace, + ), + ..left + }), + (None, Some(right)) => Some(IndexLookupResult { + plan_trace: merge_trace_single( + "AND predicate uses indexed right side; left side will be residual filter", + right.plan_trace, + ), + ..right + }), (None, None) => None, }); } Expr::Or(lhs, rhs) => { - let left = - indexed_pks_for_predicate(catalog, project_id, scope_id, table_name, table, lhs)?; - let right = - indexed_pks_for_predicate(catalog, project_id, scope_id, table_name, table, rhs)?; + let left = indexed_pks_for_predicate_with_trace( + 
catalog, project_id, scope_id, table_name, table, lhs, + )?; + let right = indexed_pks_for_predicate_with_trace( + catalog, project_id, scope_id, table_name, table, rhs, + )?; return Ok(match (left, right) { - (Some(left), Some(right)) => Some(union_pks(left, right)), + (Some(left), Some(right)) => Some(IndexLookupResult { + pks: union_pks(left.pks, right.pks), + selected_indexes: merge_selected_indexes( + left.selected_indexes, + right.selected_indexes, + ), + plan_trace: merge_trace( + "OR predicate combines indexed candidates with union", + left.plan_trace, + right.plan_trace, + ), + }), _ => None, }); } @@ -112,7 +172,13 @@ pub(super) fn indexed_pks_for_predicate( } else { selected_index.scan_prefix(&encoded) }; - return Ok(Some(pks)); + return Ok(Some(IndexLookupResult { + pks, + selected_indexes: vec![idx_name.clone()], + plan_trace: vec![format!( + "selected composite index '{idx_name}' with leftmost prefix columns={prefix_cols}" + )], + })); }; let column = match &lookup { IndexLookup::Range { column, .. } => column, @@ -145,14 +211,45 @@ pub(super) fn indexed_pks_for_predicate( return Ok(None); }; - let pks = match lookup { + let pks = match lookup.clone() { IndexLookup::Range { bounds, .. } => index.scan_range(bounds.0, bounds.1), IndexLookup::MultiEq { values, .. 
} => values .into_iter() .flat_map(|v| index.scan_eq(&EncodedKey::from_values(&[v]))) .collect(), }; - Ok(Some(pks)) + Ok(Some(IndexLookupResult { + pks, + selected_indexes: vec![index_name.clone()], + plan_trace: vec![format!( + "selected single-column index '{index_name}' for predicate on '{column}'" + )], + })) +} + +fn merge_selected_indexes(left: Vec, right: Vec) -> Vec { + let mut out = Vec::with_capacity(left.len() + right.len()); + for name in left.into_iter().chain(right) { + if !out.contains(&name) { + out.push(name); + } + } + out +} + +fn merge_trace(header: &str, mut left: Vec, right: Vec) -> Vec { + let mut out = Vec::with_capacity(1 + left.len() + right.len()); + out.push(header.to_string()); + out.append(&mut left); + out.extend(right); + out +} + +fn merge_trace_single(header: &str, mut trace: Vec) -> Vec { + let mut out = Vec::with_capacity(1 + trace.len()); + out.push(header.to_string()); + out.append(&mut trace); + out } fn intersect_pks(left: Vec, right: Vec) -> Vec { diff --git a/src/query/executor/mod.rs b/src/query/executor/mod.rs index 05affcd..e864609 100644 --- a/src/query/executor/mod.rs +++ b/src/query/executor/mod.rs @@ -23,7 +23,7 @@ use aggregate::{aggregate_col_idx, aggregate_output_name}; use cursor::{ CursorToken, decode_cursor, encode_cursor, extract_pk_key, extract_sort_key, row_after_cursor, }; -use indexing::indexed_pks_for_predicate; +use indexing::{indexed_pks_for_predicate, indexed_pks_for_predicate_with_trace}; use predicate::extract_primary_key_values; use validate::validate_query; @@ -37,6 +37,13 @@ pub struct QueryResult { pub materialized_seq: Option, } +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) struct AccessPathDiagnostics { + pub selected_indexes: Vec, + pub predicate_evaluation_path: crate::PredicateEvaluationPath, + pub plan_trace: Vec, +} + pub fn execute_query( snapshot: &KeyspaceSnapshot, catalog: &Catalog, @@ -56,6 +63,134 @@ pub fn execute_query( ) } +pub(crate) fn explain_access_path_for_query( + 
snapshot: &KeyspaceSnapshot, + catalog: &Catalog, + project_id: &str, + scope_id: &str, + query: &Query, + options: &QueryOptions, +) -> Result { + if !query.joins.is_empty() { + let mut trace = Vec::new(); + trace.push("join query: predicate evaluation happens after join execution".to_string()); + if query.predicate.is_some() { + trace.push("post-join filter stage evaluates query predicate".to_string()); + } + return Ok(AccessPathDiagnostics { + selected_indexes: Vec::new(), + predicate_evaluation_path: crate::PredicateEvaluationPath::JoinExecution, + plan_trace: trace, + }); + } + + let mut selected_indexes = Vec::new(); + let mut trace = Vec::new(); + let mut predicate_evaluation_path = crate::PredicateEvaluationPath::None; + + let mut effective_options = options.clone(); + if effective_options.async_index.is_none() { + effective_options.async_index = query.use_index.clone(); + } + + if let Some(async_index) = &effective_options.async_index { + selected_indexes.push(async_index.clone()); + trace.push(format!( + "selected async index projection '{async_index}' as row source" + )); + predicate_evaluation_path = crate::PredicateEvaluationPath::AsyncIndexProjection; + if query.predicate.is_some() { + trace.push("query predicate is evaluated as filter on projected rows".to_string()); + } + return Ok(AccessPathDiagnostics { + selected_indexes, + predicate_evaluation_path, + plan_trace: trace, + }); + } + + let table_key = (namespace_key(project_id, scope_id), query.table.clone()); + let schema = catalog + .tables + .get(&table_key) + .ok_or_else(|| QueryError::TableNotFound { + project_id: project_id.to_string(), + table: query.table.clone(), + })?; + let table = snapshot.table(project_id, scope_id, &query.table); + + if let Some(predicate) = query.predicate.as_ref() { + if query.limit != Some(0) + && query.group_by.is_empty() + && query.aggregates.is_empty() + && query.having.is_none() + && query.order_by.is_empty() + && options.cursor.is_none() + && 
extract_primary_key_values(predicate, &schema.primary_key).is_some() + { + trace.push("primary-key equality predicate detected; using direct row lookup".into()); + return Ok(AccessPathDiagnostics { + selected_indexes, + predicate_evaluation_path: crate::PredicateEvaluationPath::PrimaryKeyEqLookup, + plan_trace: trace, + }); + } + + if let Some(table) = table { + if let Some(indexed) = indexed_pks_for_predicate_with_trace( + catalog, + project_id, + scope_id, + &query.table, + table, + predicate, + )? { + if !indexed.selected_indexes.is_empty() { + selected_indexes.extend(indexed.selected_indexes.clone()); + predicate_evaluation_path = + crate::PredicateEvaluationPath::SecondaryIndexLookup; + } else { + predicate_evaluation_path = crate::PredicateEvaluationPath::FullScanFilter; + } + trace.extend(indexed.plan_trace); + if matches!( + predicate_evaluation_path, + crate::PredicateEvaluationPath::FullScanFilter + ) { + trace.push( + "no matching secondary index; evaluating predicate during table scan" + .to_string(), + ); + } else { + trace.push( + "residual predicate is evaluated on rows returned by index lookup" + .to_string(), + ); + } + return Ok(AccessPathDiagnostics { + selected_indexes, + predicate_evaluation_path, + plan_trace: trace, + }); + } + } + + trace.push("predicate not indexable for current schema/index set".to_string()); + return Ok(AccessPathDiagnostics { + selected_indexes, + predicate_evaluation_path: crate::PredicateEvaluationPath::FullScanFilter, + plan_trace: trace, + }); + } + + trace.push("no predicate supplied; full table scan path".to_string()); + Ok(AccessPathDiagnostics { + selected_indexes, + predicate_evaluation_path, + plan_trace: trace, + }) +} + #[allow(clippy::too_many_arguments)] pub fn execute_query_with_options( snapshot: &KeyspaceSnapshot, diff --git a/src/query/executor/validate.rs b/src/query/executor/validate.rs index 431689c..6b237cf 100644 --- a/src/query/executor/validate.rs +++ b/src/query/executor/validate.rs @@ -62,6 
+62,7 @@ fn validate_expr_types( ), ColumnType::Text => matches!(value, Value::Text(_)), ColumnType::Boolean => matches!(value, Value::Boolean(_)), + ColumnType::U8 => matches!(value, Value::U8(_) | Value::Integer(_)), ColumnType::U256 => matches!(value, Value::U256(_)), ColumnType::I256 => matches!(value, Value::I256(_)), ColumnType::Blob => matches!(value, Value::Blob(_)),