diff --git a/rust/crates/cloudsearch-api/src/lib.rs b/rust/crates/cloudsearch-api/src/lib.rs index 4ddd06a..7585708 100644 --- a/rust/crates/cloudsearch-api/src/lib.rs +++ b/rust/crates/cloudsearch-api/src/lib.rs @@ -628,12 +628,35 @@ fn parse_query(value: &Value) -> Result { } fn parse_term_query(value: &Value) -> Result { + use cloudsearch_common::Fuzziness; + let object = value.as_object().ok_or_else(|| { ApiError(CloudSearchError::InvalidSearchRequest( "term query must be a JSON object".to_string(), )) })?; + // Extract optional fuzziness before consuming the object + let fuzziness = object + .get("fuzziness") + .map(|fv| -> Result { + match fv { + Value::String(s) if s.eq_ignore_ascii_case("auto") => Ok(Fuzziness::Auto), + Value::Number(n) if n.is_u64() => { + let n = usize::try_from(n.as_u64().unwrap()).map_err(|_| { + ApiError(CloudSearchError::InvalidSearchRequest( + "fuzziness value is too large".to_string(), + )) + })?; + Ok(Fuzziness::Exact(n)) + } + _ => Err(ApiError(CloudSearchError::InvalidSearchRequest( + "fuzziness must be 'auto' or a non-negative integer".to_string(), + ))), + } + }) + .transpose()?; + if object.contains_key("field") || object.contains_key("value") { let field = object.get("field").and_then(Value::as_str).ok_or_else(|| { ApiError(CloudSearchError::InvalidSearchRequest( @@ -649,6 +672,7 @@ fn parse_term_query(value: &Value) -> Result { return Ok(TermQuery { field: field.to_string(), value, + fuzziness, }); } @@ -662,6 +686,7 @@ fn parse_term_query(value: &Value) -> Result { Ok(TermQuery { field: field.clone(), value: raw_value.clone(), + fuzziness, }) } @@ -1982,6 +2007,7 @@ mod tests { query: Some(SearchQuery::Term(TermQuery { field: "service".to_string(), value: serde_json::json!("billing"), + fuzziness: None, })), ..Default::default() }) @@ -2027,6 +2053,7 @@ mod tests { query: Some(SearchQuery::Term(TermQuery { field: "service".to_string(), value: serde_json::json!("billing"), + fuzziness: None, })), ..Default::default() }) @@ 
-2060,6 +2087,7 @@ mod tests { filter: vec![SearchQuery::Term(TermQuery { field: "level".to_string(), value: serde_json::json!("info"), + fuzziness: None, })], ..Default::default() })), @@ -4083,6 +4111,7 @@ mod tests { query: Some(SearchQuery::Term(TermQuery { field: "level".to_string(), value: serde_json::json!("info"), + fuzziness: None, })), aggs: Some(std::collections::BTreeMap::from([ ( @@ -4447,4 +4476,84 @@ mod tests { let metrics_str = String::from_utf8(metrics_body.to_vec()).expect("metrics to string"); assert!(metrics_str.contains("cloudsearch_merge_total")); } + + #[test] + fn parse_term_query_with_fuzziness_auto() { + use cloudsearch_common::Fuzziness; + let json = serde_json::json!({ + "field": "name", + "value": "admin", + "fuzziness": "auto" + }); + let result = parse_term_query(&json).expect("should parse"); + assert_eq!(result.fuzziness, Some(Fuzziness::Auto)); + } + + #[test] + fn parse_term_query_with_fuzziness_auto_uppercase() { + use cloudsearch_common::Fuzziness; + let json = serde_json::json!({ + "field": "name", + "value": "admin", + "fuzziness": "AUTO" + }); + let result = parse_term_query(&json).expect("should parse"); + assert_eq!(result.fuzziness, Some(Fuzziness::Auto)); + } + + #[test] + fn parse_term_query_with_fuzziness_exact_integer() { + use cloudsearch_common::Fuzziness; + let json = serde_json::json!({ + "field": "name", + "value": "admin", + "fuzziness": 2 + }); + let result = parse_term_query(&json).expect("should parse"); + assert_eq!(result.fuzziness, Some(Fuzziness::Exact(2))); + } + + #[test] + fn parse_term_query_with_fuzziness_zero() { + use cloudsearch_common::Fuzziness; + let json = serde_json::json!({ + "field": "name", + "value": "admin", + "fuzziness": 0 + }); + let result = parse_term_query(&json).expect("should parse"); + assert_eq!(result.fuzziness, Some(Fuzziness::Exact(0))); + } + + #[test] + fn parse_term_query_with_fuzziness_missing() { + let json = serde_json::json!({ + "field": "name", + "value": "admin" + 
}); + let result = parse_term_query(&json).expect("should parse"); + assert_eq!(result.fuzziness, None); + } + + #[test] + fn parse_term_query_with_fuzziness_wrong_type_rejected() { + let json = serde_json::json!({ + "field": "name", + "value": "admin", + "fuzziness": true + }); + let result = parse_term_query(&json); + assert!(result.is_err(), "fuzziness: true should be rejected"); + } + + #[test] + fn parse_term_query_with_fuzziness_unknown_string_rejected() { + let json = serde_json::json!({ + "field": "name", + "value": "admin", + "fuzziness": "unknown" + }); + let result = parse_term_query(&json); + assert!(result.is_err(), "fuzziness: unknown should be rejected"); + } } diff --git a/rust/crates/cloudsearch-api/src/query_string.rs b/rust/crates/cloudsearch-api/src/query_string.rs index f4f81e7..037118f 100644 --- a/rust/crates/cloudsearch-api/src/query_string.rs +++ b/rust/crates/cloudsearch-api/src/query_string.rs @@ -4,7 +4,7 @@ //! into the existing `SearchQuery` AST. use cloudsearch_common::{ - BoolQuery, CloudSearchError, RangeQuery, SearchQuery, TermQuery, WildcardQuery, + BoolQuery, CloudSearchError, Fuzziness, RangeQuery, SearchQuery, TermQuery, WildcardQuery, }; /// Parse a query string into a `SearchQuery`. @@ -157,6 +157,7 @@ impl<'a> Parser<'a> { Ok(SearchQuery::Term(TermQuery { field: "tag".to_string(), value: serde_json::Value::String(word.to_string()), + fuzziness: None, })) } } @@ -245,6 +246,34 @@ impl<'a> Parser<'a> { })); } + // Fuzziness suffix: value~auto or value~N + // NOTE: This fires before wildcard detection (* and ?) below. A value like + // "admin~auto*" will be parsed as a fuzziness query with suffix "auto*" (which + // fails validation) rather than a wildcard query. This is unlikely to affect real + // queries but is a known limitation of the current parse order. 
+ if let Some(with_tilde) = value.strip_suffix('~') { + let (base_value, fuzz_suffix) = with_tilde.split_once('~').unwrap_or((with_tilde, "")); + let fuzziness = if fuzz_suffix.is_empty() { + return Err(CloudSearchError::InvalidSearchRequest( + "fuzziness suffix '~' must be followed by 'auto' or a number".to_string(), + )); + } else if fuzz_suffix.eq_ignore_ascii_case("auto") { + Some(Fuzziness::Auto) + } else if let Ok(dist) = fuzz_suffix.parse::() { + Some(Fuzziness::Exact(dist)) + } else { + return Err(CloudSearchError::InvalidSearchRequest(format!( + "invalid fuzziness suffix '~{fuzz_suffix}' — use '~auto' or '~N'" + ))); + }; + let json_value = Self::parse_value(base_value); + return Ok(SearchQuery::Term(TermQuery { + field: field.to_string(), + value: json_value, + fuzziness, + })); + } + // Wildcard detection: contains * or ? if value.contains('*') || value.contains('?') { return Ok(SearchQuery::Wildcard(WildcardQuery { @@ -258,6 +287,7 @@ impl<'a> Parser<'a> { Ok(SearchQuery::Term(TermQuery { field: field.to_string(), value: json_value, + fuzziness: None, })) } @@ -597,7 +627,8 @@ mod tests { result, SearchQuery::Term(TermQuery { field: "status".to_string(), - value: serde_json::json!("active") + value: serde_json::json!("active"), + fuzziness: None, }) ); } @@ -609,7 +640,8 @@ mod tests { result, SearchQuery::Term(TermQuery { field: "count".to_string(), - value: serde_json::json!(42) + value: serde_json::json!(42), + fuzziness: None, }) ); } @@ -672,7 +704,8 @@ mod tests { result, SearchQuery::Term(TermQuery { field: "message".to_string(), - value: serde_json::json!("hello world") + value: serde_json::json!("hello world"), + fuzziness: None, }) ); } diff --git a/rust/crates/cloudsearch-common/src/lib.rs b/rust/crates/cloudsearch-common/src/lib.rs index 7053292..046d4d5 100644 --- a/rust/crates/cloudsearch-common/src/lib.rs +++ b/rust/crates/cloudsearch-common/src/lib.rs @@ -289,9 +289,20 @@ pub struct PhraseQuery { } #[derive(Debug, Clone, Serialize, 
Deserialize, PartialEq, Eq)] +#[serde(rename_all = "lowercase")] +pub enum Fuzziness { + /// Automatically choose edit distance: 0 for 1-2 chars, 1 for 3-5 chars, 2 for 6+ chars. + Auto, + /// Explicit edit distance threshold. + Exact(usize), +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] pub struct TermQuery { pub field: String, pub value: serde_json::Value, + #[serde(skip_serializing_if = "Option::is_none", default)] + pub fuzziness: Option, } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] diff --git a/rust/crates/cloudsearch-common/tests/round_trip.rs b/rust/crates/cloudsearch-common/tests/round_trip.rs index 058fc8f..086668a 100644 --- a/rust/crates/cloudsearch-common/tests/round_trip.rs +++ b/rust/crates/cloudsearch-common/tests/round_trip.rs @@ -122,6 +122,7 @@ fn test_search_query_term() { round_trip(&SearchQuery::Term(TermQuery { field: "status".to_string(), value: serde_json::json!("active"), + fuzziness: None, })); } @@ -150,15 +151,18 @@ fn test_search_query_bool() { must: vec![SearchQuery::Term(TermQuery { field: "status".to_string(), value: serde_json::json!("active"), + fuzziness: None, })], should: vec![SearchQuery::Term(TermQuery { field: "tag".to_string(), value: serde_json::json!("featured"), + fuzziness: None, })], filter: vec![], must_not: vec![SearchQuery::Term(TermQuery { field: "deleted".to_string(), value: serde_json::json!(true), + fuzziness: None, })], })); } @@ -597,6 +601,7 @@ fn test_search_request_all_fields() { query: Some(SearchQuery::Term(TermQuery { field: "status".to_string(), value: serde_json::json!("active"), + fuzziness: None, })), from: Some(10), size: Some(25), @@ -626,6 +631,7 @@ fn test_term_query_string_value() { round_trip(&TermQuery { field: "name".to_string(), value: serde_json::json!("alice"), + ..Default::default() }); } @@ -634,6 +640,7 @@ fn test_term_query_numeric_value() { round_trip(&TermQuery { field: "count".to_string(), value: serde_json::json!(42), + 
..Default::default() }); } @@ -642,6 +649,7 @@ fn test_term_query_bool_value() { round_trip(&TermQuery { field: "active".to_string(), value: serde_json::json!(true), + ..Default::default() }); } @@ -1065,6 +1073,7 @@ fn test_bool_query_with_should_and_filter() { should: vec![SearchQuery::Term(TermQuery { field: "tag".to_string(), value: serde_json::json!("featured"), + fuzziness: None, })], filter: vec![SearchQuery::Range(RangeQuery { field: "price".to_string(), diff --git a/rust/crates/cloudsearch-index/src/lib.rs b/rust/crates/cloudsearch-index/src/lib.rs index bf23cc7..d56134b 100644 --- a/rust/crates/cloudsearch-index/src/lib.rs +++ b/rust/crates/cloudsearch-index/src/lib.rs @@ -3,10 +3,10 @@ use cloudsearch_common::{ AggregationRequest, AggregationResult, BoolQuery, BulkItem, BulkItemResult, BulkOperation, BulkRequest, BulkResponse, CloudSearchError, CreateIndexRequest, DateHistogramAggregationResult, DateHistogramBucket, DateHistogramInterval, FieldMapping, - FieldType, FlushResponse, HitsMetadata, IndexDocument, IndexMetadata, MappingMode, MatchQuery, - MergeResponse, PhraseQuery, PrefixQuery, RangeQuery, Result, SearchHit, SearchQuery, - SearchRequest, SearchResponse, SortOrder, SortSpec, StatsAggregationResult, - TermsAggregationResult, TermsBucket, TermsQuery, WildcardQuery, + FieldType, FlushResponse, Fuzziness, HitsMetadata, IndexDocument, IndexMetadata, MappingMode, + MatchQuery, MergeResponse, PhraseQuery, PrefixQuery, RangeQuery, Result, SearchHit, + SearchQuery, SearchRequest, SearchResponse, SortOrder, SortSpec, StatsAggregationResult, + TermQuery, TermsAggregationResult, TermsBucket, TermsQuery, WildcardQuery, }; use cloudsearch_storage::{ IndexManifest, SegmentMeta, SegmentSnapshot, SnapshotMetadata, WalManager, WalRecord, @@ -921,6 +921,14 @@ impl IndexHandle { pub fn validate_search_request(&self, request: &SearchRequest) -> Result<()> { if let Some(query) = &request.query { self.validate_query(query)?; + // search_after + fuzzy query is 
invalid because fuzzy matching can change + // which documents match, affecting sort order and making cursors unreliable. + if request.search_after.is_some() && self.query_has_fuzzy_term(query) { + return Err(CloudSearchError::InvalidSearchRequest( + "search_after is not supported with fuzzy queries because match behavior affects sort order" + .to_string(), + )); + } } if let Some(sort) = &request.sort @@ -1589,6 +1597,21 @@ impl IndexHandle { } } + #[allow(clippy::self_only_used_in_recursion)] + fn query_has_fuzzy_term(&self, query: &SearchQuery) -> bool { + match query { + SearchQuery::Term(term) => term.fuzziness.is_some(), + SearchQuery::Bool(boolean) => boolean + .must + .iter() + .chain(boolean.should.iter()) + .chain(boolean.filter.iter()) + .chain(boolean.must_not.iter()) + .any(|q| self.query_has_fuzzy_term(q)), + _ => false, + } + } + fn ensure_scalar_field(&self, field: &str, context: &str) -> Result<()> { if let Some(mapping) = self.metadata.mappings.get(field) && matches!(mapping.field_type, FieldType::Object) @@ -1702,11 +1725,10 @@ fn score_query( ) -> Option { match query { SearchQuery::MatchAll => Some(1.0), - SearchQuery::Term(term) => document - .source - .get(&term.field) - .filter(|value| **value == term.value) - .map(|_| 1.0), + SearchQuery::Term(term) => match fuzzy_term_match(document, term) { + Some(true) => Some(1.0), + _ => None, + }, SearchQuery::Terms(terms) => matches_terms_query(document, terms).then_some(1.0), SearchQuery::Range(range) => matches_range_query(document, range).then_some(1.0), SearchQuery::Bool(bool_query) => { @@ -1837,6 +1859,86 @@ fn tokenize(text: &str) -> Vec { .collect() } +/// Returns whether the stored value fuzzy-matches the query term. +/// +/// # Return Value Semantics +/// - `None` — field is absent from the document, or stored value is not a string. +/// Cannot participate in fuzzy matching. +/// - `Some(true)` — field is a string and matches within the fuzziness threshold. 
+/// - `Some(false)` — field is a string but edit distance exceeds the threshold. +/// The document does not match. +/// +/// When `fuzziness` is `None`, performs exact comparison (handles bool, number, +/// string uniformly). +fn fuzzy_term_match(document: &IndexDocument, term: &TermQuery) -> Option { + let stored = document.source.get(&term.field)?; + + // When no fuzziness, do exact comparison (handles bool, number, string) + if term.fuzziness.is_none() { + return (stored == &term.value).then_some(true); + } + + // Fuzzy matching requires string values + let stored_str = stored.as_str()?; + let query_value = term.value.as_str()?; + + match &term.fuzziness { + // The None branch is unreachable because we already checked is_none() above. + // If this guard logic ever changes, a compilation failure is preferable to a + // silent wrong-answer at runtime — so we use unreachable!() rather than None. + None => unreachable!(), + Some(Fuzziness::Auto) => { + let threshold = match query_value.len() { + 0..=2 => 0, + 3..=5 => 1, + _ => 2, + }; + if threshold == 0 { + (stored_str == query_value).then_some(true) + } else { + Some(levenshtein_distance(stored_str, query_value) <= threshold) + } + } + Some(Fuzziness::Exact(max_dist)) => { + if *max_dist == 0 { + (stored_str == query_value).then_some(true) + } else { + Some(levenshtein_distance(stored_str, query_value) <= *max_dist) + } + } + } +} + +/// Compute the Levenshtein edit distance between two strings. 
/// Compute the Levenshtein edit distance between two strings.
///
/// The distance is the minimum number of single-character insertions, deletions and
/// substitutions needed to turn `a` into `b`. Characters are Unicode scalar values
/// (`char`), so a multi-byte character such as `é` counts as a single edit. The
/// comparison is case-sensitive and performs no Unicode normalization.
///
/// Uses the classic dynamic-programming recurrence with two rolling rows:
/// `O(|a| * |b|)` time and `O(|b|)` extra space, measured in characters.
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let b_chars: Vec<char> = b.chars().collect();
    if b_chars.is_empty() {
        // Turning `a` into the empty string deletes every character of `a`.
        return a.chars().count();
    }

    // `previous[j]` is the distance between the prefix of `a` consumed so far and the
    // first `j` characters of `b`; it starts as the row for the empty prefix of `a`.
    let mut previous: Vec<usize> = (0..=b_chars.len()).collect();
    let mut current = vec![0usize; b_chars.len() + 1];

    // Iterate by character position (not byte offset) so every DP cell is filled
    // even when the input contains multi-byte characters.
    for (i, ca) in a.chars().enumerate() {
        current[0] = i + 1;
        for (j, &cb) in b_chars.iter().enumerate() {
            let cost = usize::from(ca != cb);
            current[j + 1] = (previous[j + 1] + 1) // deletion
                .min(current[j] + 1) // insertion
                .min(previous[j] + cost); // substitution
        }
        std::mem::swap(&mut previous, &mut current);
    }

    // After the final swap `previous` holds the row for all of `a`.
    previous[b_chars.len()]
}
Some(SearchQuery::Term(TermQuery { field: "latency".to_string(), value: serde_json::json!(42), + fuzziness: None, })), ..Default::default() }); @@ -3724,6 +3833,7 @@ mod tests { query: Some(SearchQuery::Term(TermQuery { field: "latency".to_string(), value: serde_json::json!("42"), + fuzziness: None, })), ..Default::default() }); @@ -3942,6 +4052,7 @@ mod tests { must: vec![SearchQuery::Term(TermQuery { field: "service".to_string(), value: serde_json::json!("billing"), + fuzziness: None, })], ..Default::default() })), @@ -3955,10 +4066,12 @@ mod tests { SearchQuery::Term(TermQuery { field: "service".to_string(), value: serde_json::json!("billing"), + fuzziness: None, }), SearchQuery::Term(TermQuery { field: "service".to_string(), value: serde_json::json!("search"), + fuzziness: None, }), ], ..Default::default() @@ -3972,10 +4085,12 @@ mod tests { filter: vec![SearchQuery::Term(TermQuery { field: "service".to_string(), value: serde_json::json!("billing"), + fuzziness: None, })], must_not: vec![SearchQuery::Term(TermQuery { field: "level".to_string(), value: serde_json::json!("error"), + fuzziness: None, })], ..Default::default() })), @@ -4347,6 +4462,7 @@ mod tests { query: Some(SearchQuery::Term(TermQuery { field: "level".to_string(), value: serde_json::json!("info"), + fuzziness: None, })), from: Some(0), size: Some(1), @@ -5457,4 +5573,166 @@ mod tests { let result = tokenize(""); assert!(result.is_empty()); } + + #[test] + fn levenshtein_distance_empty() { + assert_eq!(levenshtein_distance("", ""), 0); + assert_eq!(levenshtein_distance("", "abc"), 3); + assert_eq!(levenshtein_distance("abc", ""), 3); + } + + #[test] + fn levenshtein_distance_identical() { + assert_eq!(levenshtein_distance("hello", "hello"), 0); + assert_eq!(levenshtein_distance("", ""), 0); + } + + #[test] + fn levenshtein_distance_one_edit() { + assert_eq!(levenshtein_distance("hello", "hallo"), 1); // substitution + assert_eq!(levenshtein_distance("hello", "hell"), 1); // deletion + 
assert_eq!(levenshtein_distance("hello", "helloo"), 1); // insertion + } + + #[test] + fn levenshtein_distance_case_sensitive() { + assert_eq!(levenshtein_distance("HELLO", "hello"), 5); // all chars different + assert_eq!(levenshtein_distance("Hello", "hello"), 1); // case only + } + + #[test] + fn levenshtein_distance_complex() { + assert_eq!(levenshtein_distance("kitten", "sitting"), 3); + } + + #[test] + fn fuzzy_term_match_exact_no_fuzziness() { + use cloudsearch_common::TermQuery; + let doc = IndexDocument { + id: "1".to_string(), + source: serde_json::json!({"name": "admin"}), + }; + // No fuzziness - exact match required + let term = TermQuery { + field: "name".to_string(), + value: serde_json::json!("admin"), + fuzziness: None, + }; + let result = fuzzy_term_match(&doc, &term); + assert_eq!( + result, + Some(true), + "exact match should return Some(true), got {result:?}" + ); + + // Non-matching value returns None (no match, same as original behavior) + let term_miss = TermQuery { + field: "name".to_string(), + value: serde_json::json!("xyz"), + fuzziness: None, + }; + let result_miss = fuzzy_term_match(&doc, &term_miss); + assert_eq!( + result_miss, None, + "non-matching value should return None, got {result_miss:?}" + ); + + // Missing field returns None + let term_missing = TermQuery { + field: "nonexistent".to_string(), + value: serde_json::json!("admin"), + fuzziness: None, + }; + let result_missing = fuzzy_term_match(&doc, &term_missing); + assert_eq!( + result_missing, None, + "missing field should return None, got {result_missing:?}" + ); + } + + #[test] + fn fuzzy_term_match_exact_with_fuzziness() { + use cloudsearch_common::{Fuzziness, TermQuery}; + let doc = IndexDocument { + id: "1".to_string(), + source: serde_json::json!({"name": "admin"}), + }; + // Fuzziness::Exact(0) is still exact match + let term = TermQuery { + field: "name".to_string(), + value: serde_json::json!("admin"), + fuzziness: Some(Fuzziness::Exact(0)), + }; + let result = 
fuzzy_term_match(&doc, &term); + assert_eq!( + result, + Some(true), + "exact match with Exact(0) should return Some(true), got {result:?}" + ); + + // Different value with threshold 0 - no match + let term_miss = TermQuery { + field: "name".to_string(), + value: serde_json::json!("xyz"), + fuzziness: Some(Fuzziness::Exact(0)), + }; + let result_miss = fuzzy_term_match(&doc, &term_miss); + assert_eq!( + result_miss, None, + "threshold=0 fuzzy with string mismatch returns None, got {result_miss:?}" + ); + } + + #[test] + fn fuzzy_term_match_auto_mode() { + use cloudsearch_common::{Fuzziness, TermQuery}; + let doc = IndexDocument { + id: "1".to_string(), + source: serde_json::json!({"name": "admin"}), + }; + // "admin" (6 chars) → Auto threshold = 2, exact match passes + let term = TermQuery { + field: "name".to_string(), + value: serde_json::json!("admin"), + fuzziness: Some(Fuzziness::Auto), + }; + assert_eq!(fuzzy_term_match(&doc, &term), Some(true)); + + // Edit distance 1 (admim vs admin) → should match with threshold 2 + let term_fuzzy = TermQuery { + field: "name".to_string(), + value: serde_json::json!("admim"), + fuzziness: Some(Fuzziness::Auto), + }; + assert_eq!(fuzzy_term_match(&doc, &term_fuzzy), Some(true)); + + // Edit distance 5 > threshold 2 → returns Some(false) + let term_no_match = TermQuery { + field: "name".to_string(), + value: serde_json::json!("xyz"), + fuzziness: Some(Fuzziness::Auto), + }; + let result_no_match = fuzzy_term_match(&doc, &term_no_match); + assert_eq!( + result_no_match, + Some(false), + "edit distance 5 > threshold 2 should return Some(false), got {result_no_match:?}" + ); + } + + #[test] + fn fuzzy_term_match_numeric_stored_value() { + use cloudsearch_common::{Fuzziness, TermQuery}; + let doc = IndexDocument { + id: "1".to_string(), + source: serde_json::json!({"count": 42}), + }; + // Numeric stored value with fuzziness - should return None (fuzzy only works with strings) + let term = TermQuery { + field: 
"count".to_string(), + value: serde_json::json!(42), + fuzziness: Some(Fuzziness::Auto), + }; + assert_eq!(fuzzy_term_match(&doc, &term), None); // fuzzy requires string + } } diff --git a/rust/crates/cloudsearch-index/tests/coverage.rs b/rust/crates/cloudsearch-index/tests/coverage.rs index f57d8dc..01fb660 100644 --- a/rust/crates/cloudsearch-index/tests/coverage.rs +++ b/rust/crates/cloudsearch-index/tests/coverage.rs @@ -3,8 +3,8 @@ //! Run with: cargo test -p cloudsearch-index --test coverage use cloudsearch_common::{ - BoolQuery, CreateIndexRequest, IndexDocument, IndexSettings, MatchQuery, SearchQuery, - SearchRequest, SortOrder, SortSpec, TermQuery, + BoolQuery, CreateIndexRequest, Fuzziness, IndexDocument, IndexSettings, MatchQuery, + SearchQuery, SearchRequest, SortOrder, SortSpec, TermQuery, }; use cloudsearch_index::{IndexCatalog, MergePlan}; use cloudsearch_storage::SegmentMeta; @@ -111,6 +111,7 @@ async fn validate_search_request_rejects_nested_bool_with_object_field() { must: vec![SearchQuery::Term(TermQuery { field: "meta".to_string(), value: serde_json::json!("value"), + fuzziness: None, })], should: vec![], filter: vec![], @@ -222,6 +223,45 @@ async fn validate_search_request_rejects_search_after_without_sort() { ); } +#[tokio::test] +async fn validate_search_request_rejects_fuzzy_with_search_after() { + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + let _metadata = catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let handle = catalog.open_index("test").await.expect("open index"); + + // Fuzzy query with search_after is invalid — fuzzy matching affects sort order + let request = SearchRequest { + query: Some(SearchQuery::Term(TermQuery { + field: "name".to_string(), + value: serde_json::json!("admin"), + fuzziness: 
Some(Fuzziness::Auto), + })), + search_after: Some(vec![serde_json::json!(1.0), serde_json::json!("doc123")]), + sort: Some(SortSpec { + field: "name".to_string(), + order: SortOrder::Asc, + }), + ..Default::default() + }; + + let result = handle.validate_search_request(&request); + assert!( + result.is_err(), + "fuzzy query with search_after should be rejected" + ); +} + #[tokio::test] async fn highlight_positions_case_insensitive() { // Index doc with mixed-case text, search for lowercase term. @@ -391,3 +431,128 @@ async fn highlight_positions_empty_field() { "no highlight for empty text field" ); } + +#[tokio::test] +async fn fuzzy_match_exact_edit_distance_within_threshold() { + // Index doc with "name": "admin", search with "admim" (edit distance 1) and fuzziness=1 + // Should match since edit distance <= threshold + use cloudsearch_common::{Fuzziness, TermQuery}; + + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"name": "admin"}))) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + + // Edit distance 1 — should match with fuzziness=1 + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Term(TermQuery { + field: "name".to_string(), + value: serde_json::json!("admim"), + fuzziness: Some(Fuzziness::Exact(1)), + })), + ..Default::default() + }); + assert_eq!( + result.hits.total, 1, + "edit distance 1 should match with fuzziness=1" + ); + assert_eq!(result.hits.hits[0].id, "1"); +} + +#[tokio::test] +async fn fuzzy_match_no_match_when_exceeding_threshold() { + // Index doc with "name": "admin", search with 
"xyz" (edit distance 5) and fuzziness=1 + // Should NOT match since edit distance > threshold + use cloudsearch_common::{Fuzziness, TermQuery}; + + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"name": "admin"}))) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Term(TermQuery { + field: "name".to_string(), + value: serde_json::json!("xyz"), + fuzziness: Some(Fuzziness::Exact(1)), + })), + ..Default::default() + }); + assert_eq!( + result.hits.total, 0, + "edit distance 5 should NOT match with fuzziness=1" + ); +} + +#[tokio::test] +async fn fuzzy_match_auto_mode_threshold_2_for_long_terms() { + // Index doc with "name": "admin" (6 chars), use Auto fuzziness + // Auto threshold for 6+ chars is 2, so "admim" (edit distance 1) should match + use cloudsearch_common::{Fuzziness, TermQuery}; + + let temp_dir = TempDir::new().expect("temp dir"); + let catalog = Arc::new(IndexCatalog::new(temp_dir.path())); + catalog.initialize().await.expect("init catalog"); + catalog + .create_index( + "test", + CreateIndexRequest { + settings: IndexSettings::default(), + ..Default::default() + }, + ) + .await + .expect("create index"); + let mut handle = catalog.open_index("test").await.expect("open index"); + + handle + .index_document(doc("1", serde_json::json!({"name": "admin"}))) + .await + .expect("index"); + handle.refresh().await.expect("refresh"); + + let result = handle.search(&SearchRequest { + query: Some(SearchQuery::Term(TermQuery { + field: "name".to_string(), + 
value: serde_json::json!("admim"), + fuzziness: Some(Fuzziness::Auto), + })), + ..Default::default() + }); + assert_eq!( + result.hits.total, 1, + "Auto fuzziness (threshold=2) should match edit distance 1" + ); +}