From 23a4802e31fc254d9a0396976721eec91e990768 Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 5 Apr 2026 22:57:16 +0800 Subject: [PATCH 1/2] feat: add BM25 scoring functionality with configurable parameters - Add bm25 dependency with parallelism feature enabled - Introduce comprehensive BM25 scoring module with per-field weighting support - Implement configurable BM25 parameters (k1, b, avgdl) for fine-tuning - Replace hardcoded BM25 parameters with Bm25Params struct - Move stopwords and keyword extraction to shared bm25 module - Add FieldWeights struct for different field importance (title, summary, content) - Implement BM25 engine with fit-to-corpus functionality for accurate scoring - Update RelevanceScorer to use new Bm25Params configuration The new BM25 implementation provides: - Per-field weighting for more accurate document scoring - Configurable length normalization and term frequency saturation - IDF caching for efficient scoring operations - Query expansion support for enhanced search capabilities --- Cargo.toml | 3 + src/retrieval/content/scorer.rs | 165 +--------- src/retrieval/search/bm25.rs | 534 ++++++++++++++++++++++++++++++++ src/retrieval/search/mod.rs | 5 + src/retrieval/search/scorer.rs | 56 +--- 5 files changed, 558 insertions(+), 205 deletions(-) create mode 100644 src/retrieval/search/bm25.rs diff --git a/Cargo.toml b/Cargo.toml index f063cfa9..fc0f24dd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -80,6 +80,9 @@ roxmltree = "0.20" # Random number generation (for sampling) rand = "0.8" +# BM25 scoring +bm25 = { version = "2.3.2", features = ["parallelism"] } + [dev-dependencies] tempfile = "3.10" tokio-test = "0.4" diff --git a/src/retrieval/content/scorer.rs b/src/retrieval/content/scorer.rs index 37bde7c9..d9d0dcbe 100644 --- a/src/retrieval/content/scorer.rs +++ b/src/retrieval/content/scorer.rs @@ -9,6 +9,7 @@ use std::collections::HashMap; use crate::document::NodeId; +use crate::retrieval::search::{extract_keywords, 
Bm25Params, STOPWORDS}; use crate::util::estimate_tokens; use super::config::ScoringStrategyConfig; @@ -130,8 +131,7 @@ pub struct RelevanceScorer { /// Scoring strategy to use. strategy: ScoringStrategyConfig, /// BM25 parameters. - k1: f32, - b: f32, + params: Bm25Params, } impl RelevanceScorer { @@ -142,8 +142,7 @@ impl RelevanceScorer { Self { query_keywords, strategy, - k1: 1.2, - b: 0.75, + params: Bm25Params::default(), } } @@ -153,8 +152,7 @@ impl RelevanceScorer { Self { query_keywords: keywords, strategy, - k1: 1.2, - b: 0.75, + params: Bm25Params::default(), } } @@ -240,13 +238,15 @@ impl RelevanceScorer { continue; } - // IDF calculation + // IDF calculation using the smoothed (Lucene-style) form ln(1 + (N - df + 0.5) / (df + 0.5)) let df = ctx.doc_freq.get(&term_lower).copied().unwrap_or(1) as f32; let idf = ((ctx.doc_count as f32 - df + 0.5) / (df + 0.5) + 1.0).ln(); // BM25 formula - let numerator = tf * (self.k1 + 1.0); - let denominator = tf + self.k1 * (1.0 - self.b + self.b * doc_len / ctx.avg_doc_len); + let k1 = self.params.k1; + let b = self.params.b; + let numerator = tf * (k1 + 1.0); + let denominator = tf + k1 * (1.0 - b + b * doc_len / ctx.avg_doc_len); score += idf * numerator / denominator; } @@ -263,145 +263,6 @@ impl RelevanceScorer { } } -/// Extract keywords from a query string. 
-fn extract_keywords(query: &str) -> Vec { - // Common English stop words - const STOPWORDS: &[&str] = &[ - "a", - "an", - "the", - "is", - "are", - "was", - "were", - "be", - "been", - "being", - "have", - "has", - "had", - "do", - "does", - "did", - "will", - "would", - "could", - "should", - "may", - "might", - "must", - "shall", - "can", - "need", - "dare", - "ought", - "used", - "to", - "of", - "in", - "for", - "on", - "with", - "at", - "by", - "from", - "as", - "into", - "through", - "during", - "before", - "after", - "above", - "below", - "between", - "under", - "again", - "further", - "then", - "once", - "here", - "there", - "when", - "where", - "why", - "how", - "all", - "each", - "few", - "more", - "most", - "other", - "some", - "such", - "no", - "nor", - "not", - "only", - "own", - "same", - "so", - "than", - "too", - "very", - "just", - "and", - "but", - "if", - "or", - "because", - "until", - "while", - "about", - "what", - "which", - "who", - "whom", - "this", - "that", - "these", - "those", - "i", - "me", - "my", - "myself", - "we", - "our", - "ours", - "ourselves", - "you", - "your", - "yours", - "yourself", - "yourselves", - "he", - "him", - "his", - "himself", - "she", - "her", - "hers", - "herself", - "it", - "its", - "itself", - "they", - "them", - "their", - "theirs", - "themselves", - ]; - - query - .to_lowercase() - .split(|c: char| !c.is_alphanumeric()) - .filter(|s| { - let s = *s; - !s.is_empty() && s.len() > 1 && !STOPWORDS.contains(&s) - }) - .map(String::from) - .collect() -} - /// Compute information density of content. 
fn compute_density(content: &str) -> f32 { let words: Vec<&str> = content.split_whitespace().collect(); @@ -409,13 +270,7 @@ fn compute_density(content: &str) -> f32 { return 0.0; } - // Stopword ratio (lower is better) - const STOPWORDS: &[&str] = &[ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", - "do", "does", "did", "will", "would", "could", "should", "may", "might", "must", "shall", - "can", "to", "of", "in", "for", "on", "with", "at", "by", "from", "and", "but", "or", "as", - ]; - + // Use shared STOPWORDS from bm25 module let stopword_count = words .iter() .filter(|w| STOPWORDS.contains(&w.to_lowercase().as_str())) diff --git a/src/retrieval/search/bm25.rs b/src/retrieval/search/bm25.rs new file mode 100644 index 00000000..26c311f5 --- /dev/null +++ b/src/retrieval/search/bm25.rs @@ -0,0 +1,534 @@ +// Copyright (c) 2026 vectorless developers +// SPDX-License-Identifier: Apache-2.0 + +//! BM25 scoring module using the `bm25` crate. +//! +//! This module provides: +//! - Per-field weighting for document scoring +//! - Configurable length normalization +//! - IDF caching for efficient scoring +//! - Query expansion support + +use std::collections::HashMap; + +use bm25::{ + Embedder, EmbedderBuilder, Embedding, Language, Scorer, ScoredDocument, + DefaultTokenizer, Tokenizer, +}; + +/// Field weights for BM25 scoring. +/// +/// Different document fields can have different importance. +/// For example, title matches are typically more important than content matches. +#[derive(Debug, Clone, Copy)] +pub struct FieldWeights { + /// Weight for title field matches. + pub title: f32, + /// Weight for summary field matches. + pub summary: f32, + /// Weight for content field matches. + pub content: f32, +} + +impl Default for FieldWeights { + fn default() -> Self { + Self { + title: 2.0, + summary: 1.5, + content: 1.0, + } + } +} + +/// BM25 parameters for fine-tuning. 
+#[derive(Debug, Clone, Copy)] +pub struct Bm25Params { + /// Term frequency saturation parameter (k1). + /// Controls how quickly term frequency saturates. + /// Typical value: 1.2 + pub k1: f32, + /// Length normalization parameter (b). + /// Controls how much document length affects scoring. + /// - 0.0: No length normalization + /// - 1.0: Full length normalization + /// Typical value: 0.75 + pub b: f32, + /// Average document length. + /// If not known, can be estimated or set to 1.0 with b=0. + pub avgdl: f32, +} + +impl Default for Bm25Params { + fn default() -> Self { + Self { + k1: 1.2, + b: 0.75, + avgdl: 100.0, + } + } +} + +/// A document with multiple fields for scoring. +#[derive(Debug, Clone)] +pub struct FieldDocument { + /// Document identifier. + pub id: K, + /// Title field. + pub title: String, + /// Summary field. + pub summary: String, + /// Content field. + pub content: String, +} + +impl FieldDocument { + /// Create a new field document. + pub fn new(id: K, title: String, summary: String, content: String) -> Self { + Self { id, title, summary, content } + } + + /// Get combined text for embedding. + fn combined_text(&self) -> String { + format!("{} {} {}", self.title, self.summary, self.content) + } +} + +/// Key for field-specific document storage. +#[derive(Debug, Clone, Hash, Eq, PartialEq)] +struct FieldKey { + doc_id: K, + field: Field, +} + +#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)] +enum Field { + Title, + Summary, + Content, +} + +/// BM25 engine with per-field weighting support. +/// +/// This wraps the `bm25` crate's Embedder and Scorer to provide: +/// - Per-field weighting +/// - Configurable parameters +/// - IDF caching (handled internally by Scorer) +pub struct Bm25Engine { + /// The embedder for creating sparse vectors. + embedder: Embedder, + /// The scorer for scoring documents (combined text). + scorer: Scorer, + /// Field-specific scorers for weighted scoring. 
+ title_scorer: Scorer, + summary_scorer: Scorer, + content_scorer: Scorer, + /// Field weights. + weights: FieldWeights, + /// Document count. + doc_count: usize, + /// Whether the engine has been fitted to a corpus. + fitted: bool, +} + +impl Bm25Engine { + /// Create a new BM25 engine with default parameters. + pub fn new() -> Self { + Self::with_params(Bm25Params::default()) + } + + /// Create a BM25 engine with custom parameters. + pub fn with_params(params: Bm25Params) -> Self { + let embedder = EmbedderBuilder::with_avgdl(params.avgdl) + .k1(params.k1) + .b(params.b) + .language_mode(Language::English) + .build(); + + Self { + embedder, + scorer: Scorer::new(), + title_scorer: Scorer::new(), + summary_scorer: Scorer::new(), + content_scorer: Scorer::new(), + weights: FieldWeights::default(), + doc_count: 0, + fitted: false, + } + } + + /// Create a BM25 engine fitted to a corpus. + /// + /// This calculates the true average document length from the corpus. + pub fn fit_to_corpus(documents: &[FieldDocument]) -> Self { + // Collect owned strings first + let corpus: Vec = documents.iter() + .map(|d| d.combined_text()) + .collect(); + let corpus_refs: Vec<&str> = corpus.iter().map(|s| s.as_str()).collect(); + + let embedder = EmbedderBuilder::with_fit_to_corpus(Language::English, &corpus_refs) + .build(); + + let mut engine = Self { + embedder, + scorer: Scorer::new(), + title_scorer: Scorer::new(), + summary_scorer: Scorer::new(), + content_scorer: Scorer::new(), + weights: FieldWeights::default(), + doc_count: 0, + fitted: true, + }; + + // Index all documents + for doc in documents { + engine.upsert(doc); + } + + engine + } + + /// Set field weights. + pub fn with_weights(mut self, weights: FieldWeights) -> Self { + self.weights = weights; + self + } + + /// Set language for tokenization. 
+ pub fn with_language(mut self, language: Language) -> Self { + self.embedder = EmbedderBuilder::with_avgdl(self.embedder.avgdl()) + .language_mode(language) + .build(); + self + } + + /// Get the average document length. + pub fn avgdl(&self) -> f32 { + self.embedder.avgdl() + } + + /// Check if the engine has been fitted to a corpus. + pub fn is_fitted(&self) -> bool { + self.fitted + } + + /// Upsert a document into the index. + /// + /// This stores embeddings for each field separately for weighted scoring. + pub fn upsert(&mut self, document: &FieldDocument) { + let id = &document.id; + + // Embed and store each field separately + let title_emb = self.embedder.embed(&document.title); + let summary_emb = self.embedder.embed(&document.summary); + let content_emb = self.embedder.embed(&document.content); + + self.title_scorer.upsert(id, title_emb); + self.summary_scorer.upsert(id, summary_emb); + self.content_scorer.upsert(id, content_emb); + + // Also store combined embedding for basic search + let combined = self.embedder.embed(&document.combined_text()); + self.scorer.upsert(id, combined); + + self.doc_count += 1; + } + + /// Remove a document from the index. + pub fn remove(&mut self, id: &K) { + self.scorer.remove(id); + self.title_scorer.remove(id); + self.summary_scorer.remove(id); + self.content_scorer.remove(id); + self.doc_count = self.doc_count.saturating_sub(1); + } + + /// Get the number of indexed documents. + pub fn len(&self) -> usize { + self.doc_count + } + + /// Check if the index is empty. + pub fn is_empty(&self) -> bool { + self.doc_count == 0 + } + + /// Score a single document against a query. + /// + /// Returns None if the document is not in the index. 
+ pub fn score(&self, id: &K, query: &str) -> Option<f32> { + let query_emb = self.embedder.embed(query); + + // Score each field + let title_score = self.title_scorer.score(id, &query_emb)?; + let summary_score = self.summary_scorer.score(id, &query_emb)?; + let content_score = self.content_scorer.score(id, &query_emb)?; + + // Weighted combination + let total_weight = self.weights.title + self.weights.summary + self.weights.content; + let weighted_score = (title_score * self.weights.title + + summary_score * self.weights.summary + + content_score * self.weights.content) / total_weight; + + Some(weighted_score) + } + + /// Search for documents matching a query. + /// + /// Returns documents sorted by score (descending). + pub fn search(&self, query: &str, limit: usize) -> Vec<ScoredDocument<K>> { + let query_emb = self.embedder.embed(query); + self.scorer.matches(&query_emb).into_iter().take(limit).collect() + } + + /// Search with per-field weighting. + /// + /// This is slower but provides more accurate weighted scores. 
+ pub fn search_weighted(&self, query: &str, limit: usize) -> Vec<(K, f32)> { + let query_emb = self.embedder.embed(query); + + // Get all document IDs from the main scorer + let all_results = self.scorer.matches(&query_emb); + + let mut scored: Vec<(K, f32)> = all_results + .into_iter() + .filter_map(|scored_doc| { + let id = scored_doc.id; + + // Get per-field scores + let title_score = self.title_scorer.score(&id, &query_emb)?; + let summary_score = self.summary_scorer.score(&id, &query_emb)?; + let content_score = self.content_scorer.score(&id, &query_emb)?; + + let total_weight = self.weights.title + self.weights.summary + self.weights.content; + let weighted_score = (title_score * self.weights.title + + summary_score * self.weights.summary + + content_score * self.weights.content) / total_weight; + + Some((id, weighted_score)) + }) + .collect(); + + scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + scored.truncate(limit); + scored + } + + /// Extract keywords from a query (tokenize and filter). + pub fn tokenize(&self, text: &str) -> Vec { + let tokenizer = DefaultTokenizer::builder() + .language_mode(Language::English) + .normalization(true) + .stopwords(true) + .stemming(true) + .build(); + tokenizer.tokenize(text) + } + + /// Get the underlying embedder. + pub fn embedder(&self) -> &Embedder { + &self.embedder + } + + /// Get mutable access to the embedder. + pub fn embedder_mut(&mut self) -> &mut Embedder { + &mut self.embedder + } +} + +impl Default for Bm25Engine { + fn default() -> Self { + Self::new() + } +} + +/// Query expansion result from LLM. +#[derive(Debug, Clone)] +pub struct ExpandedQuery { + /// Original query. + pub original: String, + /// Expanded terms. + pub expansions: Vec, + /// Combined query (original + expansions). + pub combined: String, +} + +impl ExpandedQuery { + /// Create a new expanded query. 
+ pub fn new(original: String, expansions: Vec<String>) -> Self { + let combined = format!("{} {}", original, expansions.join(" ")); + Self { original, expansions, combined } + } +} + +/// Query expander trait for LLM-based expansion. +#[async_trait::async_trait] +pub trait QueryExpander: Send + Sync { + /// Expand a query with related terms. + async fn expand(&self, query: &str) -> ExpandedQuery; +} + +/// Common English stop words for keyword filtering. +pub const STOPWORDS: &[&str] = &[ + "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", + "have", "has", "had", "do", "does", "did", "will", "would", "could", + "should", "may", "might", "must", "shall", "can", "need", "dare", + "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", + "from", "as", "into", "through", "during", "before", "after", "above", + "below", "between", "under", "again", "further", "then", "once", + "here", "there", "when", "where", "why", "how", "all", "each", "few", + "more", "most", "other", "some", "such", "no", "nor", "not", "only", + "own", "same", "so", "than", "too", "very", "just", "and", "but", + "if", "or", "because", "until", "while", "about", "what", "which", + "who", "whom", "this", "that", "these", "those", "i", "me", "my", + "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", + "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", + "hers", "herself", "it", "its", "itself", "they", "them", "their", + "theirs", "themselves", +]; + +/// Extract keywords from a query string, filtering stop words. 
+/// +/// This is a simple keyword extraction that: +/// - Converts to lowercase +/// - Splits on non-alphanumeric characters +/// - Filters out stop words +/// - Requires a minimum length of 2 bytes (single ASCII characters are dropped) +#[must_use] +pub fn extract_keywords(query: &str) -> Vec<String> { + query + .to_lowercase() + .split(|c: char| !c.is_alphanumeric()) + .filter(|s| { + let s = *s; + !s.is_empty() && s.len() > 1 && !STOPWORDS.contains(&s) + }) + .map(String::from) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_bm25_engine_creation() { + let engine: Bm25Engine = Bm25Engine::new(); + assert!(engine.is_empty()); + assert!(!engine.is_fitted()); + } + + #[test] + fn test_bm25_engine_fit_to_corpus() { + let docs = vec![ + FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language.".to_string()), + FieldDocument::new(2u32, "Python Guide".to_string(), "About Python".to_string(), "Python is a scripting language.".to_string()), + ]; + + let engine = Bm25Engine::fit_to_corpus(&docs); + assert!(engine.is_fitted()); + assert_eq!(engine.len(), 2); + } + + #[test] + fn test_bm25_search() { + let docs = vec![ + FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language with memory safety.".to_string()), + FieldDocument::new(2u32, "Python Guide".to_string(), "About Python".to_string(), "Python is a scripting language for data science.".to_string()), + FieldDocument::new(3u32, "Rust Memory Safety".to_string(), "Memory in Rust".to_string(), "Rust provides guaranteed memory safety without garbage collection.".to_string()), + ]; + + let engine = Bm25Engine::fit_to_corpus(&docs); + let results = engine.search("rust memory", 10); + + assert!(!results.is_empty()); + // Documents about Rust should rank higher + assert!(results.iter().any(|r| r.id == 1 || r.id == 3)); + } + + #[test] + fn test_bm25_weighted_search() { + let docs = vec![ + 
FieldDocument::new(1u32, "Rust Programming".to_string(), "About memory safety".to_string(), "Content about other things.".to_string()), + FieldDocument::new(2u32, "Other Language".to_string(), "About other things".to_string(), "Rust memory safety is important.".to_string()), + ]; + + let engine = Bm25Engine::fit_to_corpus(&docs) + .with_weights(FieldWeights { + title: 3.0, + summary: 2.0, + content: 1.0, + }); + + let results = engine.search_weighted("rust", 10); + + // Doc 1 has "Rust" in title, should rank higher + assert_eq!(results.first().map(|(id, _)| *id), Some(1u32)); + } + + #[test] + fn test_bm25_score() { + let docs = vec![ + FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language.".to_string()), + ]; + + let engine = Bm25Engine::fit_to_corpus(&docs); + let score = engine.score(&1u32, "rust programming"); + + assert!(score.is_some()); + assert!(score.unwrap() > 0.0); + } + + #[test] + fn test_bm25_tokenize() { + let engine: Bm25Engine = Bm25Engine::new(); + let tokens = engine.tokenize("What is the Rust programming language?"); + + // Should filter stop words and stem + assert!(tokens.contains(&"rust".to_string())); + assert!(tokens.contains(&"program".to_string())); // stemmed + assert!(!tokens.contains(&"what".to_string())); // stop word + assert!(!tokens.contains(&"the".to_string())); // stop word + } + + #[test] + fn test_bm25_remove() { + let docs = vec![ + FieldDocument::new(1u32, "Rust".to_string(), "About Rust".to_string(), "Rust content.".to_string()), + ]; + + let mut engine = Bm25Engine::fit_to_corpus(&docs); + assert_eq!(engine.len(), 1); + + engine.remove(&1u32); + assert!(engine.is_empty()); + } + + #[test] + fn test_field_weights_default() { + let weights = FieldWeights::default(); + assert!((weights.title - 2.0).abs() < f32::EPSILON); + assert!((weights.summary - 1.5).abs() < f32::EPSILON); + assert!((weights.content - 1.0).abs() < f32::EPSILON); + } + + #[test] + fn 
test_bm25_params_default() { + let params = Bm25Params::default(); + assert!((params.k1 - 1.2).abs() < f32::EPSILON); + assert!((params.b - 0.75).abs() < f32::EPSILON); + assert!((params.avgdl - 100.0).abs() < f32::EPSILON); + } + + #[test] + fn test_expanded_query() { + let expanded = ExpandedQuery::new( + "rust".to_string(), + vec!["programming".to_string(), "language".to_string()], + ); + + assert_eq!(expanded.original, "rust"); + assert_eq!(expanded.expansions.len(), 2); + assert_eq!(expanded.combined, "rust programming language"); + } +} diff --git a/src/retrieval/search/mod.rs b/src/retrieval/search/mod.rs index dca0e04e..3b00655d 100644 --- a/src/retrieval/search/mod.rs +++ b/src/retrieval/search/mod.rs @@ -4,12 +4,17 @@ //! Search algorithms for tree traversal. mod beam; +mod bm25; mod greedy; mod mcts; mod scorer; mod r#trait; pub use beam::BeamSearch; +pub use bm25::{ + extract_keywords, Bm25Engine, Bm25Params, ExpandedQuery, FieldDocument, FieldWeights, + QueryExpander, STOPWORDS, +}; pub use greedy::GreedySearch; pub use mcts::MctsSearch; pub use scorer::{NodeScorer, ScoringContext}; diff --git a/src/retrieval/search/scorer.rs b/src/retrieval/search/scorer.rs index 72080a6b..f17bf118 100644 --- a/src/retrieval/search/scorer.rs +++ b/src/retrieval/search/scorer.rs @@ -10,37 +10,10 @@ use std::collections::HashMap; use crate::document::{DocumentTree, NodeId}; -/// Common English stop words for keyword filtering. 
-const STOPWORDS: &[&str] = &[ - "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", - "have", "has", "had", "do", "does", "did", "will", "would", "could", - "should", "may", "might", "must", "shall", "can", "need", "dare", - "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by", - "from", "as", "into", "through", "during", "before", "after", "above", - "below", "between", "under", "again", "further", "then", "once", - "here", "there", "when", "where", "why", "how", "all", "each", "few", - "more", "most", "other", "some", "such", "no", "nor", "not", "only", - "own", "same", "so", "than", "too", "very", "just", "and", "but", - "if", "or", "because", "until", "while", "about", "what", "which", - "who", "whom", "this", "that", "these", "those", "i", "me", "my", - "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", - "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", - "hers", "herself", "it", "its", "itself", "they", "them", "their", - "theirs", "themselves", -]; - -/// Extract keywords from a query string, filtering stop words. -fn extract_keywords(query: &str) -> Vec { - query - .to_lowercase() - .split(|c: char| !c.is_alphanumeric()) - .filter(|s| { - let s = *s; - !s.is_empty() && s.len() > 1 && !STOPWORDS.contains(&s) - }) - .map(String::from) - .collect() -} +use super::bm25::{Bm25Engine, Bm25Params, FieldDocument, FieldWeights}; + +// Re-export extract_keywords for other modules to use +pub use super::bm25::extract_keywords; /// Scoring strategy to use. #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] @@ -54,25 +27,9 @@ pub enum ScoringStrategy { Hybrid, } -/// BM25 parameters. -#[derive(Debug, Clone, Copy)] -pub struct Bm25Params { - /// Term frequency saturation parameter (k1). - pub k1: f32, - /// Length normalization parameter (b). - pub b: f32, -} - -impl Default for Bm25Params { - fn default() -> Self { - Self { - k1: 1.2, - b: 0.75, - } - } -} - /// Context for scoring calculations. 
+/// +/// Holds the query terms and other inputs used when scoring; it does not contain a `Bm25Engine`. #[derive(Debug, Clone)] pub struct ScoringContext { /// Query terms for keyword matching. @@ -413,4 +370,3 @@ mod tests { assert_eq!(scorer.context().strategy, ScoringStrategy::BM25); } } - From 52cfbf19c8b8d249c49dd0604775e062a2557fbd Mon Sep 17 00:00:00 2001 From: zTgx <747674262@qq.com> Date: Sun, 5 Apr 2026 22:57:56 +0800 Subject: [PATCH 2/2] chore(release): bump version from 0.1.15 to 0.1.16 - Update package version in Cargo.toml --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index fc0f24dd..13b31954 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vectorless" -version = "0.1.15" +version = "0.1.16" edition = "2024" authors = ["zTgx "] description = "Hierarchical, reasoning-native document intelligence engine"