From 23a4802e31fc254d9a0396976721eec91e990768 Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 5 Apr 2026 22:57:16 +0800
Subject: [PATCH 1/2] feat: add BM25 scoring functionality with configurable
 parameters

- Add bm25 dependency with parallelism feature enabled
- Introduce comprehensive BM25 scoring module with per-field weighting support
- Implement configurable BM25 parameters (k1, b, avgdl) for fine-tuning
- Replace hardcoded BM25 parameters with Bm25Params struct
- Move stopwords and keyword extraction to shared bm25 module
- Add FieldWeights struct for different field importance (title, summary, content)
- Implement BM25 engine with fit-to-corpus functionality for accurate scoring
- Update RelevanceScorer to use new Bm25Params configuration

The new BM25 implementation provides:
- Per-field weighting for more accurate document scoring
- Configurable length normalization and term frequency saturation
- IDF caching for efficient scoring operations
- Query expansion scaffolding (`ExpandedQuery` type and `QueryExpander` trait) for enhanced search
---
 Cargo.toml                      |   3 +
 src/retrieval/content/scorer.rs | 165 +---------
 src/retrieval/search/bm25.rs    | 534 ++++++++++++++++++++++++++++++++
 src/retrieval/search/mod.rs     |   5 +
 src/retrieval/search/scorer.rs  |  56 +---
 5 files changed, 558 insertions(+), 205 deletions(-)
 create mode 100644 src/retrieval/search/bm25.rs
diff --git a/Cargo.toml b/Cargo.toml
index f063cfa9..fc0f24dd 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -80,6 +80,9 @@ roxmltree = "0.20"
 # Random number generation (for sampling)
 rand = "0.8"
 
+# BM25 scoring
+bm25 = { version = "2.3.2", features = ["parallelism"] }
+
 [dev-dependencies]
 tempfile = "3.10"
 tokio-test = "0.4"
diff --git a/src/retrieval/content/scorer.rs b/src/retrieval/content/scorer.rs
index 37bde7c9..d9d0dcbe 100644
--- a/src/retrieval/content/scorer.rs
+++ b/src/retrieval/content/scorer.rs
@@ -9,6 +9,7 @@
 use std::collections::HashMap;
 
 use crate::document::NodeId;
+use crate::retrieval::search::{extract_keywords, Bm25Params, STOPWORDS};
 use crate::util::estimate_tokens;
 
 use super::config::ScoringStrategyConfig;
@@ -130,8 +131,7 @@ pub struct RelevanceScorer {
     /// Scoring strategy to use.
     strategy: ScoringStrategyConfig,
     /// BM25 parameters.
-    k1: f32,
-    b: f32,
+    params: Bm25Params,
 }
 
 impl RelevanceScorer {
@@ -142,8 +142,7 @@ impl RelevanceScorer {
         Self {
             query_keywords,
             strategy,
-            k1: 1.2,
-            b: 0.75,
+            params: Bm25Params::default(),
         }
     }
 
@@ -153,8 +152,7 @@ impl RelevanceScorer {
         Self {
             query_keywords: keywords,
             strategy,
-            k1: 1.2,
-            b: 0.75,
+            params: Bm25Params::default(),
         }
     }
 
@@ -240,13 +238,15 @@ impl RelevanceScorer {
                 continue;
             }
 
-            // IDF calculation
+            // IDF calculation (Lucene-style `ln(1 + ratio)` form, not BM25L)
             let df = ctx.doc_freq.get(&term_lower).copied().unwrap_or(1) as f32;
             let idf = ((ctx.doc_count as f32 - df + 0.5) / (df + 0.5) + 1.0).ln();
 
             // BM25 formula
-            let numerator = tf * (self.k1 + 1.0);
-            let denominator = tf + self.k1 * (1.0 - self.b + self.b * doc_len / ctx.avg_doc_len);
+            let k1 = self.params.k1;
+            let b = self.params.b;
+            let numerator = tf * (k1 + 1.0);
+            let denominator = tf + k1 * (1.0 - b + b * doc_len / ctx.avg_doc_len);
 
             score += idf * numerator / denominator;
         }
@@ -263,145 +263,6 @@ impl RelevanceScorer {
     }
 }
 
-/// Extract keywords from a query string.
-fn extract_keywords(query: &str) -> Vec<String> {
-    // Common English stop words
-    const STOPWORDS: &[&str] = &[
-        "a",
-        "an",
-        "the",
-        "is",
-        "are",
-        "was",
-        "were",
-        "be",
-        "been",
-        "being",
-        "have",
-        "has",
-        "had",
-        "do",
-        "does",
-        "did",
-        "will",
-        "would",
-        "could",
-        "should",
-        "may",
-        "might",
-        "must",
-        "shall",
-        "can",
-        "need",
-        "dare",
-        "ought",
-        "used",
-        "to",
-        "of",
-        "in",
-        "for",
-        "on",
-        "with",
-        "at",
-        "by",
-        "from",
-        "as",
-        "into",
-        "through",
-        "during",
-        "before",
-        "after",
-        "above",
-        "below",
-        "between",
-        "under",
-        "again",
-        "further",
-        "then",
-        "once",
-        "here",
-        "there",
-        "when",
-        "where",
-        "why",
-        "how",
-        "all",
-        "each",
-        "few",
-        "more",
-        "most",
-        "other",
-        "some",
-        "such",
-        "no",
-        "nor",
-        "not",
-        "only",
-        "own",
-        "same",
-        "so",
-        "than",
-        "too",
-        "very",
-        "just",
-        "and",
-        "but",
-        "if",
-        "or",
-        "because",
-        "until",
-        "while",
-        "about",
-        "what",
-        "which",
-        "who",
-        "whom",
-        "this",
-        "that",
-        "these",
-        "those",
-        "i",
-        "me",
-        "my",
-        "myself",
-        "we",
-        "our",
-        "ours",
-        "ourselves",
-        "you",
-        "your",
-        "yours",
-        "yourself",
-        "yourselves",
-        "he",
-        "him",
-        "his",
-        "himself",
-        "she",
-        "her",
-        "hers",
-        "herself",
-        "it",
-        "its",
-        "itself",
-        "they",
-        "them",
-        "their",
-        "theirs",
-        "themselves",
-    ];
-
-    query
-        .to_lowercase()
-        .split(|c: char| !c.is_alphanumeric())
-        .filter(|s| {
-            let s = *s;
-            !s.is_empty() && s.len() > 1 && !STOPWORDS.contains(&s)
-        })
-        .map(String::from)
-        .collect()
-}
-
 /// Compute information density of content.
 fn compute_density(content: &str) -> f32 {
     let words: Vec<&str> = content.split_whitespace().collect();
@@ -409,13 +270,7 @@ fn compute_density(content: &str) -> f32 {
         return 0.0;
     }
 
-    // Stopword ratio (lower is better)
-    const STOPWORDS: &[&str] = &[
-        "a", "an", "the", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had",
-        "do", "does", "did", "will", "would", "could", "should", "may", "might", "must", "shall",
-        "can", "to", "of", "in", "for", "on", "with", "at", "by", "from", "and", "but", "or", "as",
-    ];
-
+    // Stopword ratio (lower is better), using the shared STOPWORDS from the bm25 module
     let stopword_count = words
         .iter()
         .filter(|w| STOPWORDS.contains(&w.to_lowercase().as_str()))
diff --git a/src/retrieval/search/bm25.rs b/src/retrieval/search/bm25.rs
new file mode 100644
index 00000000..26c311f5
--- /dev/null
+++ b/src/retrieval/search/bm25.rs
@@ -0,0 +1,534 @@
+// Copyright (c) 2026 vectorless developers
+// SPDX-License-Identifier: Apache-2.0
+
+//! BM25 scoring module using the `bm25` crate.
+//!
+//! This module provides:
+//! - Per-field weighting for document scoring
+//! - Configurable length normalization
+//! - IDF caching for efficient scoring
+//! - Query expansion types, plus the shared `STOPWORDS` list and `extract_keywords`
+
+use std::collections::HashMap;
+
+use bm25::{
+    Embedder, EmbedderBuilder, Embedding, Language, Scorer, ScoredDocument,
+    DefaultTokenizer, Tokenizer,
+};
+
+/// Field weights for BM25 scoring.
+///
+/// Each weight multiplies that field's score in the weighted average computed by
+/// `Bm25Engine::score`, so title matches can count for more than content matches.
+#[derive(Debug, Clone, Copy)]
+pub struct FieldWeights {
+    /// Weight for title field matches.
+    pub title: f32,
+    /// Weight for summary field matches.
+    pub summary: f32,
+    /// Weight for content field matches.
+    pub content: f32,
+}
+
+impl Default for FieldWeights {
+    fn default() -> Self {
+        Self {
+            title: 2.0,
+            summary: 1.5,
+            content: 1.0,
+        }
+    }
+}
+
+/// BM25 parameters for fine-tuning.
+#[derive(Debug, Clone, Copy)]
+pub struct Bm25Params {
+    /// Term frequency saturation parameter (k1).
+    /// Controls how quickly term frequency saturates.
+    /// Typical value: 1.2
+    pub k1: f32,
+    /// Length normalization parameter (b).
+    /// Controls how much document length affects scoring (typical value: 0.75):
+    ///
+    /// - 0.0: No length normalization
+    /// - 1.0: Full length normalization
+    pub b: f32,
+    /// Average document length, handed to the embedder by `Bm25Engine::with_params`.
+    /// If not known, can be estimated or set to 1.0 with b=0.
+    pub avgdl: f32,
+}
+
+impl Default for Bm25Params {
+    fn default() -> Self {
+        Self {
+            k1: 1.2,
+            b: 0.75,
+            avgdl: 100.0,
+        }
+    }
+}
+
+/// A document split into the three text fields the engine scores separately.
+#[derive(Debug, Clone)]
+pub struct FieldDocument<K> {
+    /// Identifier under which the document is indexed.
+    pub id: K,
+    /// Title text.
+    pub title: String,
+    /// Summary text.
+    pub summary: String,
+    /// Body content text.
+    pub content: String,
+}
+
+impl<K> FieldDocument<K> {
+    /// Build a document from its identifier and its three text fields.
+    pub fn new(id: K, title: String, summary: String, content: String) -> Self {
+        Self { id, title, summary, content }
+    }
+
+    /// Title, summary and content joined by single spaces (input for the combined embedding).
+    fn combined_text(&self) -> String {
+        [self.title.as_str(), self.summary.as_str(), self.content.as_str()].join(" ")
+    }
+}
+
+/// Key for field-specific document storage (not referenced anywhere in this module yet).
+#[derive(Debug, Clone, Hash, Eq, PartialEq)]
+struct FieldKey<K> {
+    doc_id: K,
+    field: Field,
+}
+
+#[derive(Debug, Clone, Copy, Hash, Eq, PartialEq)]
+enum Field {
+    Title,
+    Summary,
+    Content,
+}
+
+/// BM25 engine with per-field weighting support.
+///
+/// This wraps the `bm25` crate's Embedder and Scorer to provide:
+/// - Per-field weighting
+/// - Configurable parameters
+/// - IDF caching (handled internally by Scorer)
+pub struct Bm25Engine<K> {
+    /// The embedder for creating sparse vectors.
+    embedder: Embedder,
+    /// Scorer over the combined title + summary + content text (used by `search`).
+    scorer: Scorer<K>,
+    /// One scorer per field; their scores are blended using `weights`.
+    title_scorer: Scorer<K>,
+    summary_scorer: Scorer<K>,
+    content_scorer: Scorer<K>,
+    /// Field weights.
+    weights: FieldWeights,
+    /// Running count of indexed documents, adjusted by `upsert` and `remove`.
+    doc_count: usize,
+    /// `true` only when the engine was built via `fit_to_corpus`.
+    fitted: bool,
+}
+
+impl<K: std::hash::Hash + Eq + Clone + std::fmt::Debug> Bm25Engine<K> {
+    /// Create a new BM25 engine with default parameters.
+    pub fn new() -> Self {
+        Self::with_params(Bm25Params::default())
+    }
+
+    /// Create an empty, unfitted BM25 engine whose English embedder uses `params`.
+    pub fn with_params(params: Bm25Params) -> Self {
+        let embedder = EmbedderBuilder::with_avgdl(params.avgdl)
+            .k1(params.k1)
+            .b(params.b)
+            .language_mode(Language::English)
+            .build();
+
+        Self {
+            embedder,
+            scorer: Scorer::new(),
+            title_scorer: Scorer::new(),
+            summary_scorer: Scorer::new(),
+            content_scorer: Scorer::new(),
+            weights: FieldWeights::default(),
+            doc_count: 0,
+            fitted: false,
+        }
+    }
+
+    /// Create a BM25 engine fitted to a corpus.
+    ///
+    /// The embedder is fitted on the combined title/summary/content text of each
+    /// document to calculate the true average document length, and every document
+    /// is then indexed.
+    /// `k1` and `b` are not set here, so the builder's defaults apply.
+    pub fn fit_to_corpus(documents: &[FieldDocument<K>]) -> Self {
+        // The builder takes `&str`s, so keep the owned combined texts alive alongside.
+        let texts: Vec<String> = documents.iter().map(FieldDocument::combined_text).collect();
+        let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
+
+        let fitted_embedder =
+            EmbedderBuilder::with_fit_to_corpus(Language::English, &text_refs).build();
+
+        // Start from an empty index, marked as fitted.
+        let mut engine = Self {
+            embedder: fitted_embedder,
+            scorer: Scorer::new(),
+            title_scorer: Scorer::new(),
+            summary_scorer: Scorer::new(),
+            content_scorer: Scorer::new(),
+            weights: FieldWeights::default(),
+            doc_count: 0,
+            fitted: true,
+        };
+
+        // Index every document; `upsert` maintains `doc_count` as it goes.
+        documents.iter().for_each(|doc| engine.upsert(doc));
+
+        engine
+    }
+
+    /// Set field weights.
+    pub fn with_weights(mut self, weights: FieldWeights) -> Self {
+        self.weights = weights;
+        self
+    }
+
+    /// Set tokenizer language. NOTE(review): rebuilds the embedder from `avgdl` alone (custom k1/b lost, no re-index).
+    pub fn with_language(mut self, language: Language) -> Self {
+        self.embedder = EmbedderBuilder::with_avgdl(self.embedder.avgdl())
+            .language_mode(language)
+            .build();
+        self
+    }
+
+    /// Get the average document length.
+    pub fn avgdl(&self) -> f32 {
+        self.embedder.avgdl()
+    }
+
+    /// Check if the engine has been fitted to a corpus.
+    pub fn is_fitted(&self) -> bool {
+        self.fitted
+    }
+
+    /// Upsert a document into the index.
+    ///
+    /// Stores one embedding per field for weighted scoring plus a combined one.
+    /// `doc_count` grows only for ids not indexed before, so re-upserting replaces in place.
+    pub fn upsert(&mut self, document: &FieldDocument<K>) {
+        let id = &document.id;
+        // Embed each field separately, plus the combined text for basic search.
+        let title_emb = self.embedder.embed(&document.title);
+        let summary_emb = self.embedder.embed(&document.summary);
+        let content_emb = self.embedder.embed(&document.content);
+        let combined = self.embedder.embed(&document.combined_text());
+
+        // `Scorer::score` yields `None` for unknown ids; probe before overwriting.
+        let is_new = self.scorer.score(id, &combined).is_none();
+
+        self.title_scorer.upsert(id, title_emb);
+        self.summary_scorer.upsert(id, summary_emb);
+        self.content_scorer.upsert(id, content_emb);
+        self.scorer.upsert(id, combined);
+        self.doc_count += usize::from(is_new);
+    }
+
+    /// Remove a document from the index. NOTE(review): `doc_count` is decremented even for unknown ids.
+    pub fn remove(&mut self, id: &K) {
+        self.scorer.remove(id);
+        self.title_scorer.remove(id);
+        self.summary_scorer.remove(id);
+        self.content_scorer.remove(id);
+        self.doc_count = self.doc_count.saturating_sub(1);
+    }
+
+    /// Get the number of indexed documents.
+    pub fn len(&self) -> usize {
+        self.doc_count
+    }
+
+    /// Check if the index is empty.
+    pub fn is_empty(&self) -> bool {
+        self.doc_count == 0
+    }
+
+    /// Score a single document against a query.
+    ///
+    /// Returns None if the document is not in the index.
+    pub fn score(&self, id: &K, query: &str) -> Option<f32> {
+        let query_emb = self.embedder.embed(query);
+        let FieldWeights { title, summary, content } = self.weights;
+
+        // Weighted sum of the per-field scores; `?` bails out on an unknown id.
+        let weighted_sum = title * self.title_scorer.score(id, &query_emb)?
+            + summary * self.summary_scorer.score(id, &query_emb)?
+            + content * self.content_scorer.score(id, &query_emb)?;
+
+        // Normalize by the total weight. All-zero weights used to yield 0/0 = NaN;
+        // dividing by 1 instead returns a plain 0.0 score.
+        let total = title + summary + content;
+        let norm = if total == 0.0 { 1.0 } else { total };
+
+        Some(weighted_sum / norm)
+    }
+
+    /// Search for documents matching a query.
+    ///
+    /// Returns documents sorted by score (descending).
+    pub fn search(&self, query: &str, limit: usize) -> Vec<ScoredDocument<K>> {
+        let query_emb = self.embedder.embed(query);
+        self.scorer.matches(&query_emb).into_iter().take(limit).collect()
+    }
+
+    /// Search with per-field weighting.
+    ///
+    /// This is slower but provides more accurate weighted scores: each document matched
+    /// by the combined-text scorer is re-scored from its per-field scores.
+    pub fn search_weighted(&self, query: &str, limit: usize) -> Vec<(K, f32)> {
+        let query_emb = self.embedder.embed(query);
+
+        // Loop-invariant normalizer. All-zero weights used to yield 0/0 = NaN;
+        // dividing by 1 instead gives every candidate a plain 0.0 score.
+        let FieldWeights { title, summary, content } = self.weights;
+        let total = title + summary + content;
+        let norm = if total == 0.0 { 1.0 } else { total };
+
+        let mut scored: Vec<(K, f32)> = self
+            .scorer
+            .matches(&query_emb)
+            .into_iter()
+            .filter_map(|candidate| {
+                let id = candidate.id;
+                // `?` drops a candidate missing from any per-field scorer.
+                let weighted_sum = title * self.title_scorer.score(&id, &query_emb)?
+                    + summary * self.summary_scorer.score(&id, &query_emb)?
+                    + content * self.content_scorer.score(&id, &query_emb)?;
+                Some((id, weighted_sum / norm))
+            })
+            .collect();
+
+        // `total_cmp` is a total order, so a NaN score cannot scramble the ranking.
+        scored.sort_by(|a, b| b.1.total_cmp(&a.1));
+        scored.truncate(limit);
+        scored
+    }
+
+    /// Tokenize `text` with normalization, stopword removal and stemming (always English).
+    pub fn tokenize(&self, text: &str) -> Vec<String> {
+        let tokenizer = DefaultTokenizer::builder()
+            .language_mode(Language::English)
+            .normalization(true)
+            .stopwords(true)
+            .stemming(true)
+            .build();
+        tokenizer.tokenize(text)
+    }
+
+    /// Get the underlying embedder.
+    pub fn embedder(&self) -> &Embedder {
+        &self.embedder
+    }
+
+    /// Get mutable access to the embedder.
+    pub fn embedder_mut(&mut self) -> &mut Embedder {
+        &mut self.embedder
+    }
+}
+
+impl<K: std::hash::Hash + Eq + Clone + std::fmt::Debug> Default for Bm25Engine<K> {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+/// Query expansion result from LLM.
+#[derive(Debug, Clone)]
+pub struct ExpandedQuery {
+    /// Original query.
+    pub original: String,
+    /// Expanded terms.
+    pub expansions: Vec<String>,
+    /// Combined query (original + expansions), space-separated and trimmed.
+    pub combined: String,
+}
+
+impl ExpandedQuery {
+    /// Create a new expanded query; `combined` has no stray space when `expansions` is empty.
+    pub fn new(original: String, expansions: Vec<String>) -> Self {
+        let combined = format!("{} {}", original, expansions.join(" ")).trim().to_owned();
+        Self { original, expansions, combined }
+    }
+}
+
+/// Asynchronous query expander trait (intended for LLM-based expansion).
+#[async_trait::async_trait]
+pub trait QueryExpander: Send + Sync {
+    /// Expand `query` into an `ExpandedQuery` (original text plus related terms).
+    async fn expand(&self, query: &str) -> ExpandedQuery;
+}
+
+/// Common English stop words, all lowercase; lowercase a token before looking it up.
+pub const STOPWORDS: &[&str] = &[
+    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
+    "have", "has", "had", "do", "does", "did", "will", "would", "could",
+    "should", "may", "might", "must", "shall", "can", "need", "dare",
+    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
+    "from", "as", "into", "through", "during", "before", "after", "above",
+    "below", "between", "under", "again", "further", "then", "once",
+    "here", "there", "when", "where", "why", "how", "all", "each", "few",
+    "more", "most", "other", "some", "such", "no", "nor", "not", "only",
+    "own", "same", "so", "than", "too", "very", "just", "and", "but",
+    "if", "or", "because", "until", "while", "about", "what", "which",
+    "who", "whom", "this", "that", "these", "those", "i", "me", "my",
+    "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
+    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her",
+    "hers", "herself", "it", "its", "itself", "they", "them", "their",
+    "theirs", "themselves",
+];
+
+/// Pull searchable keywords out of a free-text query.
+///
+/// The query is lowercased and split on every non-alphanumeric character.
+/// A fragment is kept only when it is longer than one byte (which drops
+/// empty fragments and single ASCII characters) and is not in [`STOPWORDS`].
+/// Keywords are returned in query order; duplicates are preserved.
+#[must_use]
+pub fn extract_keywords(query: &str) -> Vec<String> {
+    let lowered = query.to_lowercase();
+    let mut keywords = Vec::new();
+    for fragment in lowered.split(|c: char| !c.is_alphanumeric()) {
+        // `len()` counts bytes, so this also rejects the empty fragments
+        // produced by consecutive separators.
+        if fragment.len() > 1 && !STOPWORDS.contains(&fragment) {
+            keywords.push(fragment.to_owned());
+        }
+    }
+    keywords
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_bm25_engine_creation() {
+        let engine: Bm25Engine<u32> = Bm25Engine::new();
+        assert!(engine.is_empty());
+        assert!(!engine.is_fitted());
+    }
+
+    #[test]
+    fn test_bm25_engine_fit_to_corpus() {
+        let docs = vec![
+            FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language.".to_string()),
+            FieldDocument::new(2u32, "Python Guide".to_string(), "About Python".to_string(), "Python is a scripting language.".to_string()),
+        ];
+
+        let engine = Bm25Engine::fit_to_corpus(&docs);
+        assert!(engine.is_fitted());
+        assert_eq!(engine.len(), 2);
+    }
+
+    #[test]
+    fn test_bm25_search() {
+        let docs = vec![
+            FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language with memory safety.".to_string()),
+            FieldDocument::new(2u32, "Python Guide".to_string(), "About Python".to_string(), "Python is a scripting language for data science.".to_string()),
+            FieldDocument::new(3u32, "Rust Memory Safety".to_string(), "Memory in Rust".to_string(), "Rust provides guaranteed memory safety without garbage collection.".to_string()),
+        ];
+
+        let engine = Bm25Engine::fit_to_corpus(&docs);
+        let results = engine.search("rust memory", 10);
+
+        assert!(!results.is_empty());
+        // At least one of the Rust documents (ids 1 and 3) must be among the results
+        assert!(results.iter().any(|r| r.id == 1 || r.id == 3));
+    }
+
+    #[test]
+    fn test_bm25_weighted_search() {
+        let docs = vec![
+            FieldDocument::new(1u32, "Rust Programming".to_string(), "About memory safety".to_string(), "Content about other things.".to_string()),
+            FieldDocument::new(2u32, "Other Language".to_string(), "About other things".to_string(), "Rust memory safety is important.".to_string()),
+        ];
+
+        let engine = Bm25Engine::fit_to_corpus(&docs)
+            .with_weights(FieldWeights {
+                title: 3.0,
+                summary: 2.0,
+                content: 1.0,
+            });
+
+        let results = engine.search_weighted("rust", 10);
+
+        // Doc 1 has "Rust" in title, should rank higher
+        assert_eq!(results.first().map(|(id, _)| *id), Some(1u32));
+    }
+
+    #[test]
+    fn test_bm25_score() {
+        let docs = vec![
+            FieldDocument::new(1u32, "Rust Programming".to_string(), "About Rust".to_string(), "Rust is a systems programming language.".to_string()),
+        ];
+
+        let engine = Bm25Engine::fit_to_corpus(&docs);
+        let score = engine.score(&1u32, "rust programming");
+
+        assert!(score.is_some());
+        assert!(score.unwrap() > 0.0);
+    }
+
+    #[test]
+    fn test_bm25_tokenize() {
+        let engine: Bm25Engine<u32> = Bm25Engine::new();
+        let tokens = engine.tokenize("What is the Rust programming language?");
+
+        // Should filter stop words and stem
+        assert!(tokens.contains(&"rust".to_string()));
+        assert!(tokens.contains(&"program".to_string())); // stemmed
+        assert!(!tokens.contains(&"what".to_string())); // stop word
+        assert!(!tokens.contains(&"the".to_string())); // stop word
+    }
+
+    #[test]
+    fn test_bm25_remove() {
+        let docs = vec![
+            FieldDocument::new(1u32, "Rust".to_string(), "About Rust".to_string(), "Rust content.".to_string()),
+        ];
+
+        let mut engine = Bm25Engine::fit_to_corpus(&docs);
+        assert_eq!(engine.len(), 1);
+
+        engine.remove(&1u32);
+        assert!(engine.is_empty());
+    }
+
+    #[test]
+    fn test_field_weights_default() {
+        let weights = FieldWeights::default();
+        assert!((weights.title - 2.0).abs() < f32::EPSILON);
+        assert!((weights.summary - 1.5).abs() < f32::EPSILON);
+        assert!((weights.content - 1.0).abs() < f32::EPSILON);
+    }
+
+    #[test]
+    fn test_bm25_params_default() {
+        let params = Bm25Params::default();
+        assert!((params.k1 - 1.2).abs() < f32::EPSILON);
+        assert!((params.b - 0.75).abs() < f32::EPSILON);
+        assert!((params.avgdl - 100.0).abs() < f32::EPSILON);
+    }
+
+    #[test]
+    fn test_expanded_query() {
+        let expanded = ExpandedQuery::new(
+            "rust".to_string(),
+            vec!["programming".to_string(), "language".to_string()],
+        );
+
+        assert_eq!(expanded.original, "rust");
+        assert_eq!(expanded.expansions.len(), 2);
+        assert_eq!(expanded.combined, "rust programming language");
+    }
+}
diff --git a/src/retrieval/search/mod.rs b/src/retrieval/search/mod.rs
index dca0e04e..3b00655d 100644
--- a/src/retrieval/search/mod.rs
+++ b/src/retrieval/search/mod.rs
@@ -4,12 +4,17 @@
 //! Search algorithms for tree traversal.
 
 mod beam;
+mod bm25;
 mod greedy;
 mod mcts;
 mod scorer;
 mod r#trait;
 
 pub use beam::BeamSearch;
+pub use bm25::{
+    extract_keywords, Bm25Engine, Bm25Params, ExpandedQuery, FieldDocument, FieldWeights,
+    QueryExpander, STOPWORDS,
+};
 pub use greedy::GreedySearch;
 pub use mcts::MctsSearch;
 pub use scorer::{NodeScorer, ScoringContext};
diff --git a/src/retrieval/search/scorer.rs b/src/retrieval/search/scorer.rs
index 72080a6b..f17bf118 100644
--- a/src/retrieval/search/scorer.rs
+++ b/src/retrieval/search/scorer.rs
@@ -10,37 +10,10 @@ use std::collections::HashMap;
 
 use crate::document::{DocumentTree, NodeId};
 
-/// Common English stop words for keyword filtering.
-const STOPWORDS: &[&str] = &[
-    "a", "an", "the", "is", "are", "was", "were", "be", "been", "being",
-    "have", "has", "had", "do", "does", "did", "will", "would", "could",
-    "should", "may", "might", "must", "shall", "can", "need", "dare",
-    "ought", "used", "to", "of", "in", "for", "on", "with", "at", "by",
-    "from", "as", "into", "through", "during", "before", "after", "above",
-    "below", "between", "under", "again", "further", "then", "once",
-    "here", "there", "when", "where", "why", "how", "all", "each", "few",
-    "more", "most", "other", "some", "such", "no", "nor", "not", "only",
-    "own", "same", "so", "than", "too", "very", "just", "and", "but",
-    "if", "or", "because", "until", "while", "about", "what", "which",
-    "who", "whom", "this", "that", "these", "those", "i", "me", "my",
-    "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
-    "yourself", "yourselves", "he", "him", "his", "himself", "she", "her",
-    "hers", "herself", "it", "its", "itself", "they", "them", "their",
-    "theirs", "themselves",
-];
-
-/// Extract keywords from a query string, filtering stop words.
-fn extract_keywords(query: &str) -> Vec<String> {
-    query
-        .to_lowercase()
-        .split(|c: char| !c.is_alphanumeric())
-        .filter(|s| {
-            let s = *s;
-            !s.is_empty() && s.len() > 1 && !STOPWORDS.contains(&s)
-        })
-        .map(String::from)
-        .collect()
-}
+use super::bm25::{Bm25Engine, Bm25Params, FieldDocument, FieldWeights};
+
+// Re-export extract_keywords for other modules to use
+pub use super::bm25::extract_keywords;
 
 /// Scoring strategy to use.
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
@@ -54,25 +27,9 @@ pub enum ScoringStrategy {
     Hybrid,
 }
 
-/// BM25 parameters.
-#[derive(Debug, Clone, Copy)]
-pub struct Bm25Params {
-    /// Term frequency saturation parameter (k1).
-    pub k1: f32,
-    /// Length normalization parameter (b).
-    pub b: f32,
-}
-
-impl Default for Bm25Params {
-    fn default() -> Self {
-        Self {
-            k1: 1.2,
-            b: 0.75,
-        }
-    }
-}
-
 /// Context for scoring calculations.
+///
+/// Holds the query terms and the scoring settings (e.g. strategy) used by the scorer.
 #[derive(Debug, Clone)]
 pub struct ScoringContext {
     /// Query terms for keyword matching.
@@ -413,4 +370,3 @@ mod tests {
         assert_eq!(scorer.context().strategy, ScoringStrategy::BM25);
     }
 }
-

From 52cfbf19c8b8d249c49dd0604775e062a2557fbd Mon Sep 17 00:00:00 2001
From: zTgx <747674262@qq.com>
Date: Sun, 5 Apr 2026 22:57:56 +0800
Subject: [PATCH 2/2] chore(release): bump version from 0.1.15 to 0.1.16

- Update package version in Cargo.toml
---
 Cargo.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Cargo.toml b/Cargo.toml
index fc0f24dd..13b31954 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "vectorless"
-version = "0.1.15"
+version = "0.1.16"
 edition = "2024"
 authors = ["zTgx <beautifularea@gmail.com>"]
 description = "Hierarchical, reasoning-native document intelligence engine"