From 42f81bc920a6d898ef2dec30bb8be41409b7defc Mon Sep 17 00:00:00 2001
From: ebembi-crdb
Date: Fri, 24 Oct 2025 23:57:24 +0530
Subject: [PATCH] Fix version number search discrepancy between formats

Improve version number search by implementing context-aware filtering
that distinguishes between legitimate version references and navigation
spam.

Previously, searches for 'v26.1' returned significantly fewer results
than '26.1' due to overly aggressive version spam filtering.

The fix preserves version numbers in release pages, changelogs, and
complex version strings while still filtering UI navigation spam. This
ensures both search formats return equivalent, relevant results.

Resolves issue where version searches with 'v' prefix were less
discoverable than plain number searches.
---
 ...algolia_index_intelligent_bloat_removal.py | 44 ++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/src/current/algolia_index_intelligent_bloat_removal.py b/src/current/algolia_index_intelligent_bloat_removal.py
index 6a5d7befbc7..9ce1ee1ef62 100644
--- a/src/current/algolia_index_intelligent_bloat_removal.py
+++ b/src/current/algolia_index_intelligent_bloat_removal.py
@@ -137,6 +137,41 @@ def is_duplicate_content(self, content: str) -> bool:
         SEEN_CONTENT_HASHES.add(content_hash)
         return False
 
+    def _is_version_spam(self, content: str, context: Dict[str, str] = None) -> bool:
+        """Return True if content is only a version label (e.g. "v26.1", "beta-1") that should be filtered as spam."""
+        # Anchored patterns: content must consist solely of the version label
+        v_pattern = re.compile(r'^v\d+\.\d+(\.\d+)?(-beta\.\d+)?\s*$', re.IGNORECASE)
+        beta_pattern = re.compile(r'^beta-\d+\s*$', re.IGNORECASE)
+
+        # Check if content matches version patterns
+        is_v_version = v_pattern.match(content)
+        is_beta_version = beta_pattern.match(content)
+
+        if not (is_v_version or is_beta_version):
+            return False
+
+        # The page URL is the context clue; context may be None (is_bloat_content's default)
+        page_url = (context or {}).get('url') or ''
+
+        # ALWAYS preserve version numbers in release pages and version-specific content
+        if any(area in page_url for area in ['/releases/', 'release-notes', 'changelog']):
+            return False
+
+        # Handle beta versions - generally filter as spam unless in release context
+        if is_beta_version:
+            return True
+
+        # Handle v-versions based on length and complexity
+        if is_v_version:
+            # Preserve longer, more complex version strings
+            if len(content) > 8 or '-beta' in content:  # e.g., "v26.1.0-beta.1"
+                return False
+
+            # Filter short version numbers outside release context (navigation spam)
+            return True
+
+        return False
+
     def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool:
         """Intelligently determine if content is bloat while preserving valuable content."""
         if not content or len(content.strip()) < MIN_CONTENT_LENGTH:
@@ -150,8 +185,15 @@ def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool
             if pattern.search(content_clean):
                 return False
 
-        # 2. Check for exact bloat patterns
+        # 2. Context-aware version filtering before exact bloat patterns
+        if self._is_version_spam(content_clean, context):
+            return True
+
+        # 3. Check for exact bloat patterns (excluding version patterns handled above)
         for pattern in self.exact_bloat_patterns:
+            # Skip version patterns since they're handled contextually above
+            if pattern.pattern.startswith(('^v\\d+', '^beta-')):
+                continue
             if pattern.match(content_clean):
                 return True
 