diff --git a/src/current/algolia_index_intelligent_bloat_removal.py b/src/current/algolia_index_intelligent_bloat_removal.py index 6a5d7befbc7..9ce1ee1ef62 100644 --- a/src/current/algolia_index_intelligent_bloat_removal.py +++ b/src/current/algolia_index_intelligent_bloat_removal.py @@ -137,6 +137,41 @@ def is_duplicate_content(self, content: str) -> bool: SEEN_CONTENT_HASHES.add(content_hash) return False + def _is_version_spam(self, content: str, context: Dict[str, str]) -> bool: + """Context-aware version filtering to distinguish spam from legitimate version references.""" + # Version patterns to check + v_pattern = re.compile(r'^v\d+\.\d+(\.\d+)?(-beta\.\d+)?\s*$', re.IGNORECASE) + beta_pattern = re.compile(r'^beta-\d+\s*$', re.IGNORECASE) + + # Check if content matches version patterns + is_v_version = v_pattern.match(content) + is_beta_version = beta_pattern.match(content) + + if not (is_v_version or is_beta_version): + return False + + # Context clues that indicate this is legitimate version content, not spam + page_url = context.get('url', '') + + # ALWAYS preserve version numbers in release pages and version-specific content + if any(area in page_url for area in ['/releases/', 'release-notes', 'changelog']): + return False + + # Handle beta versions - generally filter as spam unless in release context + if is_beta_version: + return True + + # Handle v-versions based on length and complexity + if is_v_version: + # Preserve longer, more complex version strings + if len(content) > 8 or '-beta' in content: # e.g., "v26.1.0-beta.1" + return False + + # Filter short version numbers outside release context (navigation spam) + return True + + return False + def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool: """Intelligently determine if content is bloat while preserving valuable content.""" if not content or len(content.strip()) < MIN_CONTENT_LENGTH: @@ -150,8 +185,15 @@ def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool if pattern.search(content_clean): return False - # 2. Check for exact bloat patterns + # 2. Context-aware version filtering before exact bloat patterns + if self._is_version_spam(content_clean, context): + return True + + # 3. Check for exact bloat patterns (excluding version patterns handled above) for pattern in self.exact_bloat_patterns: + # Skip version patterns since they're handled contextually above + if pattern.pattern.startswith(('^v\\d+', '^beta-')): + continue if pattern.match(content_clean): return True