Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion src/current/algolia_index_intelligent_bloat_removal.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,41 @@ def is_duplicate_content(self, content: str) -> bool:
SEEN_CONTENT_HASHES.add(content_hash)
return False

def _is_version_spam(self, content: str, context: Dict[str, str]) -> bool:
"""Context-aware version filtering to distinguish spam from legitimate version references."""
# Version patterns to check
v_pattern = re.compile(r'^v\d+\.\d+(\.\d+)?(-beta\.\d+)?\s*$', re.IGNORECASE)
beta_pattern = re.compile(r'^beta-\d+\s*$', re.IGNORECASE)

# Check if content matches version patterns
is_v_version = v_pattern.match(content)
is_beta_version = beta_pattern.match(content)

if not (is_v_version or is_beta_version):
return False

# Context clues that indicate this is legitimate version content, not spam
page_url = context.get('url', '')

# ALWAYS preserve version numbers in release pages and version-specific content
if any(area in page_url for area in ['/releases/', 'release-notes', 'changelog']):
return False

# Handle beta versions - generally filter as spam unless in release context
if is_beta_version:
return True

# Handle v-versions based on length and complexity
if is_v_version:
# Preserve longer, more complex version strings
if len(content) > 8 or '-beta' in content: # e.g., "v26.1.0-beta.1"
return False

# Filter short version numbers outside release context (navigation spam)
return True

return False

def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool:
"""Intelligently determine if content is bloat while preserving valuable content."""
if not content or len(content.strip()) < MIN_CONTENT_LENGTH:
Expand All @@ -150,8 +185,15 @@ def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool
if pattern.search(content_clean):
return False

# 2. Check for exact bloat patterns
# 2. Context-aware version filtering before exact bloat patterns
if self._is_version_spam(content_clean, context):
return True

# 3. Check for exact bloat patterns (excluding version patterns handled above)
for pattern in self.exact_bloat_patterns:
# Skip version patterns since they're handled contextually above
if pattern.pattern.startswith(('^v\\d+', '^beta-')):
continue
if pattern.match(content_clean):
return True

Expand Down
Loading