Skip to content

Commit 42f81bc

Browse files
author
ebembi-crdb
committed
Fix version number search discrepancy between formats
Improve version number search by implementing context-aware filtering that distinguishes between legitimate version references and navigation spam. Previously, searches for 'v26.1' returned significantly fewer results than '26.1' due to overly aggressive version spam filtering. The fix preserves version numbers in release pages, changelogs, and complex version strings while still filtering UI navigation spam. This ensures both search formats return equivalent, relevant results. Resolves issue where version searches with 'v' prefix were less discoverable than plain number searches.
1 parent f775435 commit 42f81bc

File tree

1 file changed

+43
-1
lines changed

1 file changed

+43
-1
lines changed

src/current/algolia_index_intelligent_bloat_removal.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,41 @@ def is_duplicate_content(self, content: str) -> bool:
137137
SEEN_CONTENT_HASHES.add(content_hash)
138138
return False
139139

140+
def _is_version_spam(self, content: str, context: Dict[str, str]) -> bool:
141+
"""Context-aware version filtering to distinguish spam from legitimate version references."""
142+
# Version patterns to check
143+
v_pattern = re.compile(r'^v\d+\.\d+(\.\d+)?(-beta\.\d+)?\s*$', re.IGNORECASE)
144+
beta_pattern = re.compile(r'^beta-\d+\s*$', re.IGNORECASE)
145+
146+
# Check if content matches version patterns
147+
is_v_version = v_pattern.match(content)
148+
is_beta_version = beta_pattern.match(content)
149+
150+
if not (is_v_version or is_beta_version):
151+
return False
152+
153+
# Context clues that indicate this is legitimate version content, not spam
154+
page_url = context.get('url', '')
155+
156+
# ALWAYS preserve version numbers in release pages and version-specific content
157+
if any(area in page_url for area in ['/releases/', 'release-notes', 'changelog']):
158+
return False
159+
160+
# Handle beta versions - generally filter as spam unless in release context
161+
if is_beta_version:
162+
return True
163+
164+
# Handle v-versions based on length and complexity
165+
if is_v_version:
166+
# Preserve longer, more complex version strings
167+
if len(content) > 8 or '-beta' in content: # e.g., "v26.1.0-beta.1"
168+
return False
169+
170+
# Filter short version numbers outside release context (navigation spam)
171+
return True
172+
173+
return False
174+
140175
def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool:
141176
"""Intelligently determine if content is bloat while preserving valuable content."""
142177
if not content or len(content.strip()) < MIN_CONTENT_LENGTH:
@@ -150,8 +185,15 @@ def is_bloat_content(self, content: str, context: Dict[str, str] = None) -> bool
150185
if pattern.search(content_clean):
151186
return False
152187

153-
# 2. Check for exact bloat patterns
188+
# 2. Context-aware version filtering before exact bloat patterns
189+
if self._is_version_spam(content_clean, context):
190+
return True
191+
192+
# 3. Check for exact bloat patterns (excluding version patterns handled above)
154193
for pattern in self.exact_bloat_patterns:
194+
# Skip version patterns since they're handled contextually above
195+
if pattern.pattern.startswith(('^v\\d+', '^beta-')):
196+
continue
155197
if pattern.match(content_clean):
156198
return True
157199

0 commit comments

Comments
 (0)