From 17eaf9e52cd097fb64985ccf3f1963011c0a1459 Mon Sep 17 00:00:00 2001 From: chriguBERTO Date: Sat, 25 Apr 2026 22:24:00 +0200 Subject: [PATCH 1/3] Centralize filtered_at_stage texts --- fraudcrawler/__init__.py | 2 ++ fraudcrawler/base/base.py | 9 +++++++++ fraudcrawler/base/orchestrator.py | 7 +++---- fraudcrawler/scraping/url.py | 6 +++--- pyproject.toml | 2 +- 5 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fraudcrawler/__init__.py b/fraudcrawler/__init__.py index 3348cb6..50c5483 100644 --- a/fraudcrawler/__init__.py +++ b/fraudcrawler/__init__.py @@ -28,6 +28,7 @@ Host, Language, Location, + FilteredAtStage, ProductItem, HttpxAsyncClient, ) @@ -52,6 +53,7 @@ "OpenAIClassificationUserInputs", "Processor", "Orchestrator", + "FilteredAtStage", "ProductItem", "FraudCrawlerClient", "Language", diff --git a/fraudcrawler/base/base.py b/fraudcrawler/base/base.py index b48aab4..67baccc 100644 --- a/fraudcrawler/base/base.py +++ b/fraudcrawler/base/base.py @@ -1,3 +1,4 @@ +from enum import Enum import json import logging from pydantic import ( @@ -135,6 +136,14 @@ class WebsiteSourceMetadata(BaseModel): render_error: str | None = None +class FilteredAtStage(Enum): + """Enum for search engine names.""" + URL_COLLECTION_CURRENT = "URL collection (current run deduplication)" + URL_COLLECTION_PREVIOUS = "URL collection (previous run deduplication)" + CONTEXT_PROBABILITY_THRESHOLD = "Context (Zyte probability threshold)" + CONTEXT_EXACT_SEARCH = "Context (exact search)" + + class ProductItem(BaseModel): """Model representing a product item.""" diff --git a/fraudcrawler/base/orchestrator.py b/fraudcrawler/base/orchestrator.py index 4d71e61..22e7ef9 100644 --- a/fraudcrawler/base/orchestrator.py +++ b/fraudcrawler/base/orchestrator.py @@ -18,6 +18,7 @@ Language, Location, Deepness, + FilteredAtStage, ProductItem, ) from fraudcrawler import ( @@ -188,9 +189,7 @@ async def _cntx_execute( # Filter the product based on the probability threshold if not 
self._zyteapi.keep_product(details=details): product.filtered = True - product.filtered_at_stage = ( - "Context (Zyte probability threshold)" - ) + product.filtered_at_stage = FilteredAtStage.CONTEXT_PROBABILITY_THRESHOLD.value # Check for exact match inside the full product context product = self._check_exact_search(product=product) @@ -200,7 +199,7 @@ async def _cntx_execute( and not product.exact_search_match ): product.filtered = True - product.filtered_at_stage = "Context (exact search)" + product.filtered_at_stage = FilteredAtStage.CONTEXT_EXACT_SEARCH.value except Exception: logger.error( diff --git a/fraudcrawler/scraping/url.py b/fraudcrawler/scraping/url.py index ba41b3e..b107c0b 100644 --- a/fraudcrawler/scraping/url.py +++ b/fraudcrawler/scraping/url.py @@ -3,7 +3,7 @@ from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult from fraudcrawler.settings import KNOWN_TRACKERS -from fraudcrawler.base.base import ProductItem +from fraudcrawler.base.base import FilteredAtStage, ProductItem logger = logging.getLogger(__name__) @@ -94,13 +94,13 @@ async def apply(self, product: ProductItem) -> ProductItem: # deduplicate on current run if url in self._collected_currently: product.filtered = True - product.filtered_at_stage = "URL collection (current run deduplication)" + product.filtered_at_stage = FilteredAtStage.URL_COLLECTION_CURRENT.value logger.debug(f"URL {url} already collected in current run") # deduplicate on previous runs coming from a db elif url in self._collected_previously: product.filtered = True - product.filtered_at_stage = "URL collection (previous run deduplication)" + product.filtered_at_stage = FilteredAtStage.URL_COLLECTION_PREVIOUS.value logger.debug(f"URL {url} as already collected in previous run") # Add to currently collected URLs diff --git a/pyproject.toml b/pyproject.toml index 113dd23..4af9291 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" 
[tool.poetry] name = "fraudcrawler" -version = "0.8.11" +version = "0.8.12" description = "Intelligent Market Monitoring" authors = [ "Domingo Bertus ", From a29381147851beae5ee08e52f3c2b8de6476b76c Mon Sep 17 00:00:00 2001 From: chriguBERTO Date: Sat, 25 Apr 2026 22:26:30 +0200 Subject: [PATCH 2/3] Filtered at stage texts centralized --- fraudcrawler/base/base.py | 3 ++- fraudcrawler/base/orchestrator.py | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/fraudcrawler/base/base.py b/fraudcrawler/base/base.py index 67baccc..784a16b 100644 --- a/fraudcrawler/base/base.py +++ b/fraudcrawler/base/base.py @@ -137,7 +137,8 @@ class WebsiteSourceMetadata(BaseModel): class FilteredAtStage(Enum): - """Enum for search engine names.""" + """Enum for product.filtered_at_stage texts.""" + URL_COLLECTION_CURRENT = "URL collection (current run deduplication)" URL_COLLECTION_PREVIOUS = "URL collection (previous run deduplication)" CONTEXT_PROBABILITY_THRESHOLD = "Context (Zyte probability threshold)" diff --git a/fraudcrawler/base/orchestrator.py b/fraudcrawler/base/orchestrator.py index 22e7ef9..24ae002 100644 --- a/fraudcrawler/base/orchestrator.py +++ b/fraudcrawler/base/orchestrator.py @@ -189,7 +189,9 @@ async def _cntx_execute( # Filter the product based on the probability threshold if not self._zyteapi.keep_product(details=details): product.filtered = True - product.filtered_at_stage = FilteredAtStage.CONTEXT_PROBABILITY_THRESHOLD.value + product.filtered_at_stage = ( + FilteredAtStage.CONTEXT_PROBABILITY_THRESHOLD.value + ) # Check for exact match inside the full product context product = self._check_exact_search(product=product) @@ -199,7 +201,9 @@ async def _cntx_execute( and not product.exact_search_match ): product.filtered = True - product.filtered_at_stage = FilteredAtStage.CONTEXT_EXACT_SEARCH.value + product.filtered_at_stage = ( + FilteredAtStage.CONTEXT_EXACT_SEARCH.value + ) except Exception: logger.error( From 
9f8120d5e0bf34b5ca6fd3021e2d0b4087f11fff Mon Sep 17 00:00:00 2001 From: chriguBERTO Date: Sat, 25 Apr 2026 22:37:28 +0200 Subject: [PATCH 3/3] Search filtered_at_stage added --- fraudcrawler/base/base.py | 2 ++ fraudcrawler/scraping/search.py | 5 +++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fraudcrawler/base/base.py b/fraudcrawler/base/base.py index 784a16b..975aaf4 100644 --- a/fraudcrawler/base/base.py +++ b/fraudcrawler/base/base.py @@ -139,6 +139,8 @@ class WebsiteSourceMetadata(BaseModel): class FilteredAtStage(Enum): """Enum for product.filtered_at_stage texts.""" + SEARCH_COUNTRY_CODE = "Search (country code filtering)" + SEARCH_EXCLUDED_URLS = "Search (excluded URLs filtering)" URL_COLLECTION_CURRENT = "URL collection (current run deduplication)" URL_COLLECTION_PREVIOUS = "URL collection (previous run deduplication)" CONTEXT_PROBABILITY_THRESHOLD = "Context (Zyte probability threshold)" diff --git a/fraudcrawler/scraping/search.py b/fraudcrawler/scraping/search.py index feefb26..b7c8d19 100644 --- a/fraudcrawler/scraping/search.py +++ b/fraudcrawler/scraping/search.py @@ -30,6 +30,7 @@ Language, Location, DomainUtils, + FilteredAtStage, WebsiteSourceMetadata, ) from fraudcrawler.base.retry import get_async_retry @@ -1417,13 +1418,13 @@ def _apply_filters( # Check if the URL has a relevant country_code if not self._relevant_country_code(url=result.url, country_code=location.code): result.filtered = True - result.filtered_at_stage = "Search (country code filtering)" + result.filtered_at_stage = FilteredAtStage.SEARCH_COUNTRY_CODE.value return result # Check if the URL is in the excluded URLs if excluded_urls and self._is_excluded_url(result.domain, excluded_urls): result.filtered = True - result.filtered_at_stage = "Search (excluded URLs filtering)" + result.filtered_at_stage = FilteredAtStage.SEARCH_EXCLUDED_URLS.value return result return result