From 17eaf9e52cd097fb64985ccf3f1963011c0a1459 Mon Sep 17 00:00:00 2001
From: chriguBERTO <christoph.jaeggli@gmail.com>
Date: Sat, 25 Apr 2026 22:24:00 +0200
Subject: [PATCH 1/3] Centralize filtered_at_stage texts in FilteredAtStage enum

---
 fraudcrawler/__init__.py          | 2 ++
 fraudcrawler/base/base.py         | 9 +++++++++
 fraudcrawler/base/orchestrator.py | 7 +++----
 fraudcrawler/scraping/url.py      | 6 +++---
 pyproject.toml                    | 2 +-
 5 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/fraudcrawler/__init__.py b/fraudcrawler/__init__.py
index 3348cb6..50c5483 100644
--- a/fraudcrawler/__init__.py
+++ b/fraudcrawler/__init__.py
@@ -28,6 +28,7 @@
     Host,
     Language,
     Location,
+    FilteredAtStage,
     ProductItem,
     HttpxAsyncClient,
 )
@@ -52,6 +53,7 @@
     "OpenAIClassificationUserInputs",
     "Processor",
     "Orchestrator",
+    "FilteredAtStage",
     "ProductItem",
     "FraudCrawlerClient",
     "Language",
diff --git a/fraudcrawler/base/base.py b/fraudcrawler/base/base.py
index b48aab4..67baccc 100644
--- a/fraudcrawler/base/base.py
+++ b/fraudcrawler/base/base.py
@@ -1,3 +1,4 @@
+from enum import Enum
 import json
 import logging
 from pydantic import (
@@ -135,6 +136,14 @@ class WebsiteSourceMetadata(BaseModel):
     render_error: str | None = None
 
 
+class FilteredAtStage(Enum):
+    """Enum for search engine names."""
+    URL_COLLECTION_CURRENT = "URL collection (current run deduplication)"
+    URL_COLLECTION_PREVIOUS = "URL collection (previous run deduplication)"
+    CONTEXT_PROBABILITY_THRESHOLD = "Context (Zyte probability threshold)"
+    CONTEXT_EXACT_SEARCH = "Context (exact search)"
+
+
 class ProductItem(BaseModel):
     """Model representing a product item."""
 
diff --git a/fraudcrawler/base/orchestrator.py b/fraudcrawler/base/orchestrator.py
index 4d71e61..22e7ef9 100644
--- a/fraudcrawler/base/orchestrator.py
+++ b/fraudcrawler/base/orchestrator.py
@@ -18,6 +18,7 @@
     Language,
     Location,
     Deepness,
+    FilteredAtStage,
     ProductItem,
 )
 from fraudcrawler import (
@@ -188,9 +189,7 @@ async def _cntx_execute(
                     # Filter the product based on the probability threshold
                     if not self._zyteapi.keep_product(details=details):
                         product.filtered = True
-                        product.filtered_at_stage = (
-                            "Context (Zyte probability threshold)"
-                        )
+                        product.filtered_at_stage = FilteredAtStage.CONTEXT_PROBABILITY_THRESHOLD.value
 
                     # Check for exact match inside the full product context
                     product = self._check_exact_search(product=product)
@@ -200,7 +199,7 @@ async def _cntx_execute(
                         and not product.exact_search_match
                     ):
                         product.filtered = True
-                        product.filtered_at_stage = "Context (exact search)"
+                        product.filtered_at_stage = FilteredAtStage.CONTEXT_EXACT_SEARCH.value
 
                 except Exception:
                     logger.error(
diff --git a/fraudcrawler/scraping/url.py b/fraudcrawler/scraping/url.py
index ba41b3e..b107c0b 100644
--- a/fraudcrawler/scraping/url.py
+++ b/fraudcrawler/scraping/url.py
@@ -3,7 +3,7 @@
 from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult
 
 from fraudcrawler.settings import KNOWN_TRACKERS
-from fraudcrawler.base.base import ProductItem
+from fraudcrawler.base.base import FilteredAtStage, ProductItem
 
 logger = logging.getLogger(__name__)
 
@@ -94,13 +94,13 @@ async def apply(self, product: ProductItem) -> ProductItem:
         # deduplicate on current run
         if url in self._collected_currently:
             product.filtered = True
-            product.filtered_at_stage = "URL collection (current run deduplication)"
+            product.filtered_at_stage = FilteredAtStage.URL_COLLECTION_CURRENT.value
             logger.debug(f"URL {url} already collected in current run")
 
         # deduplicate on previous runs coming from a db
         elif url in self._collected_previously:
             product.filtered = True
-            product.filtered_at_stage = "URL collection (previous run deduplication)"
+            product.filtered_at_stage = FilteredAtStage.URL_COLLECTION_PREVIOUS.value
             logger.debug(f"URL {url} as already collected in previous run")
 
         # Add to currently collected URLs
diff --git a/pyproject.toml b/pyproject.toml
index 113dd23..4af9291 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "fraudcrawler"
-version = "0.8.11"
+version = "0.8.12"
 description = "Intelligent Market Monitoring"
 authors = [
     "Domingo Bertus <hello@veanu.ch>",

From a29381147851beae5ee08e52f3c2b8de6476b76c Mon Sep 17 00:00:00 2001
From: chriguBERTO <christoph.jaeggli@gmail.com>
Date: Sat, 25 Apr 2026 22:26:30 +0200
Subject: [PATCH 2/3] Fix FilteredAtStage docstring and wrap long assignments

---
 fraudcrawler/base/base.py         | 3 ++-
 fraudcrawler/base/orchestrator.py | 8 ++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/fraudcrawler/base/base.py b/fraudcrawler/base/base.py
index 67baccc..784a16b 100644
--- a/fraudcrawler/base/base.py
+++ b/fraudcrawler/base/base.py
@@ -137,7 +137,8 @@ class WebsiteSourceMetadata(BaseModel):
 
 
 class FilteredAtStage(Enum):
-    """Enum for search engine names."""
+    """Enum for product.filtered_at_stage texts."""
+
     URL_COLLECTION_CURRENT = "URL collection (current run deduplication)"
     URL_COLLECTION_PREVIOUS = "URL collection (previous run deduplication)"
     CONTEXT_PROBABILITY_THRESHOLD = "Context (Zyte probability threshold)"
diff --git a/fraudcrawler/base/orchestrator.py b/fraudcrawler/base/orchestrator.py
index 22e7ef9..24ae002 100644
--- a/fraudcrawler/base/orchestrator.py
+++ b/fraudcrawler/base/orchestrator.py
@@ -189,7 +189,9 @@ async def _cntx_execute(
                     # Filter the product based on the probability threshold
                     if not self._zyteapi.keep_product(details=details):
                         product.filtered = True
-                        product.filtered_at_stage = FilteredAtStage.CONTEXT_PROBABILITY_THRESHOLD.value
+                        product.filtered_at_stage = (
+                            FilteredAtStage.CONTEXT_PROBABILITY_THRESHOLD.value
+                        )
 
                     # Check for exact match inside the full product context
                     product = self._check_exact_search(product=product)
@@ -199,7 +201,9 @@ async def _cntx_execute(
                         and not product.exact_search_match
                     ):
                         product.filtered = True
-                        product.filtered_at_stage = FilteredAtStage.CONTEXT_EXACT_SEARCH.value
+                        product.filtered_at_stage = (
+                            FilteredAtStage.CONTEXT_EXACT_SEARCH.value
+                        )
 
                 except Exception:
                     logger.error(

From 9f8120d5e0bf34b5ca6fd3021e2d0b4087f11fff Mon Sep 17 00:00:00 2001
From: chriguBERTO <christoph.jaeggli@gmail.com>
Date: Sat, 25 Apr 2026 22:37:28 +0200
Subject: [PATCH 3/3] Search filtered_at_stage added

---
 fraudcrawler/base/base.py       | 2 ++
 fraudcrawler/scraping/search.py | 5 +++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fraudcrawler/base/base.py b/fraudcrawler/base/base.py
index 784a16b..975aaf4 100644
--- a/fraudcrawler/base/base.py
+++ b/fraudcrawler/base/base.py
@@ -139,6 +139,8 @@ class WebsiteSourceMetadata(BaseModel):
 class FilteredAtStage(Enum):
     """Enum for product.filtered_at_stage texts."""
 
+    SEARCH_COUNTRY_CODE = "Search (country code filtering)"
+    SEARCH_EXCLUDED_URLS = "Search (excluded URLs filtering)"
     URL_COLLECTION_CURRENT = "URL collection (current run deduplication)"
     URL_COLLECTION_PREVIOUS = "URL collection (previous run deduplication)"
     CONTEXT_PROBABILITY_THRESHOLD = "Context (Zyte probability threshold)"
diff --git a/fraudcrawler/scraping/search.py b/fraudcrawler/scraping/search.py
index feefb26..b7c8d19 100644
--- a/fraudcrawler/scraping/search.py
+++ b/fraudcrawler/scraping/search.py
@@ -30,6 +30,7 @@
     Language,
     Location,
     DomainUtils,
+    FilteredAtStage,
     WebsiteSourceMetadata,
 )
 from fraudcrawler.base.retry import get_async_retry
@@ -1417,13 +1418,13 @@ def _apply_filters(
         # Check if the URL has a relevant country_code
         if not self._relevant_country_code(url=result.url, country_code=location.code):
             result.filtered = True
-            result.filtered_at_stage = "Search (country code filtering)"
+            result.filtered_at_stage = FilteredAtStage.SEARCH_COUNTRY_CODE.value
             return result
 
         # Check if the URL is in the excluded URLs
         if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
             result.filtered = True
-            result.filtered_at_stage = "Search (excluded URLs filtering)"
+            result.filtered_at_stage = FilteredAtStage.SEARCH_EXCLUDED_URLS.value
             return result
 
         return result