Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions fraudcrawler/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
Host,
Language,
Location,
FilteredAtStage,
ProductItem,
HttpxAsyncClient,
)
Expand All @@ -52,6 +53,7 @@
"OpenAIClassificationUserInputs",
"Processor",
"Orchestrator",
"FilteredAtStage",
"ProductItem",
"FraudCrawlerClient",
"Language",
Expand Down
12 changes: 12 additions & 0 deletions fraudcrawler/base/base.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from enum import Enum
import json
import logging
from pydantic import (
Expand Down Expand Up @@ -135,6 +136,17 @@ class WebsiteSourceMetadata(BaseModel):
render_error: str | None = None


class FilteredAtStage(Enum):
    """Labels identifying the pipeline stage at which a product was filtered out.

    The ``.value`` strings are the human-readable texts assigned to
    ``product.filtered_at_stage`` when ``product.filtered`` is set.
    """

    # Search stage filters.
    SEARCH_COUNTRY_CODE = "Search (country code filtering)"
    SEARCH_EXCLUDED_URLS = "Search (excluded URLs filtering)"
    # URL-collection stage (deduplication against current/previous runs).
    URL_COLLECTION_CURRENT = "URL collection (current run deduplication)"
    URL_COLLECTION_PREVIOUS = "URL collection (previous run deduplication)"
    # Context stage filters.
    CONTEXT_PROBABILITY_THRESHOLD = "Context (Zyte probability threshold)"
    CONTEXT_EXACT_SEARCH = "Context (exact search)"


class ProductItem(BaseModel):
"""Model representing a product item."""

Expand Down
7 changes: 5 additions & 2 deletions fraudcrawler/base/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
Language,
Location,
Deepness,
FilteredAtStage,
ProductItem,
)
from fraudcrawler import (
Expand Down Expand Up @@ -189,7 +190,7 @@ async def _cntx_execute(
if not self._zyteapi.keep_product(details=details):
product.filtered = True
product.filtered_at_stage = (
"Context (Zyte probability threshold)"
FilteredAtStage.CONTEXT_PROBABILITY_THRESHOLD.value
)

# Check for exact match inside the full product context
Expand All @@ -200,7 +201,9 @@ async def _cntx_execute(
and not product.exact_search_match
):
product.filtered = True
product.filtered_at_stage = "Context (exact search)"
product.filtered_at_stage = (
FilteredAtStage.CONTEXT_EXACT_SEARCH.value
)

except Exception:
logger.error(
Expand Down
5 changes: 3 additions & 2 deletions fraudcrawler/scraping/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
Language,
Location,
DomainUtils,
FilteredAtStage,
WebsiteSourceMetadata,
)
from fraudcrawler.base.retry import get_async_retry
Expand Down Expand Up @@ -1417,13 +1418,13 @@ def _apply_filters(
# Check if the URL has a relevant country_code
if not self._relevant_country_code(url=result.url, country_code=location.code):
result.filtered = True
result.filtered_at_stage = "Search (country code filtering)"
result.filtered_at_stage = FilteredAtStage.SEARCH_COUNTRY_CODE.value
return result

# Check if the URL is in the excluded URLs
if excluded_urls and self._is_excluded_url(result.domain, excluded_urls):
result.filtered = True
result.filtered_at_stage = "Search (excluded URLs filtering)"
result.filtered_at_stage = FilteredAtStage.SEARCH_EXCLUDED_URLS.value
return result

return result
Expand Down
6 changes: 3 additions & 3 deletions fraudcrawler/scraping/url.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from urllib.parse import urlparse, parse_qsl, urlencode, quote, urlunparse, ParseResult

from fraudcrawler.settings import KNOWN_TRACKERS
from fraudcrawler.base.base import ProductItem
from fraudcrawler.base.base import FilteredAtStage, ProductItem

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -94,13 +94,13 @@ async def apply(self, product: ProductItem) -> ProductItem:
# deduplicate on current run
if url in self._collected_currently:
product.filtered = True
product.filtered_at_stage = "URL collection (current run deduplication)"
product.filtered_at_stage = FilteredAtStage.URL_COLLECTION_CURRENT.value
logger.debug(f"URL {url} already collected in current run")

# deduplicate on previous runs coming from a db
elif url in self._collected_previously:
product.filtered = True
product.filtered_at_stage = "URL collection (previous run deduplication)"
product.filtered_at_stage = FilteredAtStage.URL_COLLECTION_PREVIOUS.value
logger.debug(f"URL {url} was already collected in previous run")

# Add to currently collected URLs
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "fraudcrawler"
version = "0.8.11"
version = "0.8.12"
description = "Intelligent Market Monitoring"
authors = [
"Domingo Bertus <hello@veanu.ch>",
Expand Down
Loading