From 694ba44a04e512090fc4b1dd0235bf8485fa9ace Mon Sep 17 00:00:00 2001
From: ChiragBellara
Date: Fri, 30 Jan 2026 09:33:30 -0800
Subject: [PATCH] Added fix for URL Seeder forcing Common Crawl index in case
 of a "sitemap"

---
Review notes (this section is ignored when the patch is applied):
- The Common Crawl index lookup now runs once, after every source has been
  validated, instead of inside the validation loop. An invalid source no
  longer costs a network round-trip before the ValueError is raised.
- The new test asserts instead of printing PASS/FAIL, so it can actually
  fail, and it records calls to the stub so a swallowed exception cannot
  hide a regression. The unused pathlib import is dropped.
- The post-image blob ids on the "index" lines are carried over from the
  original patch and were not recomputed for the revised content.

 crawl4ai/async_url_seeder.py                  | 12 ++++----
 .../test_url_seeder_for_only_sitemap.py       | 36 +++++++++++++++++++
 2 files changed, 43 insertions(+), 5 deletions(-)
 create mode 100644 tests/general/test_url_seeder_for_only_sitemap.py

diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py
index 29fb4b50c..6afbaa285 100644
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -400,18 +400,20 @@ async def urls(self,
         if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
             self.logger.verbose = config.verbose
 
-        # ensure we have the latest CC collection id
-        if self.index_id is None:
-            self.index_id = await self._latest_index()
-
         # Parse source parameter - split by '+' to get list of sources
-        sources = source.split('+')
+        sources = [s.strip().lower() for s in source.split("+") if s.strip()]
+
         valid_sources = {"cc", "sitemap"}
         for s in sources:
             if s not in valid_sources:
                 raise ValueError(
                     f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
 
+        # Resolve the latest CC collection id only when "cc" was requested,
+        # and only after every source has passed validation.
+        if "cc" in sources and self.index_id is None:
+            self.index_id = await self._latest_index()
+
         if hits_per_sec:
             if hits_per_sec <= 0:
                 self._log(
diff --git a/tests/general/test_url_seeder_for_only_sitemap.py b/tests/general/test_url_seeder_for_only_sitemap.py
new file mode 100644
index 000000000..892f3af7c
--- /dev/null
+++ b/tests/general/test_url_seeder_for_only_sitemap.py
@@ -0,0 +1,36 @@
+import asyncio
+
+import httpx
+
+from crawl4ai import AsyncLogger, AsyncUrlSeeder, SeedingConfig
+
+
+async def test_sitemap_source_does_not_hit_commoncrawl():
+    """Check that a sitemap-only seed never resolves the Common Crawl index."""
+    config = SeedingConfig(
+        source="sitemap",
+        live_check=False,
+        extract_head=False,
+        max_urls=50,
+        verbose=True,
+        force=False,
+    )
+
+    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=True)) as seeder:
+        index_lookups = []
+
+        async def boom(*args, **kwargs):
+            # Record the call so the test still fails if the error is swallowed.
+            index_lookups.append((args, kwargs))
+            raise httpx.ConnectTimeout("Simulated CommonCrawl outage")
+
+        seeder._latest_index = boom
+        await seeder.urls("https://docs.crawl4ai.com/", config)
+
+    assert not index_lookups, (
+        "_latest_index was called even though source='sitemap'"
+    )
+
+
+if __name__ == "__main__":
+    asyncio.run(test_sitemap_source_does_not_hit_commoncrawl())