From 694ba44a04e512090fc4b1dd0235bf8485fa9ace Mon Sep 17 00:00:00 2001
From: ChiragBellara <chiragbellara7@gmail.com>
Date: Fri, 30 Jan 2026 09:33:30 -0800
Subject: [PATCH] Fix URL seeder fetching the Common Crawl index when the
 source is only "sitemap"

---
 crawl4ai/async_url_seeder.py                  | 12 ++++----
 .../test_url_seeder_for_only_sitemap.py       | 30 +++++++++++++++++++
 2 files changed, 37 insertions(+), 5 deletions(-)
 create mode 100644 tests/general/test_url_seeder_for_only_sitemap.py

diff --git a/crawl4ai/async_url_seeder.py b/crawl4ai/async_url_seeder.py
index 29fb4b50c..6afbaa285 100644
--- a/crawl4ai/async_url_seeder.py
+++ b/crawl4ai/async_url_seeder.py
@@ -400,18 +400,20 @@ async def urls(self,
         if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
             self.logger.verbose = config.verbose
 
-        # ensure we have the latest CC collection id
-        if self.index_id is None:
-            self.index_id = await self._latest_index()
-
         # Parse source parameter - split by '+' to get list of sources
-        sources = source.split('+')
+        sources = [s.strip().lower() for s in source.split("+") if s.strip()]
+
         valid_sources = {"cc", "sitemap"}
         for s in sources:
             if s not in valid_sources:
                 raise ValueError(
                     f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
 
+            # ensure we have the latest CC collection id when the source is cc
+            if s == "cc" and self.index_id is None:
+                self.index_id = await self._latest_index()
+
+
         if hits_per_sec:
             if hits_per_sec <= 0:
                 self._log(
diff --git a/tests/general/test_url_seeder_for_only_sitemap.py b/tests/general/test_url_seeder_for_only_sitemap.py
new file mode 100644
index 000000000..892f3af7c
--- /dev/null
+++ b/tests/general/test_url_seeder_for_only_sitemap.py
@@ -0,0 +1,30 @@
+import asyncio
+from crawl4ai import AsyncLogger, AsyncUrlSeeder, SeedingConfig
+from pathlib import Path
+import httpx
+
+
+async def test_sitemap_source_does_not_hit_commoncrawl():
+    """Check that source="sitemap" never looks up the Common Crawl index."""
+    config = SeedingConfig(
+        source="sitemap",
+        live_check=False,
+        extract_head=False,
+        max_urls=50,
+        verbose=True,
+        force=False,
+    )
+    calls = []
+
+    async def boom(*args, **kwargs):
+        # Record the call, then fail the way an unreachable index would.
+        calls.append(args)
+        raise httpx.ConnectTimeout("Simulated CommonCrawl outage")
+
+    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=True)) as seeder:
+        seeder._latest_index = boom
+        await seeder.urls("https://docs.crawl4ai.com/", config)
+    assert not calls, "_latest_index WAS called even though source='sitemap'."
+
+if __name__ == "__main__":  # allow running this check directly as a script
+    asyncio.run(test_sitemap_source_does_not_hit_commoncrawl())