Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 7 additions & 5 deletions crawl4ai/async_url_seeder.py
Original file line number Diff line number Diff line change
Expand Up @@ -400,18 +400,20 @@ async def urls(self,
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
self.logger.verbose = config.verbose

# ensure we have the latest CC collection id
if self.index_id is None:
self.index_id = await self._latest_index()

# Parse source parameter - split by '+' to get list of sources
sources = source.split('+')
sources = [s.strip().lower() for s in source.split("+") if s.strip()]

valid_sources = {"cc", "sitemap"}
for s in sources:
if s not in valid_sources:
raise ValueError(
f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")

# ensure we have the latest CC collection id when the source is cc
if s == "cc" and self.index_id is None:
self.index_id = await self._latest_index()


if hits_per_sec:
if hits_per_sec <= 0:
self._log(
Expand Down
30 changes: 30 additions & 0 deletions tests/general/test_url_seeder_for_only_sitemap.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import asyncio
from crawl4ai import AsyncLogger, AsyncUrlSeeder, SeedingConfig
from pathlib import Path
import httpx


async def test_sitemap_source_does_not_hit_commoncrawl():
    """Regression test: ``source="sitemap"`` must never contact Common Crawl.

    Simulates a Common Crawl outage by monkey-patching the seeder's
    ``_latest_index`` to raise ``httpx.ConnectTimeout``. If ``urls()`` still
    resolves the CC index for a sitemap-only run, that exception surfaces and
    the test fails.

    Raises:
        AssertionError: if ``_latest_index`` was called despite
            ``source="sitemap"``.
    """
    config = SeedingConfig(
        source="sitemap",
        live_check=False,
        extract_head=False,
        max_urls=50,
        verbose=True,
        force=False,
    )

    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=True)) as seeder:
        async def boom(*args, **kwargs):
            # Any call here means the CC index was fetched despite source="sitemap".
            print("DEBUG: _latest_index called")
            raise httpx.ConnectTimeout("Simulated CommonCrawl outage")

        seeder._latest_index = boom
        try:
            await seeder.urls("https://docs.crawl4ai.com/", config)
        except httpx.ConnectTimeout:
            # Previously this path only printed "FAIL" and returned normally,
            # so the test could never actually fail under pytest. Raise instead;
            # `from None` drops the simulated-outage traceback noise.
            raise AssertionError(
                "_latest_index WAS called even though source='sitemap'."
            ) from None
        print("PASS: _latest_index was NOT called (expected after fix).")

def _main() -> None:
    """Entry point so the regression check can be run as a plain script."""
    asyncio.run(test_sitemap_source_does_not_hit_commoncrawl())


if __name__ == "__main__":
    _main()