84 changes: 75 additions & 9 deletions docling_eval/dataset_builders/dataset_builder.py
@@ -10,7 +10,15 @@
from docling.utils.utils import chunkify
from docling_core.types.doc.document import ImageRefMode
from huggingface_hub import snapshot_download
+from huggingface_hub.errors import HfHubHTTPError
from pydantic import BaseModel
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential,
+)

from docling_eval.datamodels.dataset_record import (
    DatasetRecord,
@@ -34,6 +42,7 @@ class HFSource(BaseModel):
    repo_id: str
    revision: Optional[str] = None
    hf_token: Optional[str] = os.getenv("HF_TOKEN", None)
+    max_workers: int = 8  # Reduce concurrent downloads to avoid rate limits


class S3Source(BaseModel):
@@ -154,6 +163,51 @@ def __init__(

        self.must_retrieve = False

+    @retry(
+        retry=retry_if_exception_type(HfHubHTTPError),
+        stop=(stop_after_attempt(10) | stop_after_delay(600)),
+        wait=wait_exponential(multiplier=2, min=10, max=120),
+        before_sleep=lambda retry_state: _log.warning(
+            f"Rate limit hit. Retrying in {retry_state.next_action.sleep if retry_state.next_action else 'unknown'} seconds... "
+            f"(Attempt {retry_state.attempt_number}/10)"
+        ),
+    )
+    def _download_with_retry(
+        self,
+        repo_id: str,
+        repo_type: str,
+        token: Optional[str],
+        local_dir: Path,
+        max_workers: int,
+        revision: Optional[str] = None,
+    ) -> Path:
+        """
+        Download a dataset snapshot with exponential backoff on rate-limit errors.
+
+        Retries up to 10 times, with waits that grow exponentially and are
+        clamped between 10s and 120s. Stops retrying after 10 minutes of
+        total elapsed time.
+        """
+        try:
+            path_str = snapshot_download(
+                repo_id=repo_id,
+                revision=revision,
+                repo_type=repo_type,
+                token=token,
+                local_dir=local_dir,
+                max_workers=max_workers,
+            )
+            return Path(path_str)
+        except HfHubHTTPError as e:
+            if e.response.status_code == 429:
+                _log.warning(
+                    f"Rate limit exceeded (429). Will retry with backoff. "
+                    f"Tip: Reduce max_workers (currently {max_workers}) if this persists."
+                )
+                raise  # Re-raise to trigger the retry decorator
+            else:
+                _log.error(f"HTTP error downloading dataset: {e}")
+                raise
+
    def retrieve_input_dataset(self) -> Path:
        """
        Download and retrieve the input dataset.
@@ -163,23 +217,35 @@
"""
if isinstance(self.dataset_source, HFSource):
if not self.dataset_local_path:
path_str = snapshot_download(
repo_id=self.dataset_source.repo_id,
revision=self.dataset_source.revision,
repo_type="dataset",
token=self.dataset_source.hf_token,
self.dataset_local_path = self.target / "source_data"
if self.dataset_local_path.exists():
_log.info(
f"Dataset already exists at {self.dataset_local_path}, skipping download"
)
path: Path = Path(path_str)
self.dataset_local_path = path
else:
path_str = snapshot_download(
self.retrieved = True
return self.dataset_local_path

_log.info(f"Downloading dataset to {self.dataset_local_path}")
try:
path_str = self._download_with_retry(
repo_id=self.dataset_source.repo_id,
revision=self.dataset_source.revision,
repo_type="dataset",
token=self.dataset_source.hf_token,
local_dir=self.dataset_local_path,
max_workers=self.dataset_source.max_workers,
)
path = Path(path_str)
except Exception as e:
_log.error(f"Failed to download dataset: {e}")
_log.info("If you encounter rate limit errors, try:")
_log.info("1. Wait a few minutes before retrying")
_log.info(
"2. Set HF_TOKEN environment variable with your HuggingFace token"
)
_log.info("3. Use a local copy of the dataset if available")
raise

elif isinstance(self.dataset_source, Path):
path = self.dataset_source
elif isinstance(self.dataset_source, S3Source):
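The retry policy above composes tenacity stop conditions with "|" and clamps the exponential wait between a floor and a ceiling. Below is a minimal, standalone sketch of the same pattern against a deliberately flaky function, with much shorter limits so it runs quickly; FlakyError and fetch are illustrative stand-ins, not docling-eval or huggingface_hub APIs.

from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    stop_after_delay,
    wait_exponential,
)


class FlakyError(Exception):
    """Stand-in for HfHubHTTPError."""


attempts = {"n": 0}


@retry(
    retry=retry_if_exception_type(FlakyError),
    # Stop conditions compose with "|": give up after 3 attempts OR 5s of
    # total elapsed time, whichever comes first (the real code uses 10/600s).
    stop=(stop_after_attempt(3) | stop_after_delay(5)),
    # Exponential growth clamped to [0.1s, 1s]; the real code clamps to [10s, 120s].
    wait=wait_exponential(multiplier=0.1, min=0.1, max=1),
)
def fetch() -> str:
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise FlakyError("simulated HTTP 429")
    return "ok"


print(fetch())  # fails twice, backs off, then succeeds on the third attempt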
6 changes: 5 additions & 1 deletion docling_eval/dataset_builders/dpbench_builder.py
@@ -96,6 +96,7 @@ def __init__(
        split: str = "test",
        begin_index: int = 0,
        end_index: int = -1,
+        max_workers: int = 8,
    ):
        """
        Initialize the DPBench dataset builder.
@@ -105,10 +106,13 @@
            split: Dataset split to use
            begin_index: Start index for processing (inclusive)
            end_index: End index for processing (exclusive), -1 means process all
+            max_workers: Number of concurrent downloads (default=8 to avoid rate limits)
        """
        super().__init__(
            name="DPBench",
-            dataset_source=HFSource(repo_id="upstage/dp-bench"),
+            dataset_source=HFSource(
+                repo_id="upstage/dp-bench", max_workers=max_workers
+            ),
            target=target,
            split=split,
            begin_index=begin_index,
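Hypothetical usage of the new parameter; the class name DPBenchDatasetBuilder and its import path are assumptions inferred from this diff, not verified against the repository.

from pathlib import Path

from docling_eval.dataset_builders.dpbench_builder import DPBenchDatasetBuilder

builder = DPBenchDatasetBuilder(
    target=Path("./benchmarks/dpbench"),
    max_workers=2,  # lower concurrency further if HF keeps returning 429s
)
builder.retrieve_input_dataset()  # downloads with exponential backoff on 429s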
5 changes: 4 additions & 1 deletion docling_eval/dataset_builders/omnidocbench_builder.py
@@ -96,6 +96,7 @@ def __init__(
        split: str = "test",
        begin_index: int = 0,
        end_index: int = -1,
+        max_workers: int = 8,
    ):
        """
        Initialize the OmniDocBench dataset builder.
@@ -109,7 +110,9 @@
        super().__init__(
            name="OmniDocBench: end-to-end",
            dataset_source=HFSource(
-                repo_id="opendatalab/OmniDocBench", revision="v1_0"
+                repo_id="opendatalab/OmniDocBench",
+                max_workers=max_workers,
+                revision="v1_0",
            ),
            target=target,
            split=split,
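For reference, a self-contained sketch of the updated HFSource model exactly as shown in dataset_builder.py above, illustrating how both builders thread max_workers through to it; the final two lines are only a demonstration.

import os
from typing import Optional

from pydantic import BaseModel


class HFSource(BaseModel):
    repo_id: str
    revision: Optional[str] = None
    hf_token: Optional[str] = os.getenv("HF_TOKEN", None)
    max_workers: int = 8  # reduce concurrent downloads to avoid rate limits


src = HFSource(repo_id="opendatalab/OmniDocBench", max_workers=4, revision="v1_0")
print(src.max_workers)  # 4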
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -53,7 +53,8 @@ dependencies = [
"scipy>=1.15.3",
"scipy-stubs>=1.15.3.0",
"editdistance>=0.8.1",
"reportlab>=4.4.3",
"tenacity>=9.1.2",
"reportlab>=4.4.3"
]

[project.urls]
17 changes: 17 additions & 0 deletions uv.lock

Some generated files (uv.lock) are not rendered by default.