From 65803d1811b3c9160ae1b2d28d43db5ae2c24741 Mon Sep 17 00:00:00 2001
From: samiuc
Date: Mon, 6 Oct 2025 12:00:37 -0700
Subject: [PATCH 1/2] feat: implement retry mechanism for dataset downloads to handle rate limits

Signed-off-by: samiuc
---
 .../dataset_builders/dataset_builder.py      | 77 ++++++++++++++++---
 .../dataset_builders/dpbench_builder.py      |  6 +-
 .../dataset_builders/omnidocbench_builder.py |  7 +-
 pyproject.toml                               |  1 +
 uv.lock                                      | 11 +++
 5 files changed, 91 insertions(+), 11 deletions(-)

diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py
index ae0a10e1..5dd9d851 100644
--- a/docling_eval/dataset_builders/dataset_builder.py
+++ b/docling_eval/dataset_builders/dataset_builder.py
@@ -10,7 +10,15 @@
 from docling.utils.utils import chunkify
 from docling_core.types.doc.document import ImageRefMode
 from huggingface_hub import snapshot_download
+from huggingface_hub.errors import HfHubHTTPError
 from pydantic import BaseModel
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential,
+)
 
 from docling_eval.datamodels.dataset_record import (
     DatasetRecord,
@@ -34,6 +42,7 @@ class HFSource(BaseModel):
     repo_id: str
     revision: Optional[str] = None
     hf_token: Optional[str] = os.getenv("HF_TOKEN", None)
+    max_workers: int = 8  # Cap concurrent downloads to avoid rate limits
 
 
 class S3Source(BaseModel):
@@ -154,6 +163,44 @@ def __init__(
 
         self.must_retrieve = False
 
+    @retry(
+        retry=retry_if_exception_type(HfHubHTTPError),
+        stop=(stop_after_attempt(10) | stop_after_delay(600)),
+        wait=wait_exponential(multiplier=2, min=10, max=120),
+        before_sleep=lambda retry_state: _log.warning(
+            f"Rate limit hit. Retrying in {retry_state.next_action.sleep if retry_state.next_action else 'unknown'} seconds... "
+            f"(Attempt {retry_state.attempt_number}/10)"
+        ),
+    )
+    def _download_with_retry(
+        self, repo_id: str, token: Optional[str], local_dir: Path, max_workers: int
+    ) -> Path:
+        """
+        Download dataset with exponential backoff on rate limit errors.
+
+        Retries up to 10 times, waiting exponentially longer between attempts
+        (each wait clamped to 10-120 seconds); gives up after 10 minutes total.
+        """
+        try:
+            path_str = snapshot_download(
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=token,
+                local_dir=local_dir,
+                max_workers=max_workers,
+            )
+            return Path(path_str)
+        except HfHubHTTPError as e:
+            if e.response.status_code == 429:
+                _log.warning(
+                    f"Rate limit exceeded (429). Will retry with backoff. "
+                    f"Tip: Reduce max_workers (currently {max_workers}) if this persists."
+                )
+                raise  # Re-raise to trigger retry
+            else:
+                _log.error(f"HTTP error downloading dataset: {e}")
+                raise
+
     def retrieve_input_dataset(self) -> Path:
         """
         Download and retrieve the input dataset.
@@ -163,21 +210,33 @@ def retrieve_input_dataset(self) -> Path: """ if isinstance(self.dataset_source, HFSource): if not self.dataset_local_path: - path_str = snapshot_download( - repo_id=self.dataset_source.repo_id, - repo_type="dataset", - token=self.dataset_source.hf_token, + self.dataset_local_path = self.target / "source_data" + if self.dataset_local_path.exists(): + _log.info( + f"Dataset already exists at {self.dataset_local_path}, skipping download" ) - path: Path = Path(path_str) - self.dataset_local_path = path - else: - path_str = snapshot_download( + self.retrieved = True + return self.dataset_local_path + + _log.info(f"Downloading dataset to {self.dataset_local_path}") + try: + path_str = self._download_with_retry( repo_id=self.dataset_source.repo_id, - repo_type="dataset", token=self.dataset_source.hf_token, local_dir=self.dataset_local_path, + max_workers=self.dataset_source.max_workers, ) path = Path(path_str) + except Exception as e: + _log.error(f"Failed to download dataset: {e}") + _log.info("If you encounter rate limit errors, try:") + _log.info("1. Wait a few minutes before retrying") + _log.info( + "2. Set HF_TOKEN environment variable with your HuggingFace token" + ) + _log.info("3. Use a local copy of the dataset if available") + raise + elif isinstance(self.dataset_source, Path): path = self.dataset_source elif isinstance(self.dataset_source, S3Source): diff --git a/docling_eval/dataset_builders/dpbench_builder.py b/docling_eval/dataset_builders/dpbench_builder.py index 1fca4f44..0cdc4996 100644 --- a/docling_eval/dataset_builders/dpbench_builder.py +++ b/docling_eval/dataset_builders/dpbench_builder.py @@ -96,6 +96,7 @@ def __init__( split: str = "test", begin_index: int = 0, end_index: int = -1, + max_workers: int = 8, ): """ Initialize the DPBench dataset builder. @@ -105,10 +106,13 @@ def __init__( split: Dataset split to use begin_index: Start index for processing (inclusive) end_index: End index for processing (exclusive), -1 means process all + max_workers: Number of concurrent downloads (default=8 to avoid rate limits) """ super().__init__( name="DPBench", - dataset_source=HFSource(repo_id="upstage/dp-bench"), + dataset_source=HFSource( + repo_id="upstage/dp-bench", max_workers=max_workers + ), target=target, split=split, begin_index=begin_index, diff --git a/docling_eval/dataset_builders/omnidocbench_builder.py b/docling_eval/dataset_builders/omnidocbench_builder.py index 29f4fb0d..ba5a070d 100644 --- a/docling_eval/dataset_builders/omnidocbench_builder.py +++ b/docling_eval/dataset_builders/omnidocbench_builder.py @@ -96,6 +96,7 @@ def __init__( split: str = "test", begin_index: int = 0, end_index: int = -1, + max_workers: int = 8, ): """ Initialize the OmniDocBench dataset builder. 
@@ -108,7 +109,11 @@ def __init__( """ super().__init__( name="OmniDocBench: end-to-end", - dataset_source=HFSource(repo_id="opendatalab/OmniDocBench"), + dataset_source=HFSource( + repo_id="opendatalab/OmniDocBench", + max_workers=max_workers, + revision="v1_0", + ), target=target, split=split, begin_index=begin_index, diff --git a/pyproject.toml b/pyproject.toml index 7e63af5a..6b301e98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ dependencies = [ "scipy>=1.15.3", "scipy-stubs>=1.15.3.0", "editdistance>=0.8.1", + "tenacity>=9.1.2", ] [project.urls] diff --git a/uv.lock b/uv.lock index c25e0288..9efea509 100644 --- a/uv.lock +++ b/uv.lock @@ -906,6 +906,7 @@ dependencies = [ { name = "scipy-stubs", version = "1.15.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy-stubs", version = "1.16.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "tabulate" }, + { name = "tenacity" }, { name = "torch" }, { name = "torchmetrics" }, { name = "tqdm" }, @@ -985,6 +986,7 @@ requires-dist = [ { name = "scipy", specifier = ">=1.15.3" }, { name = "scipy-stubs", specifier = ">=1.15.3.0" }, { name = "tabulate", specifier = ">=0.9.0,<0.10.0" }, + { name = "tenacity", specifier = ">=9.1.2" }, { name = "torch", specifier = ">=2.5.1,<3.0.0" }, { name = "torchmetrics", specifier = ">=1.6.0,<2.0.0" }, { name = "tqdm", specifier = ">=4.67.1,<5.0.0" }, @@ -5085,6 +5087,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, ] +[[package]] +name = "tenacity" +version = "9.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, +] + [[package]] name = "tifffile" version = "2025.5.10" From 3d65896a14545ea86b63b88ffd5f11be8b0fb000 Mon Sep 17 00:00:00 2001 From: samiuc Date: Tue, 7 Oct 2025 18:38:03 -0700 Subject: [PATCH 2/2] fix type errors Signed-off-by: samiuc --- .pre-commit-config.yaml | 12 ++++++------ docling_eval/dataset_builders/dataset_builder.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 09a67d34..69c828f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,12 +14,12 @@ repos: pass_filenames: false language: system files: '\.py$' - # - id: mypy - # name: MyPy - # entry: uv run --no-sync mypy docling_eval tests docs/examples - # pass_filenames: false - # language: system - # files: '\.py$' + - id: mypy + name: MyPy + entry: uv run --no-sync mypy docling_eval tests docs/examples + pass_filenames: false + language: system + files: '\.py$' - repo: https://github.com/astral-sh/uv-pre-commit rev: 0.7.8 hooks: diff --git 
a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py index 85a58585..aa624e25 100644 --- a/docling_eval/dataset_builders/dataset_builder.py +++ b/docling_eval/dataset_builders/dataset_builder.py @@ -173,7 +173,13 @@ def __init__( ), ) def _download_with_retry( - self, repo_id: str, token: Optional[str], local_dir: Path, max_workers: int + self, + repo_id: str, + repo_type: str, + token: Optional[str], + local_dir: Path, + max_workers: int, + revision: Optional[str] = None, ) -> Path: """ Download dataset with exponential backoff on rate limit errors. @@ -184,7 +190,8 @@ def _download_with_retry( try: path_str = snapshot_download( repo_id=repo_id, - repo_type="dataset", + revision=revision, + repo_type=repo_type, token=token, local_dir=local_dir, max_workers=max_workers,
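
Note: the retry pattern these patches wire into the dataset builders can be
exercised on its own. Below is a minimal, self-contained sketch of the same
approach (the function name download_with_backoff is an illustrative
placeholder, not part of the change; it assumes tenacity and huggingface_hub
are installed):

    import logging
    from pathlib import Path

    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import HfHubHTTPError
    from tenacity import (
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        stop_after_delay,
        wait_exponential,
    )

    _log = logging.getLogger(__name__)

    @retry(
        retry=retry_if_exception_type(HfHubHTTPError),
        # Stop after 10 attempts or 10 minutes of elapsed time, whichever
        # comes first.
        stop=(stop_after_attempt(10) | stop_after_delay(600)),
        # Exponentially growing waits, each clamped to 10-120 seconds.
        wait=wait_exponential(multiplier=2, min=10, max=120),
    )
    def download_with_backoff(
        repo_id: str, local_dir: Path, max_workers: int = 8
    ) -> Path:
        # snapshot_download raises HfHubHTTPError on HTTP failures, including
        # 429 rate limits; the decorator above retries those with backoff.
        return Path(
            snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                local_dir=local_dir,
                max_workers=max_workers,  # fewer workers, fewer parallel requests
            )
        )

    # Example, using the DP-Bench repo referenced in the patch:
    # download_with_backoff("upstage/dp-bench", Path("./source_data"), max_workers=4)

As in the patched _download_with_retry, retry_if_exception_type(HfHubHTTPError)
retries every HfHubHTTPError, not only 429s; the 429 branch in the patch only
changes what is logged before the exception is re-raised.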