From 65803d1811b3c9160ae1b2d28d43db5ae2c24741 Mon Sep 17 00:00:00 2001
From: samiuc
Date: Mon, 6 Oct 2025 12:00:37 -0700
Subject: [PATCH 1/2] feat: implement retry mechanism for dataset downloads to handle rate limits

Signed-off-by: samiuc
---
 .../dataset_builders/dataset_builder.py      | 77 ++++++++++++++++---
 .../dataset_builders/dpbench_builder.py      |  6 +-
 .../dataset_builders/omnidocbench_builder.py |  7 +-
 pyproject.toml                               |  1 +
 uv.lock                                      | 11 +++
 5 files changed, 91 insertions(+), 11 deletions(-)

diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py
index ae0a10e1..5dd9d851 100644
--- a/docling_eval/dataset_builders/dataset_builder.py
+++ b/docling_eval/dataset_builders/dataset_builder.py
@@ -10,7 +10,15 @@
 from docling.utils.utils import chunkify
 from docling_core.types.doc.document import ImageRefMode
 from huggingface_hub import snapshot_download
+from huggingface_hub.errors import HfHubHTTPError
 from pydantic import BaseModel
+from tenacity import (
+    retry,
+    retry_if_exception_type,
+    stop_after_attempt,
+    stop_after_delay,
+    wait_exponential,
+)
 
 from docling_eval.datamodels.dataset_record import (
     DatasetRecord,
@@ -34,6 +42,7 @@ class HFSource(BaseModel):
     repo_id: str
     revision: Optional[str] = None
     hf_token: Optional[str] = os.getenv("HF_TOKEN", None)
+    max_workers: int = 8  # Cap concurrent downloads to avoid rate limits
 
 
 class S3Source(BaseModel):
@@ -154,6 +163,44 @@ def __init__(
 
         self.must_retrieve = False
 
+    @retry(
+        retry=retry_if_exception_type(HfHubHTTPError),
+        stop=(stop_after_attempt(10) | stop_after_delay(600)),
+        wait=wait_exponential(multiplier=2, min=10, max=120),
+        before_sleep=lambda retry_state: _log.warning(
+            f"Rate limit hit. Retrying in {retry_state.next_action.sleep if retry_state.next_action else 'unknown'} seconds... "
+            f"(Attempt {retry_state.attempt_number}/10)"
+        ),
+    )
+    def _download_with_retry(
+        self, repo_id: str, token: Optional[str], local_dir: Path, max_workers: int
+    ) -> Path:
+        """
+        Download dataset with exponential backoff on rate limit errors.
+
+        Retries up to 10 times, waiting exponentially longer between attempts
+        (each wait clamped to 10-120 seconds); gives up after 10 minutes total.
+        """
+        try:
+            path_str = snapshot_download(
+                repo_id=repo_id,
+                repo_type="dataset",
+                token=token,
+                local_dir=local_dir,
+                max_workers=max_workers,
+            )
+            return Path(path_str)
+        except HfHubHTTPError as e:
+            if e.response.status_code == 429:
+                _log.warning(
+                    f"Rate limit exceeded (429). Will retry with backoff. "
+                    f"Tip: Reduce max_workers (currently {max_workers}) if this persists."
+                )
+                raise  # Re-raise to trigger retry
+            else:
+                _log.error(f"HTTP error downloading dataset: {e}")
+                raise
+
     def retrieve_input_dataset(self) -> Path:
         """
         Download and retrieve the input dataset.
@@ -163,21 +210,33 @@ def retrieve_input_dataset(self) -> Path: """ if isinstance(self.dataset_source, HFSource): if not self.dataset_local_path: - path_str = snapshot_download( - repo_id=self.dataset_source.repo_id, - repo_type="dataset", - token=self.dataset_source.hf_token, + self.dataset_local_path = self.target / "source_data" + if self.dataset_local_path.exists(): + _log.info( + f"Dataset already exists at {self.dataset_local_path}, skipping download" ) - path: Path = Path(path_str) - self.dataset_local_path = path - else: - path_str = snapshot_download( + self.retrieved = True + return self.dataset_local_path + + _log.info(f"Downloading dataset to {self.dataset_local_path}") + try: + path_str = self._download_with_retry( repo_id=self.dataset_source.repo_id, - repo_type="dataset", token=self.dataset_source.hf_token, local_dir=self.dataset_local_path, + max_workers=self.dataset_source.max_workers, ) path = Path(path_str) + except Exception as e: + _log.error(f"Failed to download dataset: {e}") + _log.info("If you encounter rate limit errors, try:") + _log.info("1. Wait a few minutes before retrying") + _log.info( + "2. Set HF_TOKEN environment variable with your HuggingFace token" + ) + _log.info("3. Use a local copy of the dataset if available") + raise + elif isinstance(self.dataset_source, Path): path = self.dataset_source elif isinstance(self.dataset_source, S3Source): diff --git a/docling_eval/dataset_builders/dpbench_builder.py b/docling_eval/dataset_builders/dpbench_builder.py index 1fca4f44..0cdc4996 100644 --- a/docling_eval/dataset_builders/dpbench_builder.py +++ b/docling_eval/dataset_builders/dpbench_builder.py @@ -96,6 +96,7 @@ def __init__( split: str = "test", begin_index: int = 0, end_index: int = -1, + max_workers: int = 8, ): """ Initialize the DPBench dataset builder. @@ -105,10 +106,13 @@ def __init__( split: Dataset split to use begin_index: Start index for processing (inclusive) end_index: End index for processing (exclusive), -1 means process all + max_workers: Number of concurrent downloads (default=8 to avoid rate limits) """ super().__init__( name="DPBench", - dataset_source=HFSource(repo_id="upstage/dp-bench"), + dataset_source=HFSource( + repo_id="upstage/dp-bench", max_workers=max_workers + ), target=target, split=split, begin_index=begin_index, diff --git a/docling_eval/dataset_builders/omnidocbench_builder.py b/docling_eval/dataset_builders/omnidocbench_builder.py index 29f4fb0d..ba5a070d 100644 --- a/docling_eval/dataset_builders/omnidocbench_builder.py +++ b/docling_eval/dataset_builders/omnidocbench_builder.py @@ -96,6 +96,7 @@ def __init__( split: str = "test", begin_index: int = 0, end_index: int = -1, + max_workers: int = 8, ): """ Initialize the OmniDocBench dataset builder. 
@@ -108,7 +109,11 @@ def __init__( """ super().__init__( name="OmniDocBench: end-to-end", - dataset_source=HFSource(repo_id="opendatalab/OmniDocBench"), + dataset_source=HFSource( + repo_id="opendatalab/OmniDocBench", + max_workers=max_workers, + revision="v1_0", + ), target=target, split=split, begin_index=begin_index, diff --git a/pyproject.toml b/pyproject.toml index 7e63af5a..6b301e98 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,7 @@ dependencies = [ "scipy>=1.15.3", "scipy-stubs>=1.15.3.0", "editdistance>=0.8.1", + "tenacity>=9.1.2", ] [project.urls] diff --git a/uv.lock b/uv.lock index c25e0288..9efea509 100644 --- a/uv.lock +++ b/uv.lock @@ -906,6 +906,7 @@ dependencies = [ { name = "scipy-stubs", version = "1.15.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy-stubs", version = "1.16.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "tabulate" }, + { name = "tenacity" }, { name = "torch" }, { name = "torchmetrics" }, { name = "tqdm" }, @@ -985,6 +986,7 @@ requires-dist = [ { name = "scipy", specifier = ">=1.15.3" }, { name = "scipy-stubs", specifier = ">=1.15.3.0" }, { name = "tabulate", specifier = ">=0.9.0,<0.10.0" }, + { name = "tenacity", specifier = ">=9.1.2" }, { name = "torch", specifier = ">=2.5.1,<3.0.0" }, { name = "torchmetrics", specifier = ">=1.6.0,<2.0.0" }, { name = "tqdm", specifier = ">=4.67.1,<5.0.0" }, @@ -5085,6 +5087,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, ] +[[package]] +name = "tenacity" +version = "9.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, +] + [[package]] name = "tifffile" version = "2025.5.10" From 3d65896a14545ea86b63b88ffd5f11be8b0fb000 Mon Sep 17 00:00:00 2001 From: samiuc Date: Tue, 7 Oct 2025 18:38:03 -0700 Subject: [PATCH 2/2] fix type errors Signed-off-by: samiuc --- .pre-commit-config.yaml | 12 ++++++------ docling_eval/dataset_builders/dataset_builder.py | 11 +++++++++-- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 09a67d34..69c828f9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,12 +14,12 @@ repos: pass_filenames: false language: system files: '\.py$' - # - id: mypy - # name: MyPy - # entry: uv run --no-sync mypy docling_eval tests docs/examples - # pass_filenames: false - # language: system - # files: '\.py$' + - id: mypy + name: MyPy + entry: uv run --no-sync mypy docling_eval tests docs/examples + pass_filenames: false + language: system + files: '\.py$' - repo: https://github.com/astral-sh/uv-pre-commit rev: 0.7.8 hooks: diff --git 
a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py index 85a58585..aa624e25 100644 --- a/docling_eval/dataset_builders/dataset_builder.py +++ b/docling_eval/dataset_builders/dataset_builder.py @@ -173,7 +173,13 @@ def __init__( ), ) def _download_with_retry( - self, repo_id: str, token: Optional[str], local_dir: Path, max_workers: int + self, + repo_id: str, + repo_type: str, + token: Optional[str], + local_dir: Path, + max_workers: int, + revision: Optional[str] = None, ) -> Path: """ Download dataset with exponential backoff on rate limit errors. @@ -184,7 +190,8 @@ def _download_with_retry( try: path_str = snapshot_download( repo_id=repo_id, - repo_type="dataset", + revision=revision, + repo_type=repo_type, token=token, local_dir=local_dir, max_workers=max_workers,
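
Note: the retry pattern these patches wire into the dataset builders can be
exercised on its own. Below is a minimal, self-contained sketch of the same
approach (the function name download_with_backoff is an illustrative
placeholder, not part of the change; it assumes tenacity and huggingface_hub
are installed):

    import logging
    from pathlib import Path

    from huggingface_hub import snapshot_download
    from huggingface_hub.errors import HfHubHTTPError
    from tenacity import (
        retry,
        retry_if_exception_type,
        stop_after_attempt,
        stop_after_delay,
        wait_exponential,
    )

    _log = logging.getLogger(__name__)

    @retry(
        retry=retry_if_exception_type(HfHubHTTPError),
        # Stop after 10 attempts or 10 minutes of elapsed time, whichever
        # comes first.
        stop=(stop_after_attempt(10) | stop_after_delay(600)),
        # Exponentially growing waits, each clamped to 10-120 seconds.
        wait=wait_exponential(multiplier=2, min=10, max=120),
    )
    def download_with_backoff(
        repo_id: str, local_dir: Path, max_workers: int = 8
    ) -> Path:
        # snapshot_download raises HfHubHTTPError on HTTP failures, including
        # 429 rate limits; the decorator above retries those with backoff.
        return Path(
            snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                local_dir=local_dir,
                max_workers=max_workers,  # fewer workers, fewer parallel requests
            )
        )

    # Example, using the DP-Bench repo referenced in the patch:
    # download_with_backoff("upstage/dp-bench", Path("./source_data"), max_workers=4)

As in the patched _download_with_retry, retry_if_exception_type(HfHubHTTPError)
retries every HfHubHTTPError, not only 429s; the 429 branch in the patch only
changes what is logged before the exception is re-raised.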