diff --git a/docling_eval/dataset_builders/dataset_builder.py b/docling_eval/dataset_builders/dataset_builder.py index c276d4ac..aa624e25 100644 --- a/docling_eval/dataset_builders/dataset_builder.py +++ b/docling_eval/dataset_builders/dataset_builder.py @@ -10,7 +10,15 @@ from docling.utils.utils import chunkify from docling_core.types.doc.document import ImageRefMode from huggingface_hub import snapshot_download +from huggingface_hub.errors import HfHubHTTPError from pydantic import BaseModel +from tenacity import ( + retry, + retry_if_exception, + stop_after_attempt, + stop_after_delay, + wait_exponential, +) from docling_eval.datamodels.dataset_record import ( DatasetRecord, @@ -34,6 +42,7 @@ class HFSource(BaseModel): repo_id: str revision: Optional[str] = None hf_token: Optional[str] = os.getenv("HF_TOKEN", None) + max_workers: int = 8 # Concurrent download workers; lower this if you hit rate limits class S3Source(BaseModel): @@ -154,6 +163,51 @@ def __init__( self.must_retrieve = False + @retry( + retry=retry_if_exception(lambda exc: isinstance(exc, HfHubHTTPError) and getattr(exc.response, "status_code", None) == 429), # retry only on HTTP 429; other HTTP errors (401/403/404, ...) fail fast + stop=(stop_after_attempt(10) | stop_after_delay(600)), + wait=wait_exponential(multiplier=2, min=10, max=120), + before_sleep=lambda retry_state: _log.warning( + f"Rate limit hit. Retrying in {retry_state.next_action.sleep if retry_state.next_action else 'unknown'} seconds... " + f"(Attempt {retry_state.attempt_number}/10)" + ), + ) + def _download_with_retry( + self, + repo_id: str, + repo_type: str, + token: Optional[str], + local_dir: Path, + max_workers: int, + revision: Optional[str] = None, + ) -> Path: + """ + Download dataset, retrying with exponential backoff on rate limit (HTTP 429) errors. + + Makes up to 10 attempts, sleeping 10s, 10s, 10s, 16s, 32s, 64s, then 120s between them (2 * 2**(attempt - 1), clamped to 10-120s). + Gives up earlier once 10 minutes have elapsed in total.
+ """ + try: + path_str = snapshot_download( + repo_id=repo_id, + revision=revision, + repo_type=repo_type, + token=token, + local_dir=local_dir, + max_workers=max_workers, + ) + return Path(path_str) + except HfHubHTTPError as e: + if e.response.status_code == 429: + _log.warning( + f"Rate limit exceeded (429). Will retry with backoff. " + f"Tip: Reduce max_workers (currently {max_workers}) if this persists." + ) + raise # Re-raise to trigger retry + else: + _log.error(f"HTTP error downloading dataset: {e}") + raise + def retrieve_input_dataset(self) -> Path: """ Download and retrieve the input dataset. @@ -163,23 +217,35 @@ def retrieve_input_dataset(self) -> Path: """ if isinstance(self.dataset_source, HFSource): if not self.dataset_local_path: - path_str = snapshot_download( - repo_id=self.dataset_source.repo_id, - revision=self.dataset_source.revision, - repo_type="dataset", - token=self.dataset_source.hf_token, + self.dataset_local_path = self.target / "source_data" + if self.dataset_local_path.exists(): + _log.info( + f"Dataset already exists at {self.dataset_local_path}, skipping download" ) - path: Path = Path(path_str) - self.dataset_local_path = path - else: - path_str = snapshot_download( + self.retrieved = True + return self.dataset_local_path + + _log.info(f"Downloading dataset to {self.dataset_local_path}") + try: + path_str = self._download_with_retry( repo_id=self.dataset_source.repo_id, revision=self.dataset_source.revision, repo_type="dataset", token=self.dataset_source.hf_token, local_dir=self.dataset_local_path, + max_workers=self.dataset_source.max_workers, ) path = Path(path_str) + except Exception as e: + _log.error(f"Failed to download dataset: {e}") + _log.info("If you encounter rate limit errors, try:") + _log.info("1. Wait a few minutes before retrying") + _log.info( + "2. Set HF_TOKEN environment variable with your HuggingFace token" + ) + _log.info("3. 
Use a local copy of the dataset if available") + raise + elif isinstance(self.dataset_source, Path): path = self.dataset_source elif isinstance(self.dataset_source, S3Source): diff --git a/docling_eval/dataset_builders/dpbench_builder.py b/docling_eval/dataset_builders/dpbench_builder.py index 1fca4f44..0cdc4996 100644 --- a/docling_eval/dataset_builders/dpbench_builder.py +++ b/docling_eval/dataset_builders/dpbench_builder.py @@ -96,6 +96,7 @@ def __init__( split: str = "test", begin_index: int = 0, end_index: int = -1, + max_workers: int = 8, ): """ Initialize the DPBench dataset builder. @@ -105,10 +106,13 @@ def __init__( split: Dataset split to use begin_index: Start index for processing (inclusive) end_index: End index for processing (exclusive), -1 means process all + max_workers: Number of concurrent downloads (default=8 to avoid rate limits) """ super().__init__( name="DPBench", - dataset_source=HFSource(repo_id="upstage/dp-bench"), + dataset_source=HFSource( + repo_id="upstage/dp-bench", max_workers=max_workers + ), target=target, split=split, begin_index=begin_index, diff --git a/docling_eval/dataset_builders/omnidocbench_builder.py b/docling_eval/dataset_builders/omnidocbench_builder.py index 68d0d9a7..ba5a070d 100644 --- a/docling_eval/dataset_builders/omnidocbench_builder.py +++ b/docling_eval/dataset_builders/omnidocbench_builder.py @@ -96,6 +96,7 @@ def __init__( split: str = "test", begin_index: int = 0, end_index: int = -1, + max_workers: int = 8, ): """ Initialize the OmniDocBench dataset builder. 
@@ -109,7 +110,9 @@ def __init__( super().__init__( name="OmniDocBench: end-to-end", dataset_source=HFSource( - repo_id="opendatalab/OmniDocBench", revision="v1_0" + repo_id="opendatalab/OmniDocBench", + max_workers=max_workers, + revision="v1_0", ), target=target, split=split, diff --git a/pyproject.toml b/pyproject.toml index a0480d5a..e8688a5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,8 @@ dependencies = [ "scipy>=1.15.3", "scipy-stubs>=1.15.3.0", "editdistance>=0.8.1", - "reportlab>=4.4.3", + "tenacity>=9.1.2", + "reportlab>=4.4.3" ] [project.urls] diff --git a/uv.lock b/uv.lock index 7fc76d87..f25090b5 100644 --- a/uv.lock +++ b/uv.lock @@ -1187,6 +1187,7 @@ dependencies = [ { name = "scipy-stubs", version = "1.15.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, { name = "scipy-stubs", version = "1.16.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, { name = "tabulate" }, + { name = "tenacity" }, { name = "torch", version = "2.7.1", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine != 'arm64' or sys_platform != 'darwin'" }, { name = "torch", version = "2.8.0", source = { registry = "https://pypi.org/simple" }, marker = "platform_machine == 'arm64' and sys_platform == 'darwin'" }, { name = "torchmetrics" }, @@ -1267,6 +1268,7 @@ requires-dist = [ { name = "scipy", specifier = ">=1.15.3" }, { name = "scipy-stubs", specifier = ">=1.15.3.0" }, { name = "tabulate", specifier = ">=0.9.0,<0.10.0" }, + { name = "tenacity", specifier = ">=9.1.2" }, { name = "torch", specifier = ">=2.5.1,<3.0.0" }, { name = "torchmetrics", specifier = ">=1.6.0,<2.0.0" }, { name = "tqdm", specifier = ">=4.67.1,<5.0.0" }, @@ -4179,6 +4181,9 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" }, { url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" }, { url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" }, + { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" }, + { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" }, + { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" }, { url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = 
"2025-07-27T13:03:41.637Z" }, { url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" }, { url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" }, @@ -4200,6 +4205,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" }, { url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" }, { url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" }, + { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = 
"2025-07-27T13:04:39.189Z" }, + { url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" }, { url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" }, { url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" }, { url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" }, @@ -4963,6 +4970,7 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/3b/b1/5e21d0b517434b7f33588ff76c177c5a167858cc38ef740608898cd329f2/pyzmq-27.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e829529fcaa09937189178115c49c504e69289abd39967cd8a4c215761373394", size = 1894220, upload-time = "2025-09-08T23:07:57.172Z" }, { url = "https://files.pythonhosted.org/packages/f8/9b/c108cdb55560eaf253f0cbdb61b29971e9fb34d9c3499b0e96e4e60ed8a5/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43ad9a73e3da1fab5b0e7e13402f0b2fb934ae1c876c51d0afff0e7c052eca31", size = 
840995, upload-time = "2025-09-08T23:08:08.396Z" }, { url = "https://files.pythonhosted.org/packages/46/bd/2d45ad24f5f5ae7e8d01525eb76786fa7557136555cac7d929880519e33a/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f30f395a9e6fbca195400ce833c731e7b64c3919aa481af4d88c3759e0cb7496", size = 1878550, upload-time = "2025-09-08T23:08:13.513Z" }, + { url = "https://files.pythonhosted.org/packages/60/cb/84a13459c51da6cec1b7b1dc1a47e6db6da50b77ad7fd9c145842750a011/pyzmq-27.1.0-cp313-cp313-android_24_arm64_v8a.whl", hash = "sha256:93ad4b0855a664229559e45c8d23797ceac03183c7b6f5b4428152a6b06684a5", size = 1122436, upload-time = "2025-09-08T23:08:20.801Z" }, { url = "https://files.pythonhosted.org/packages/3e/cd/9822a7af117f4bc0f1952dbe9ef8358eb50a24928efd5edf54210b850259/pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f3afa12c392f0a44a2414056d730eebc33ec0926aae92b5ad5cf26ebb6cc128", size = 847961, upload-time = "2025-09-08T23:08:29.672Z" }, { url = "https://files.pythonhosted.org/packages/d9/94/2da0a60841f757481e402b34bf4c8bf57fa54a5466b965de791b1e6f747d/pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:add071b2d25f84e8189aaf0882d39a285b42fa3853016ebab234a5e78c7a43db", size = 1885394, upload-time = "2025-09-08T23:08:35.51Z" }, { url = "https://files.pythonhosted.org/packages/f5/d2/5f36552c2d3e5685abe60dfa56f91169f7a2d99bbaf67c5271022ab40863/pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01c0e07d558b06a60773744ea6251f769cd79a41a97d11b8bf4ab8f034b0424d", size = 847929, upload-time = "2025-09-08T23:08:49.76Z" }, @@ -6059,6 +6067,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, ] +[[package]] +name = 
"tenacity" +version = "9.1.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/0a/d4/2b0cd0fe285e14b36db076e78c93766ff1d529d70408bd1d2a5a84f1d929/tenacity-9.1.2.tar.gz", hash = "sha256:1169d376c297e7de388d18b4481760d478b0e99a777cad3a9c86e556f4b697cb", size = 48036, upload-time = "2025-04-02T08:25:09.966Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e5/30/643397144bfbfec6f6ef821f36f33e57d35946c44a2352d3c9f0ae847619/tenacity-9.1.2-py3-none-any.whl", hash = "sha256:f77bf36710d8b73a50b2dd155c97b870017ad21afe6ab300326b0371b3b05138", size = 28248, upload-time = "2025-04-02T08:25:07.678Z" }, +] + [[package]] name = "tifffile" version = "2025.5.10"