diff --git a/.gitignore b/.gitignore index d382e17..9dfc18f 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ node_modules my-docs Output self_use.py +.history/ +tmp/ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index 121f368..328bdbe 100644 --- a/README.md +++ b/README.md @@ -104,6 +104,8 @@ success, failed, flag = client.pdf2file( pdf_file="tests/pdf", output_path="./Output", output_format="docx", + model="v3-2026", # optional, default is server-side v2 + formula_level=1, # optional: 0(default/recommended)=keep formulas; 1=inline formulas -> text; 2=all formulas (inline+block) -> text ) print(success) print(failed) @@ -127,4 +129,87 @@ print(failed) print(flag) ``` +### V3 JSON updates + +When `model="v3-2026"`: + +- `output_format="json"` now saves the raw Doc2X v3 JSON (`result.pages...`) instead of the legacy simplified `[{text, location}]` structure. +- Raw v3 JSON is always saved as a sidecar `.json` file, even when `output_format` does not include `json` (for example `text`, `detailed`, `md`, `docx`). +- If `output_format` includes `json`, the sidecar JSON name follows the `json` slot in `output_names`. +- If `output_format` does not include `json`, the sidecar JSON name follows the first non-empty entry in `output_names`. +- If `output_names` is omitted, the sidecar JSON falls back to the original PDF basename. +- Deprecated direct upload is no longer used. `oss_choose="always"` and `oss_choose="auto"` both use the preupload API. `oss_choose="never"` / `oss_choose="none"` now raises an error. 
+ +Example: + +```python +from pdfdeal import Doc2X + +client = Doc2X(apikey="Your API key", debug=True) +success, failed, flag = client.pdf2file( + pdf_file="tests/pdf/sample.pdf", + output_path="./Output/test/v3", + output_format="text,json", + output_names=[["plain.txt", "viz.data"]], + model="v3-2026", +) +print(success) # ["page text...", "./Output/test/v3/viz.json"] +print(failed) +print(flag) +``` + +### Helper scripts for v3 figure/table crops + +Two helper scripts were added under [`scripts/`](scripts/): + +- [`extract_v3_figures.py`](scripts/extract_v3_figures.py): extract figure crops from a PDF using Doc2X v3 JSON +- [`extract_v3_tables.py`](scripts/extract_v3_tables.py): extract table crops from a PDF using Doc2X v3 JSON + +Both scripts: + +- validate that the v3 JSON matches the crop rules first +- render only pages containing target blocks with `fitz` at the requested `dpi` +- save full-page PNGs under `_pages/` +- crop target regions using the block `bbox/xyxy` and page coordinates from the v3 JSON +- write `manifest.json` with crop metadata + +Examples: + +```bash +python scripts/extract_v3_figures.py \ + --pdf /path/to/input.pdf \ + --v3-json /path/to/input_v3.json \ + --dpi 200 \ + --output-dir ./Output/figures +``` + +```bash +python scripts/extract_v3_tables.py \ + --pdf /path/to/input.pdf \ + --v3-json /path/to/input_v3.json \ + --dpi 200 \ + --output-dir ./Output/tables +``` + +You can also import the helpers directly: + +```python +from pdfdeal import extract_v3_figure_images, extract_v3_table_images + +figure_summary = extract_v3_figure_images( + pdf_path="/path/to/input.pdf", + v3_json_path="/path/to/input_v3.json", + dpi=200, + output_dir="./Output/figures", +) +table_summary = extract_v3_table_images( + pdf_path="/path/to/input.pdf", + v3_json_path="/path/to/input_v3.json", + dpi=200, + output_dir="./Output/tables", +) 
+print(figure_summary["crop_count"], figure_summary["manifest_path"]) +print(table_summary["crop_count"], table_summary["manifest_path"]) +``` + See the online documentation for details. diff --git a/README_CN.md b/README_CN.md index da667d4..131603b 100644 --- a/README_CN.md +++ b/README_CN.md @@ -102,6 +102,8 @@ success, failed, flag = client.pdf2file( pdf_file="tests/pdf", output_path="./Output", output_format="docx", + model="v3-2026", # 可选,不填则使用服务端默认 v2 + formula_level=1, # 可选:0(默认,推荐)不降级;1 仅降级行内公式(\(...\)、$...$);2 降级所有公式(含 \[...\]、$$...$$) ) print(success) print(failed) @@ -125,4 +127,87 @@ print(failed) print(flag) ``` -更多详细请参见在线文档。 \ No newline at end of file +### V3 JSON 更新 + +当 `model="v3-2026"` 时: + +- `output_format="json"` 现在会保存 Doc2X 原始 v3 JSON(`result.pages...`),不再保存旧的简化 `[{text, location}]` 结构。 +- 即使 `output_format` 不包含 `json`(例如 `text`、`detailed`、`md`、`docx`),也会额外保存一份 sidecar `.json`。 +- 如果 `output_format` 包含 `json`,sidecar JSON 的命名会跟随 `output_names` 里 `json` 这一槽位。 +- 如果 `output_format` 不包含 `json`,sidecar JSON 的命名会跟随 `output_names` 里第一个非空名字。 +- 如果没有传 `output_names`,sidecar JSON 会回退到原 PDF 文件名。 +- 已不再使用过期的小文件直传。`oss_choose="always"` 和 `oss_choose="auto"` 都会走 preupload;`oss_choose="never"` / `oss_choose="none"` 会直接报错。 + +示例: + +```python +from pdfdeal import Doc2X + +client = Doc2X(apikey="Your API key", debug=True) +success, failed, flag = client.pdf2file( + pdf_file="tests/pdf/sample.pdf", + output_path="./Output/test/v3", + output_format="text,json", + output_names=[["plain.txt", "viz.data"]], + model="v3-2026", +) +print(success) # ["页面文本...", "./Output/test/v3/viz.json"] +print(failed) +print(flag) +``` + +### V3 figure/table 裁剪辅助脚本 + +在 [`scripts/`](scripts/) 下新增了两个辅助脚本: + +- [`extract_v3_figures.py`](scripts/extract_v3_figures.py):基于 Doc2X v3 JSON 从 PDF 中裁剪 figure 图片 +- [`extract_v3_tables.py`](scripts/extract_v3_tables.py):基于 Doc2X v3 JSON 从 PDF 中裁剪 
table 图片 + +这两个脚本都会: + +- 先校验 v3 JSON 是否符合裁剪规则 +- 用 `fitz` 按指定 `dpi` 只渲染包含目标 block 的页面 +- 将整页 PNG 保存到 `_pages/` +- 根据 v3 JSON 中的 block `bbox/xyxy` 和 page 坐标裁剪出目标区域 +- 输出带裁剪元数据的 `manifest.json` + +示例: + +```bash +python scripts/extract_v3_figures.py \ + --pdf /path/to/input.pdf \ + --v3-json /path/to/input_v3.json \ + --dpi 200 \ + --output-dir ./Output/figures +``` + +```bash +python scripts/extract_v3_tables.py \ + --pdf /path/to/input.pdf \ + --v3-json /path/to/input_v3.json \ + --dpi 200 \ + --output-dir ./Output/tables +``` + +你也可以直接 import 这些工具函数: + +```python +from pdfdeal import extract_v3_figure_images, extract_v3_table_images + +figure_summary = extract_v3_figure_images( + pdf_path="/path/to/input.pdf", + v3_json_path="/path/to/input_v3.json", + dpi=200, + output_dir="./Output/figures", +) +table_summary = extract_v3_table_images( + pdf_path="/path/to/input.pdf", + v3_json_path="/path/to/input_v3.json", + dpi=200, + output_dir="./Output/tables", +) +print(figure_summary["crop_count"], figure_summary["manifest_path"]) +print(table_summary["crop_count"], table_summary["manifest_path"]) +``` + +更多详细请参见在线文档。 diff --git a/pyproject.toml b/pyproject.toml index 2a5db4f..40c1a8f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,22 +1,37 @@ [project] name = "pdfdeal" -version = "1.0.2" -authors = [{ name = "Menghuan1918", email = "menghuan@menghuan1918.com" }] -description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)." +version = "1.0.4" +authors = [{ name = "noedgeai", email = "support@noedgeai.com" }] +description = "Python SDK for Doc2X API and some native texts processing (to improve texts recall in RAG)." 
readme = "README.md" -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Programming Language :: Python :: 3", "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["httpx[http2]>=0.23.1, <1", "pypdf"] +dependencies = [ + "aiofiles>=24.1.0", + "cryptography>=46.0.5", + "h2>=4.3.0", + "httpx[http2]>=0.23.1, <1", + "pypdf>=6.8.0", + "pytest>=8.3.5", + "urllib3>=2.6.3", +] [project.optional-dependencies] -tools = ["emoji", "Pillow", "reportlab", "beautifulsoup4"] +tools = [ + "emoji", + "Pillow>=12.1.1; python_version>='3.10'", + "Pillow>=10.4.0,<12.0.0; python_version<'3.10'", + "reportlab", + "beautifulsoup4", +] rag = [ "emoji", - "Pillow", + "Pillow>=12.1.1; python_version>='3.10'", + "Pillow>=10.4.0,<12.0.0; python_version<'3.10'", "reportlab", "oss2", "boto3", @@ -26,7 +41,8 @@ rag = [ dev = [ "pytest", "emoji", - "Pillow", + "Pillow>=12.1.1; python_version>='3.10'", + "Pillow>=10.4.0,<12.0.0; python_version<'3.10'", "reportlab", "oss2", "boto3", @@ -35,10 +51,10 @@ dev = [ ] [project.urls] -Issues = "https://github.com/Menghuan1918/pdfdeal/issues" -Documentation = "https://menghuan1918.github.io/pdfdeal-docs/" -Source = "https://github.com/Menghuan1918/pdfdeal" -Changelog = "https://menghuan1918.github.io/pdfdeal-docs/changes/" +Issues = "https://github.com/NoEdgeAI/pdfdeal/issues" +Documentation = "https://noedgeai.github.io/pdfdeal-docs" +Source = "https://github.com/NoEdgeAI/pdfdeal" +Changelog = "https://noedgeai.github.io/pdfdeal-docs/changes" [project.scripts] doc2x = "pdfdeal.CLI.doc2x:main" diff --git a/scripts/extract_v3_figures.py b/scripts/extract_v3_figures.py new file mode 100644 index 0000000..2acfa2c --- /dev/null +++ b/scripts/extract_v3_figures.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +from pathlib import Path +import sys + +try: + from pdfdeal.v3_media import run_cli +except ImportError: # pragma: no cover - local repo execution fallback + sys.modules.pop("pdfdeal", None) + 
sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src")) + from pdfdeal.v3_media import run_cli + + +if __name__ == "__main__": + raise SystemExit(run_cli("figure")) diff --git a/scripts/extract_v3_tables.py b/scripts/extract_v3_tables.py new file mode 100644 index 0000000..9945ee4 --- /dev/null +++ b/scripts/extract_v3_tables.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python3 +from pathlib import Path +import sys + +try: + from pdfdeal.v3_media import run_cli +except ImportError: # pragma: no cover - local repo execution fallback + sys.modules.pop("pdfdeal", None) + sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src")) + from pdfdeal.v3_media import run_cli + + +if __name__ == "__main__": + raise SystemExit(run_cli("table")) diff --git a/src/pdfdeal/CLI/doc2x.py b/src/pdfdeal/CLI/doc2x.py index 98e6542..898bcd8 100644 --- a/src/pdfdeal/CLI/doc2x.py +++ b/src/pdfdeal/CLI/doc2x.py @@ -1,6 +1,7 @@ import argparse import os from pdfdeal import Doc2X +from pdfdeal.Doc2X.Types import FormulaLevel, V2ParseModel def main(): @@ -30,6 +31,26 @@ def main(): help="The maximum number of pages to process at same time, default is 1000, DO NOT set if you don't know", required=False, ) + parser.add_argument( + "--model", + help='Upload model for v2 preupload API, e.g. "v3-2026". Leave empty to use server default v2.', + required=False, + choices=[model.value for model in V2ParseModel], + ) + parser.add_argument( + "--formula_level", + help=( + 'Formula degradation level for v2 export body. ' + '0 (default, recommended)=keep original formulas; ' + '1=degrade inline formulas (\\(...\\), $...$); ' + '2=degrade all formulas including block formulas (\\[...\\], $$...$$). ' + 'Only effective when --model is "v3-2026".' 
+ ), + required=False, + type=int, + choices=[level.value for level in FormulaLevel], + default=FormulaLevel.KEEP_MARKDOWN.value, + ) parser.add_argument( "-o", "--output", @@ -99,6 +120,8 @@ def main(): pdf_file=filename, output_path=output, output_format=format, + model=args.model, + formula_level=args.formula_level, ) for file in success: diff --git a/src/pdfdeal/Doc2X/ConvertV1.py b/src/pdfdeal/Doc2X/ConvertV1.py index 2092dbe..e4b3560 100644 --- a/src/pdfdeal/Doc2X/ConvertV1.py +++ b/src/pdfdeal/Doc2X/ConvertV1.py @@ -10,8 +10,7 @@ Base_URL = "https://api.doc2x.noedgeai.com/api" warnings.warn( - "V1 API is deprecated and will be removed in a future version. " - "Use V2 API instead.", + "V1 API is deprecated and will be removed in a future version. Use V2 API instead.", DeprecationWarning, stacklevel=2, ) @@ -71,10 +70,10 @@ async def check_folder(path: str) -> bool: @async_retry() async def uuid2file( - apikey: str, - uuid: str, - output_format: Literal["md", "md_dollar", "latex", "docx"], - output_path: str = "./Output", + apikey: str, + uuid: str, + output_format: Literal["md", "md_dollar", "latex", "docx"], + output_path: str = "./Output", ) -> str: """Get the file by the uuid @@ -149,12 +148,12 @@ async def get_limit(apikey: str) -> int: @async_retry() async def upload_pdf( - apikey: str, - pdffile: str, - ocr: bool = True, - translate: bool = False, - language: str = "zh", - model: str = "deepseek", + apikey: str, + pdffile: str, + ocr: bool = True, + translate: bool = False, + language: str = "zh", + model: str = "deepseek", ) -> str: """Upload pdf file to server and return the uuid of the file @@ -212,8 +211,8 @@ async def upload_pdf( if post_res.status_code == 200: try: if ( - "parse_task_limit_exceeded" - == json.loads(post_res.content.decode("utf-8"))["code"] + "parse_task_limit_exceeded" + == json.loads(post_res.content.decode("utf-8"))["code"] ): raise RateLimit() else: @@ -234,10 +233,10 @@ async def upload_pdf( @async_retry() async def 
upload_img( - apikey: str, - imgfile: str, - formula: bool = False, - img_correction: bool = False, + apikey: str, + imgfile: str, + formula: bool = False, + img_correction: bool = False, ) -> str: """Upload image file to server and return the uuid of the file @@ -279,8 +278,8 @@ async def upload_img( if post_res.status_code == 200: try: if ( - "parse_task_limit_exceeded" - == json.loads(post_res.content.decode("utf-8"))["code"] + "parse_task_limit_exceeded" + == json.loads(post_res.content.decode("utf-8"))["code"] ): raise RateLimit() else: @@ -381,10 +380,10 @@ async def decode_translate(datas: json, convert: bool) -> Tuple[list, list]: @async_retry() async def uuid_status( - apikey: str, - uuid: str, - convert: bool = False, - translate: bool = False, + apikey: str, + uuid: str, + convert: bool = False, + translate: bool = False, ) -> Tuple[int, str, list]: """Get the status of the file @@ -480,4 +479,4 @@ async def process_status(original_file: list, output_file: list): except AttributeError: has_error_flag = False - return success_file, error_file, has_error_flag + return success_file, error_file, has_error_flag \ No newline at end of file diff --git a/src/pdfdeal/Doc2X/ConvertV2.py b/src/pdfdeal/Doc2X/ConvertV2.py index f0821b8..3a638ac 100644 --- a/src/pdfdeal/Doc2X/ConvertV2.py +++ b/src/pdfdeal/Doc2X/ConvertV2.py @@ -1,25 +1,62 @@ +import asyncio +import random +import string + import httpx import json import os import re -from typing import Tuple +from typing import Any, Tuple from .Exception import RateLimit, FileError, RequestError, async_retry, code_check import logging -from .Types import OutputFormat +from .Types import ( + FormulaLevelType, + OutputFormat, + V2ParseModelType, + normalize_formula_level, + normalize_v2_parse_model, +) +import base64 Base_URL = "https://v2.doc2x.noedgeai.com/api" logger = logging.getLogger("pdfdeal.convertV2") +# Add new error codes for image processing +IMAGE_ERROR_CODES = { + "parse_quota_limit": "可用的解析额度不足 
(Insufficient parsing quota)", + "parse_error": "解析错误 (Parsing error)", + "parse_file_invalid": "解析文件错误或者不合法 (Invalid or illegal image file)", + "request_limit_exceeded": "请求频率超过限制 (Request frequency limit exceeded)", + "parse_file_too_large": "单个图片大小超过限制 (Image size exceeds limit)", +} + +IMAGE_ERROR_SOLUTIONS = { + "parse_quota_limit": "当前可用的额度不够 (Current available quota is insufficient)", + "parse_error": "图片内容无法解析,请反馈给我们 (Image content cannot be parsed, please provide feedback)", + "parse_file_invalid": "无法解析这个图片,一般是图片不合法 (Cannot parse this image, usually due to invalid format)", + "request_limit_exceeded": "请等待一段时间后再请求 (Please wait for a while before making another request)", + "parse_file_too_large": "当前允许单个图片大小 <= 7M,尝试对图片进行压缩 (Current single image size limit is <= 7M, try compressing the image)", +} + @async_retry(timeout=200) -async def upload_pdf(apikey: str, pdffile: str, oss_choose: str = "always") -> str: +async def upload_pdf( + apikey: str, + pdffile: str, + oss_choose: str = "always", + model: V2ParseModelType = None, +) -> str: """Upload pdf file to server and return the uid of the file Args: apikey (str): The key pdffile (str): The pdf file path - oss_choose (str, optional): OSS upload preference. "always" for always using OSS, "auto" for using OSS only when the file size exceeds 100MB, "never" for never using OSS. Defaults to "always". + oss_choose (str, optional): Upload preference. The deprecated direct-upload + path is no longer used. "always" and "auto" both use the preupload API. + "never"/"none" is rejected because it would require the deprecated direct + upload endpoint. Defaults to "always". + model (V2ParseModelType, optional): Upload model for preupload API. Use "v3-2026" for latest model experience. Defaults to None (server default model). 
Raises: FileError: Input file size is too large @@ -30,56 +67,31 @@ async def upload_pdf(apikey: str, pdffile: str, oss_choose: str = "always") -> s Returns: str: The uid of the file """ - url = f"{Base_URL}/v2/parse/pdf" - if oss_choose == "always" or ( - oss_choose == "auto" and os.path.getsize(pdffile) >= 100 * 1024 * 1024 - ): - return await upload_pdf_big(apikey, pdffile) - elif oss_choose == "none" and os.path.getsize(pdffile) >= 100 * 1024 * 1024: - logger.warning("Now not support PDF file > 300MB!") - raise RequestError("parse_file_too_large") - try: - with open(pdffile, "rb") as f: - file = f.read() - except Exception as e: - raise FileError(f"Open file error! {e}") - - async with httpx.AsyncClient(timeout=httpx.Timeout(120), http2=True) as client: - post_res = await client.post( - url, - headers={ - "Authorization": f"Bearer {apikey}", - "Content-Type": "application/pdf", - }, - content=file, + oss_mode = oss_choose.strip().lower() + if oss_mode in {"never", "none"}: + raise ValueError( + "oss_choose='never'/'none' is no longer supported because the direct " + "upload endpoint has been deprecated. Use 'always' or 'auto' instead." ) - trace_id = post_res.headers.get("trace-id", "Failed to get trace-id ") - if post_res.status_code == 200: - response_data = json.loads(post_res.content.decode("utf-8")) - uid = response_data.get("data", {}).get("uid") - - await code_check( - code=response_data.get("code", response_data), uid=uid, trace_id=trace_id + if oss_mode not in {"always", "auto"}: + raise ValueError( + "oss_choose must be one of 'always', 'auto', 'never', 'none'" ) - return uid - if post_res.status_code == 429: - raise RateLimit(trace_id=trace_id) - if post_res.status_code == 400: - raise RequestError(error_code=post_res.text, trace_id=trace_id) - elif post_res.status_code == 401: - raise ValueError("API key is unauthorized. 
(认证失败,请检测API key是否正确)") - raise Exception( - f"Upload file error,trace_id{trace_id}:{post_res.status_code}:{post_res.text}" - ) + return await upload_pdf_big(apikey, pdffile, model=model) -async def upload_pdf_big(apikey: str, pdffile: str) -> str: +async def upload_pdf_big( + apikey: str, + pdffile: str, + model: V2ParseModelType = None, +) -> str: """Upload big pdf file to server and return the uid of the file Args: apikey (str): The key pdffile (str): The pdf file path + model (V2ParseModelType, optional): Upload model for preupload API. Use "v3-2026" for latest model experience. Defaults to None (server default model). Raises: FileError: Input file size is too large @@ -100,12 +112,16 @@ async def upload_pdf_big(apikey: str, pdffile: str) -> str: url = f"{Base_URL}/v2/parse/preupload" filename = os.path.basename(pdffile) + upload_payload = {"file_name": filename} + model_value = normalize_v2_parse_model(model) + if model_value: + upload_payload["model"] = model_value async with httpx.AsyncClient(timeout=httpx.Timeout(15), http2=True) as client: post_res = await client.post( url, headers={"Authorization": f"Bearer {apikey}"}, - json={"file_name": filename}, + json=upload_payload, ) trace_id = post_res.headers.get("trace-id") if post_res.status_code == 200: @@ -172,10 +188,10 @@ async def decode_data(data: dict, convert: bool) -> Tuple[list, list]: @async_retry() async def uid_status( - apikey: str, - uid: str, - convert: bool = False, -) -> Tuple[int, str, list, list]: + apikey: str, + uid: str, + convert: bool = False, +) -> Tuple[int, str, list, list, Any]: """Get the status of the file Args: @@ -188,7 +204,8 @@ async def uid_status( Exception: Get status error Returns: - Tuple[int, str, list, list]: The progress, status, texts and locations + Tuple[int, str, list, list, Any]: The progress, status, texts, locations, + and raw parse result. 
""" url = f"{Base_URL}/v2/parse/status?uid={uid}" async with httpx.AsyncClient(timeout=httpx.Timeout(30), http2=True) as client: @@ -212,10 +229,10 @@ async def uid_status( progress, status = data["data"].get("progress", 0), data["data"].get("status", "") if status == "processing": - return progress, "Processing file", [], [] + return progress, "Processing file", [], [], None elif status == "success": texts, locations = await decode_data(data["data"], convert) - return 100, "Success", texts, locations + return 100, "Success", texts, locations, data["data"].get("result") elif status == "failed": raise RequestError( f"Failed to deal with file uid {uid}! Trace-id:{trace_id}:{response_data.text}" @@ -224,12 +241,17 @@ async def uid_status( logger.warning( f"Unknown status: {status} in uid {uid} file! Trace-id:{trace_id}:{response_data.text}" ) - return progress, status, [], [] + return progress, status, [], [], None @async_retry() async def convert_parse( - apikey: str, uid: str, to: str, filename: str = None + apikey: str, + uid: str, + to: str, + filename: str = None, + merge_cross_page_forms: bool = False, + formula_level: FormulaLevelType = 0, ) -> Tuple[str, str]: """Convert parsed file to specified format @@ -238,6 +260,13 @@ async def convert_parse( uid (str): The uid of the parsed file to (str): Export format, supports: md|tex|docx|md_dollar filename (str, optional): Output filename for md/tex (without extension). Defaults to None. + merge_cross_page_forms (bool, optional): Whether to merge cross-page forms. Defaults to False. + formula_level (FormulaLevelType, optional): Formula degradation level for export body. + 0 (default, recommended): Keep original formulas. + 1: Degrade inline formulas to plain text (\\(...\\), $...$). + 2: Degrade all formulas to plain text, including inline and block formulas + (\\(...\\), $...$, \\[...\\], $$...$$). + This option only takes effect when upload model is "v3-2026". 
Raises: ValueError: If 'to' is not a valid format @@ -253,7 +282,14 @@ async def convert_parse( if isinstance(to, OutputFormat): to = to.value - payload = {"uid": uid, "to": to, "formula_mode": "normal"} + formula_level = normalize_formula_level(formula_level) + payload = { + "uid": uid, + "to": to, + "formula_mode": "normal", + "formula_level": formula_level, + "merge_cross_page_forms": merge_cross_page_forms, + } if filename and to in ["md", "md_dollar", "tex"]: payload["filename"] = filename if to == "md_dollar": @@ -331,7 +367,11 @@ async def get_convert_result(apikey: str, uid: str) -> Tuple[str, str]: @async_retry() async def download_file( - url: str, file_type: str, target_folder: str, target_filename: str + url: str, + file_type: str, + target_folder: str, + target_filename: str, + save_subdir: bool = False, ) -> str: """ Download a file from the given URL to the specified target folder with the given filename. @@ -341,6 +381,7 @@ async def download_file( file_type (str): The type of file being downloaded (e.g., 'zip', 'docx'). target_folder (str): The folder where the file should be saved. target_filename (str): The desired filename for the downloaded file, can include subdirectories. + save_subdir (bool, optional): Save the output to a subfolder under output_path. Defaults to False. Raises: Exception: If there's an error creating the target folder or downloading the file. @@ -348,14 +389,27 @@ async def download_file( Returns: str: The full path of the downloaded file. 
""" + + target_path = os.path.join(target_folder, target_filename) target_dir = os.path.dirname(target_path) filename = os.path.basename(target_path) + if save_subdir: + target_dir = os.path.join(target_dir, os.path.splitext(os.path.basename(filename))[0]) os.makedirs(target_dir, exist_ok=True) - filename = os.path.splitext(filename)[0] + + # 导出 md/tex时,在文件名最后加后缀 + zip_file_suffix = '' + if file_type in ['md', 'md_dollar', 'tex']: + zip_file_suffix = file_type + if file_type != "docx": file_type = "zip" + + + filename = f'{filename}_{zip_file_suffix}' + file_path = os.path.join(target_dir, f"{filename}.{file_type}") counter = 1 while os.path.exists(file_path): @@ -369,3 +423,111 @@ async def download_file( f.write(response.content) return file_path + + +async def image_code_check(code: str, trace_id: str = None): + """Check image processing error codes and raise appropriate exceptions + + Args: + code (str): The error code to check + trace_id (str, optional): The trace ID for debugging. Defaults to None. + + Raises: + RateLimit: When rate limit is reached + RequestError: When a known error occurs + ValueError: When API key is unauthorized + Exception: When an unknown error occurs + """ + if code == "request_limit_exceeded": + raise RateLimit(trace_id=trace_id) + if code in IMAGE_ERROR_CODES: + raise RequestError(code, trace_id=trace_id) + if code == "unauthorized": + raise ValueError("API key is unauthorized. (认证失败,请检测API key是否正确)") + if code not in ["ok", "success"]: + raise Exception(f"Unknown error code: {code}, Trace ID: {trace_id}") + +@async_retry() +async def parse_image_layout( + apikey: str, image_path: str, output_path: str = None, + ) -> tuple[list, str]: + """Parse image layout + + Args: + apikey (str): The API key + image_path (str): Path to the image file + output_path (str): Path to save the result json and decoded base64 image zip. Defaults to Output. 
+ + Raises: + FileError: If file size exceeds limit or file cannot be opened + RateLimit: If rate limit is reached + RequestError: If parsing fails + Exception: For any other errors + + Returns: + tuple: A tuple containing: + - list: List of page dictionaries with page dimensions and md content + - str: The unique identifier (uid) of the request + """ + + # Use the image name as the prefix for the zip file name + output_zip_filename = os.path.splitext(os.path.basename(image_path))[0] + zip_path = os.path.join(output_path, output_zip_filename + '_images.zip') + + counter = 1 + while os.path.exists(zip_path): + zip_path = os.path.join(output_path, output_zip_filename + f'_images_{counter}.zip') + counter += 1 + + + if not os.path.exists(output_path): + os.makedirs(output_path, exist_ok=True) + + # Check file size + if os.path.getsize(image_path) > 7 * 1024 * 1024: # 7MB + raise FileError("Image file size exceeds 7MB limit") + + url = f"{Base_URL}/v2/parse/img/layout" + + try: + with open(image_path, "rb") as f: + file = f.read() + except Exception as e: + raise FileError(f"Open file error! 
{e}") + + async with httpx.AsyncClient(timeout=httpx.Timeout(30), http2=True) as client: + response = await client.post( + url, + headers={"Authorization": f"Bearer {apikey}"}, + content=file, + ) + + trace_id = response.headers.get("trace-id", "Failed to get trace-id") + + if response.status_code == 429: + raise RateLimit(trace_id=trace_id) + + if response.status_code != 200: + raise Exception( + f"Image layout parsing failed: {response.status_code}:{response.text}" + ) + + data = response.json() + await image_code_check(data.get("code", ""), trace_id=trace_id) + + + output_zip_path = '' + + + # Save zip file if path provided and zip content exists + if zip_path and data.get("data", {}).get("convert_zip"): + zip_content = base64.b64decode(data["data"]["convert_zip"]) + with open(zip_path, "wb") as f: + f.write(zip_content) + output_zip_path = zip_path + + return ( + data.get("data", {}).get("result", {}).get("pages", []), + data.get("data", {}).get("uid", "Failed to get uid"), + output_zip_path + ) diff --git a/src/pdfdeal/Doc2X/Exception.py b/src/pdfdeal/Doc2X/Exception.py index 862a5a1..951028b 100644 --- a/src/pdfdeal/Doc2X/Exception.py +++ b/src/pdfdeal/Doc2X/Exception.py @@ -54,7 +54,7 @@ class RequestError(Exception): SOLUTIONS = { "parse_quota_limit": "当前可用的页数不足,请检查余额或联系负责人 (Insufficient parsing quota, check balance or contact support)", "parse_create_task_error": "短暂等待后重试, 如果还出现报错则请联系负责人 (Retry after a short wait, contact support if error persists)", - "parse_file_too_large": "当前允许单个文件大小 <= 300MB(直接上传) | <= 1GB(通过OSS上传), 请拆分 pdf (File size must be <= 300MB (direct upload) | <= 1GB (OSS upload), please split the PDF)", + "parse_file_too_large": "当前允许单个文件大小 <= 1GB(通过 preupload 上传), 请拆分 pdf (File size must be <= 1GB via preupload upload, please split the PDF)", "parse_file_page_limit": "当前允许单个文件页数 <= 1000页, 请拆分 pdf (File page count must be <= 1000 pages, please split the PDF)", "parse_file_lock": "为了防止反复解析, 暂时锁定一天,考虑PDF可能有兼容性问题, 重新打印后再尝试。仍然失败请反馈request_id给负责人 
(Locked for a day to prevent repeated parsing. Consider reprinting the PDF if compatibility issues persist. Report request_id if it still fails)", "parse_pdf_invalid": "不是有效的PDF文件,考虑PDF可能有兼容性问题, 重新打印后再尝试。仍然失败请反馈request_id给负责人 (File is not a valid PDF. Consider reprinting the PDF if compatibility issues persist. Report request_id if it still fails)", diff --git a/src/pdfdeal/Doc2X/Types.py b/src/pdfdeal/Doc2X/Types.py index 08bf421..889e891 100644 --- a/src/pdfdeal/Doc2X/Types.py +++ b/src/pdfdeal/Doc2X/Types.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import Optional, Union class OutputFormat(str, Enum): @@ -9,6 +10,8 @@ class OutputFormat(str, Enum): LATEX = "tex" MD = "md" MD_DOLLAR = "md_dollar" + ZIP = "zip" + JSON = "json" @classmethod def _missing_(cls, value): @@ -64,3 +67,92 @@ def _missing_(cls, value): raise ValueError( f"{value} is not a valid {cls.__name__}, must be one of {', '.join([m.value for m in cls])}" ) + + +class V2ParseModel(str, Enum): + V2 = "v2" + V3_2026 = "v3-2026" + + @classmethod + def _missing_(cls, value): + for member in cls: + if member.value.lower() == value.lower(): + return member + raise ValueError( + f"{value} is not a valid {cls.__name__}, must be one of {', '.join([m.value for m in cls])}" + ) + + +V2ParseModelType = Optional[Union[str, V2ParseModel]] + + +class FormulaLevel(int, Enum): + """Formula degradation levels for v2 export body. + + 0 (default, recommended): Keep original formulas (no degradation). + 1: Degrade inline formulas to plain text (\\(...\\), $...$). + 2: Degrade all formulas to plain text, including inline and block formulas + (\\(...\\), $...$, \\[...\\], $$...$$). 
+ """ + + KEEP_MARKDOWN = 0 + INLINE_TO_TEXT = 1 + ALL_TO_TEXT = 2 + + @classmethod + def _missing_(cls, value): + if isinstance(value, str): + value = value.strip() + if not value: + return None + try: + value = int(value) + except ValueError: + pass + for member in cls: + if member.value == value: + return member + raise ValueError( + f"{value} is not a valid {cls.__name__}, must be one of {', '.join([str(m.value) for m in cls])}" + ) + + +FormulaLevelType = Optional[Union[int, str, FormulaLevel]] + + +def normalize_v2_parse_model(model: V2ParseModelType) -> str: + if model is None: + return "" + if isinstance(model, V2ParseModel): + return "" if model == V2ParseModel.V2 else model.value + + model = model.strip() + if not model: + return "" + + model_enum = V2ParseModel(model) + return "" if model_enum == V2ParseModel.V2 else model_enum.value + + +def normalize_formula_level(formula_level: FormulaLevelType) -> int: + if formula_level is None: + return FormulaLevel.KEEP_MARKDOWN.value + if isinstance(formula_level, FormulaLevel): + return formula_level.value + if isinstance(formula_level, bool): + raise ValueError( + "formula_level must be one of 0, 1, 2 " + "(0=keep original formulas [default/recommended], " + "1=degrade inline formulas, 2=degrade all formulas)" + ) + + try: + level = FormulaLevel(formula_level) + except (TypeError, ValueError): + raise ValueError( + "formula_level must be one of 0, 1, 2 " + "(0=keep original formulas [default/recommended], " + "1=degrade inline formulas, 2=degrade all formulas)" + ) + + return level.value diff --git a/src/pdfdeal/FileTools/file_tools.py b/src/pdfdeal/FileTools/file_tools.py index c1a5b49..57df178 100644 --- a/src/pdfdeal/FileTools/file_tools.py +++ b/src/pdfdeal/FileTools/file_tools.py @@ -1,4 +1,5 @@ import io +import json import re import unicodedata import os @@ -56,52 +57,6 @@ def clear_cache(): if os.path.isfile(file_path): os.remove(file_path) - -def extract_text_and_images(pdf_path, ocr, 
language=["ch_sim", "en"], GPU=False): - """ - Extract text and images from a PDF file - """ - from pypdf import PdfReader - from PIL import Image - - Text = [] - - # Open the PDF file - with open(pdf_path, "rb") as file: - reader = PdfReader(file) - - for page in reader.pages: - # Get the text content of the page - text = page.extract_text() - temp_image_folder = os.path.join( - os.path.expanduser("~"), ".cache", "pdfdeal", "pictures" - ) - os.makedirs(temp_image_folder, exist_ok=True) - clear_cache() - # Get the images on the page - images = page.images - for id, image in enumerate(images): - image_data = image.data - image_stream = io.BytesIO(image_data) - pil_image = Image.open(image_stream) - # Save to HOME/.cache/pdfdeal/pictures, if the directory does not exist, create it - temp_image_path = os.path.join( - os.path.expanduser("~"), - ".cache", - "pdfdeal", - "pictures", - f"{id}.png", - ) - pil_image.save(temp_image_path) - option = {"GPU": GPU} - # Use ocr to extract text from images - ocr_text, All_Done = ocr(temp_image_folder, language, option) - text += f"\n{ocr_text}" - Text.append(clean_text(text)) - clear_cache() - return Text, All_Done - - def gen_folder_list(path: str, mode: str, recursive: bool = False) -> list: """Generate a list of all files in the folder @@ -163,6 +118,8 @@ def get_files(path: str, mode: str, out: str) -> Tuple[list, list]: mode = Support_File_Type(mode) if isinstance(mode, Support_File_Type): mode = mode.value + if not out: + out = "md_dollar" if out != "pdf": out = OutputFormat(out) if isinstance(out, OutputFormat): @@ -444,3 +401,81 @@ def auto_split_mds( f"=====\nError deal with {failed_file['file']} : {failed_file['error']}" ) return success, failed, flag + + +# json 导出格式会使用该函数 +def save_json( + output_path: str, + output_name: str, + json_content=None, + save_subdir: bool = False, +): + """Save the JSON file + Args: + output_path (str): The path to save the JSON file + output_name(str): JSON file name + json_content: The 
JSON content to save + """ + if json_content is None: + json_content = [] + base_name, _ = os.path.splitext(output_name) + + if save_subdir: + output_path = os.path.join(output_path, base_name) + + final_json_path = os.path.join(output_path, f"{base_name}.json") + + os.makedirs(output_path, exist_ok=True) + + # 处理重复名字的文件 + counter = 1 + while os.path.exists(final_json_path): + final_json_path = os.path.join(output_path, f"{base_name}_{counter}.json") + counter += 1 + + with open(final_json_path, 'w', encoding='utf-8') as f: + json.dump(json_content, f, ensure_ascii=False, indent=4) + + return final_json_path + + +# image 接口 导出md格式会使用该函数 +def save_md( + output_path: str, + output_name: str, + content: str = '', + save_subdir: bool = False, +): + """Save the md file + Args: + output_path (str): The path to save the JSON file + output_name(str): md file name + content (list[dict]): The md content to save + """ + + base_name, _ = os.path.splitext(output_name) + + if save_subdir: + output_path = os.path.join(output_path, base_name) + + final_md_path = os.path.join(output_path, f"{base_name}.md") + + os.makedirs(output_path, exist_ok=True) + + # 处理重复名字的文件 + counter = 1 + while os.path.exists(final_md_path): + final_md_path = os.path.join(output_path, f"{base_name}_{counter}.md") + counter += 1 + + try: + with open(final_md_path, 'w', encoding='utf-8') as f: + f.write(content) + + except Exception as e: + logging.error(f"Error occurs when saving to {final_md_path}: {str(e)}") + fail_reason = str(e) if str(e) else type(e).__name__ + return '', fail_reason + + return final_md_path, '' + diff --git a/src/pdfdeal/__init__.py b/src/pdfdeal/__init__.py index cbb8e35..cc35f7f 100644 --- a/src/pdfdeal/__init__.py +++ b/src/pdfdeal/__init__.py @@ -1,3 +1,4 @@ from .doc2x import Doc2X +from .v3_media import extract_v3_figure_images, extract_v3_table_images -__all__ = ["Doc2X"] +__all__ = ["Doc2X", "extract_v3_figure_images", "extract_v3_table_images"] diff --git 
a/src/pdfdeal/doc2x.py b/src/pdfdeal/doc2x.py index e231191..26c87f3 100644 --- a/src/pdfdeal/doc2x.py +++ b/src/pdfdeal/doc2x.py @@ -1,7 +1,13 @@ import asyncio +import csv import os -from typing import Tuple, List +from datetime import datetime +from typing import Dict, Tuple, List, Union, Optional, Any, Coroutine import logging + +import aiofiles +import aiofiles.os + from .Doc2X.ConvertV2 import ( upload_pdf, uid_status, @@ -9,24 +15,140 @@ get_convert_result, download_file, ) -from .Doc2X.Types import OutputFormat +from .Doc2X.Types import ( + FormulaLevelType, + OutputFormat, + V2ParseModel, + V2ParseModelType, + normalize_formula_level, + normalize_v2_parse_model, +) from .Doc2X.Pages import get_pdf_page_count from .Doc2X.Exception import RequestError, RateLimit, run_async -from .FileTools.file_tools import get_files +from .FileTools.file_tools import get_files, save_json import time +from .doc2x_img import ImageProcessor logger = logging.getLogger(name="pdfdeal.doc2x") +async def record_export_history( + csv_path: str, + uid: str, + file_name: str = None, + upload_time: float = None, + status: str = None, + is_export: bool = False, + lock: asyncio.Lock = None, +): + """Record history using csv""" + csv_header = ["uid", "file_name", "upload_time_str", "status", "is_export"] + async with lock: + update_data = {} + if file_name is not None: + update_data["file_name"] = file_name + if upload_time is not None: + update_data["upload_time_str"] = datetime.fromtimestamp(upload_time).strftime("%Y-%m-%d %H:%M:%S") + if status is not None: + update_data["status"] = status + if is_export is not None: + update_data["is_export"] = str(is_export) + output_dir = os.path.dirname(csv_path) + if output_dir: + await aiofiles.os.makedirs(output_dir, exist_ok=True) + try: + async with aiofiles.open(csv_path, mode="r+", encoding="utf-8", newline="") as f: + lines = await f.readlines() + output_lines = [] + uid_found = False + if not lines: + 
output_lines.append(",".join(csv_header) + "\n") + else: + output_lines.append(lines[0]) + for line in lines[1:]: + if line.strip(): + row_list = line.strip().split(',') + row_uid = row_list[0] if row_list else "" + if row_uid == uid: + uid_found = True + row_dict = dict(zip(csv_header, row_list)) + row_dict.update(update_data) + updated_row_list = [row_dict.get(h, "") for h in csv_header] + output_lines.append(",".join(map(str, updated_row_list)) + "\n") + else: + output_lines.append(line) + if not uid_found: + new_row_dict = { + "uid": uid, + "file_name": file_name, + "upload_time_str": update_data.get("upload_time_str", ""), + "status": status, + "is_export": str(is_export) if is_export is not None else "" + } + new_row_list = [new_row_dict.get(h, "") for h in csv_header] + output_lines.append(",".join(map(str, new_row_list)) + "\n") + await f.seek(0) + await f.truncate() + await f.writelines(output_lines) + + except FileNotFoundError: + async with aiofiles.open(csv_path, mode="w", encoding="utf-8", newline="") as f: + await f.write(",".join(csv_header) + "\n") + new_row_dict = { + "uid": uid, + "file_name": file_name, + "upload_time_str": update_data.get("upload_time_str", ""), + "status": status, + "is_export": str(is_export) if is_export is not None else "" + } + new_row_list = [new_row_dict.get(h, "") for h in csv_header] + await f.write(",".join(map(str, new_row_list)) + "\n") + + +async def read_export_history(csv_path: str) -> Dict[str, bool]: + """Read export history from csv_path""" + file_to_export_status_map: Dict[str, bool] = {} + if not await aiofiles.os.path.exists(csv_path): + return file_to_export_status_map + try: + async with aiofiles.open(csv_path, mode="r", encoding="utf-8", newline="") as f: + header_read = False + file_name_index = -1 + is_export_index = -1 + async for line in f: + if not line.strip(): + continue + row = next(csv.reader([line])) + if not header_read: + try: + file_name_index = row.index("file_name") + is_export_index = 
row.index("is_export") + header_read = True + except ValueError as e: + print(f"错误: CSV文件中缺少必要的列: {e}") + return {} # 表头不正确,返回空字典 + continue # 跳过表头行,继续下一次循环 + if len(row) > max(file_name_index, is_export_index): + file_name = row[file_name_index] + is_export_str = row[is_export_index] + is_export_bool = is_export_str.strip().lower() == 'true' + file_to_export_status_map[file_name] = is_export_bool + except Exception as e: + print(f"读取或解析CSV文件时发生错误: {e}") + return file_to_export_status_map + + async def parse_pdf( - apikey: str, - pdf_path: str, - maxretry: int, - wait_time: int, - max_time: int, - convert: bool, - oss_choose: str = "auto", -) -> Tuple[str, List[str], List[dict]]: + apikey: str, + pdf_path: str, + maxretry: int, + wait_time: int, + max_time: int, + convert: bool, + oss_choose: str = "auto", + model: V2ParseModelType = None, + export_history: str = "", +) -> Tuple[str, List[str], List[dict], Any]: """Parse PDF file and return uid and extracted text""" async def task_limit_lock(): @@ -49,21 +171,43 @@ async def task_limit_lock(): for attempt in range(maxretry): try: logger.info(f"Uploading {pdf_path}...") - uid = await upload_pdf(apikey, pdf_path, oss_choose) + uid = await upload_pdf(apikey, pdf_path, oss_choose, model=model) + if export_history != "": + await record_export_history( + csv_path=export_history, + uid=uid, + file_name=pdf_path, + upload_time=time.time(), + status="Processing", + is_export=False, + lock=asyncio.Lock()) + logger.info(f"Uploading successful for {pdf_path} with uid {uid}") for _ in range(max_time // 3): try: - progress, status, texts, locations = await uid_status( + progress, status, texts, locations, raw_result = await uid_status( apikey, uid, convert ) if status == "Success": logger.info(f"Parsing successful for {pdf_path} with uid {uid}") - return uid, texts, locations + if export_history != "": + await record_export_history( + csv_path=export_history, + uid=uid, + status="Success", + lock=asyncio.Lock()) + return uid, 
texts, locations, raw_result elif status == "Processing file": logger.info(f"Processing {uid} : {progress}%") await asyncio.sleep(3) else: + if export_history != "": + await record_export_history( + csv_path=export_history, + uid=uid, + status="Failed", + lock=asyncio.Lock()) raise RequestError( f"Unexpected status: {status} with uid: {uid}" ) @@ -85,22 +229,30 @@ async def task_limit_lock(): raise RequestError( "Max retry reached for parse_pdf, this may be a rate limit issue, try to reduce the number of threads." ) - raise RequestError("Failed to parse PDF after maximum retries") async def convert_to_format( - apikey: str, - uid: str, - output_format: str, - output_path: str, - output_name: str, - max_time: int, -) -> str: + apikey: str, + uid: str, + output_format: str, + output_path: str, + output_name: str, + max_time: int, + merge_cross_page_forms: bool = False, + formula_level: FormulaLevelType = 0, + save_subdir: bool = False, + ) -> str: """Convert parsed PDF to specified format""" - logger.info(f"Converting {uid} to {output_format}...") - status, url = await convert_parse(apikey, uid, output_format) + status, url = await convert_parse( + apikey, + uid, + output_format, + merge_cross_page_forms=merge_cross_page_forms, + formula_level=formula_level, + ) + for _ in range(max_time // 3): if status == "Success": logger.info(f"Downloading {uid} {output_format} file to {output_path}...") @@ -109,6 +261,7 @@ async def convert_to_format( file_type=output_format, target_folder=output_path, target_filename=output_name or uid, + save_subdir=save_subdir, ) elif status == "Processing": logger.info(f"Converting {uid} {output_format} file...") @@ -119,17 +272,42 @@ async def convert_to_format( raise RequestError(f"Max time reached for get_convert_result with uid: {uid}") +async def save_json_format( + output_path: str, + output_name: str, + json_content: Any = None, + save_subdir: bool = False, + ): + """Save the JSON file + Args: + output_path (str): The path to save the 
JSON file + output_name(str): JSON file name + json_content (Any): The JSON content to save + """ + loop = asyncio.get_running_loop() + saved_path = await loop.run_in_executor( + None, + save_json, + output_path, + output_name, + json_content, + save_subdir, + ) + + return saved_path + + class Doc2X: def __init__( - self, - apikey: str = None, - thread: int = 5, - max_pages: int = 1000, - retry_time: int = 5, - max_time: int = 300, - debug: bool = False, - full_speed: bool = False, - ) -> None: + self, + apikey: str = None, + thread: int = 5, + max_pages: int = 1000, + retry_time: int = 5, + max_time: int = 300, + debug: bool = False, + full_speed: bool = False, + ) -> None: """ Initialize a Doc2X client. @@ -157,6 +335,7 @@ def __init__( self.max_pages = max_pages self.request_interval = 0.1 self.full_speed = full_speed + self._image_processor = None handler = logging.StreamHandler() formatter = logging.Formatter( @@ -170,15 +349,67 @@ def __init__( logging.getLogger("pdfdeal").setLevel(logging.DEBUG) self.debug = debug + @property + def image_processor(self) -> ImageProcessor: + """Lazy initialization of ImageProcessor""" + if self._image_processor is None: + self._image_processor = ImageProcessor(self.apikey) + return self._image_processor + + def piclayout( + self, + pic_file, + output_format: str = "text", + output_path: str = "./Output", + save_subdir: bool = False, + concurrent_limit: Optional[int] = 5, + ) -> tuple[List[Union[list, str]], List[dict], bool]: + """Process image files with layout analysis + + Args: + pic_file (str | List[str]): Path to image files (jpg/png) + output_format (str): The output format. Defaults to "text". Available values are 'text', 'md', ''md_dollar + output_path (str): Path to save the result json and decoded base64 image zip. Defaults to Output. + save_subdir (bool): Save the output to a subfolder under output_path. Defaults to False. + concurrent_limit (int, optional): Maximum number of concurrent tasks. Defaults to 5. 
+ + Returns: + Tuple containing: + - List of layout analysis results (list or str) + - List of dictionaries containing error information + - Boolean indicating if any errors occurred + """ + + if os.path.exists(output_path): + if not os.path.isdir(output_path): + raise ValueError("output_path must be a directory") + else: + os.makedirs(output_path, exist_ok=True) + + return self.image_processor.pic2file( + pic_file=pic_file, + process_type="layout", + output_format=output_format, + output_path=output_path, + save_subdir=save_subdir, + concurrent_limit=concurrent_limit, + ) + async def pdf2file_back( - self, - pdf_file, - output_names: List[str] = None, - output_path: str = "./Output", - output_format: str = "md_dollar", - convert: bool = False, - oss_choose: str = "auto", - ) -> Tuple[List[str], List[dict], bool]: + self, + pdf_file, + output_names: List[str] = None, + output_path: str = "./Output", + output_format: str = "md_dollar", + convert: bool = False, + oss_choose: str = "auto", + model: V2ParseModelType = None, + merge_cross_page_forms: bool = False, + formula_level: FormulaLevelType = 0, + save_subdir: bool = False, + export_history: str = "", + ) -> Tuple[List[str], List[dict], bool]: + if isinstance(pdf_file, str): if os.path.isdir(pdf_file): pdf_file, output_names = get_files( @@ -208,10 +439,34 @@ async def pdf2file_back( else: raise ValueError("Invalid output format, should be a string.") + if 'json' in output_format: + logger.warning( + "You have used JSON result output. The output will contain online links that expire in 24 hours. Please remember to manually save the results. 
(您使用了 json 结果输出,输出结果中会有 24h 过期的在线链接,请注意手动保存结果)" + ) + for fmt in output_formats: fmt = OutputFormat(fmt) if isinstance(fmt, OutputFormat): fmt = fmt.value + formula_level = normalize_formula_level(formula_level) + + try: + normalized_model = normalize_v2_parse_model(model) + model_enum = V2ParseModel(normalized_model) if normalized_model else None + if model_enum == V2ParseModel.V3_2026: + model_version = "v3" + model_label = normalized_model + else: + model_version = "v2" + model_label = "default(v2)" + except Exception: + normalized_model = str(model).strip() if model is not None else "" + model_version = "custom" + model_label = normalized_model or "default(v2)" + + logger.info( + f"Doc2X parse model selected: {model_version} ({model_label})" + ) # Track total pages and last request time total_pages = 0 @@ -221,7 +476,7 @@ async def pdf2file_back( convert_tasks = set() results = [None] * len(pdf_file) parse_results = [None] * len(pdf_file) - global limit_lock, get_max_limit, max_threads, full_speed, thread_min + global limit_lock, get_max_limit, max_threads, full_speed, thread_min, file_export_map thread_min = self.thread full_speed = self.full_speed limit_lock = asyncio.Lock() @@ -269,7 +524,7 @@ async def process_file(index, pdf, name): # Process the file try: - uid, texts, locations = await parse_pdf( + uid, texts, locations, raw_result = await parse_pdf( apikey=self.apikey, pdf_path=pdf, maxretry=self.retry_time, @@ -277,8 +532,10 @@ async def process_file(index, pdf, name): max_time=self.max_time, convert=convert, oss_choose=oss_choose, + model=model, + export_history=export_history, ) - parse_results[index] = (uid, texts, locations) + parse_results[index] = (uid, texts, locations, raw_result) # Create convert task as soon as parse is complete task = asyncio.create_task(convert_file(index, name)) convert_tasks.add(task) @@ -298,9 +555,48 @@ async def process_file(index, pdf, name): async def convert_file(index, name): if parse_results[index] is None: return 
- uid, texts, locations = parse_results[index] + uid, texts, locations, raw_result = parse_results[index] all_results = [] all_errors = [] + json_output_reason = "single output name" + if isinstance(name, list): + json_output_name = None + if "json" in output_formats: + json_format_index = output_formats.index("json") + if json_format_index < len(name): + json_output_name = name[json_format_index] + json_output_reason = ( + f'using output_names[{json_format_index}] because ' + f'output_format includes "json"' + ) + if not json_output_name: + json_output_name = next((item for item in name if item), None) + json_output_reason = ( + "using the first non-empty output_names entry because " + 'output_format does not include "json"' + ) + else: + json_output_name = name + json_output_name = json_output_name or os.path.basename(pdf_file[index]) + v3_json_result_path = None + if model_enum == V2ParseModel.V3_2026 and raw_result is not None: + v3_json_result_path = await save_json_format( + output_path=os.path.join( + output_path, os.path.dirname(json_output_name) + ), + output_name=os.path.basename(json_output_name), + json_content=raw_result, + save_subdir=save_subdir, + ) + if isinstance(name, list) and len(name) > 1: + logger.info( + "V3 sidecar JSON naming for %s: %s. Requested output_names=%s. 
" + "Saved to %s", + pdf_file[index], + json_output_reason, + name, + v3_json_result_path, + ) for name_index, fmt in enumerate(output_formats): if isinstance(name, list): @@ -311,6 +607,7 @@ async def convert_file(index, name): else: name_fmt = name try: + output_name = name_fmt or os.path.basename(pdf_file[index]) if fmt in ["md", "md_dollar", "tex", "docx"]: nonlocal last_request_time # Wait for request interval @@ -324,6 +621,7 @@ async def convert_file(index, name): async with page_lock: last_request_time = time.time() + result = await convert_to_format( apikey=self.apikey, uid=uid, @@ -331,7 +629,16 @@ async def convert_file(index, name): output_path=output_path, output_name=name_fmt, max_time=self.max_time, + merge_cross_page_forms=merge_cross_page_forms, + formula_level=formula_level, + save_subdir=save_subdir ) + if export_history != "": + await record_export_history( + csv_path=export_history, + uid=uid, + is_export=True, + lock=asyncio.Lock()) all_results.append(result) all_errors.append("") # Wait 5 seconds between formats @@ -350,6 +657,31 @@ async def convert_file(index, name): {"text": text, "location": loc} for text, loc in zip(texts, locations) ] + + elif fmt == "json": + if v3_json_result_path is not None: + result = v3_json_result_path + else: + json_content = [ + {"text": text, "location": loc} + for text, loc in zip(texts, locations) + ] + result = await save_json_format( + output_path=os.path.join( + output_path, os.path.dirname(output_name) + ), + output_name=os.path.basename(output_name), + json_content=json_content, + save_subdir=save_subdir, + ) + + + if export_history != "": + await record_export_history( + csv_path=export_history, + uid=uid, + is_export=True, + lock=asyncio.Lock()) all_results.append(result) all_errors.append("") @@ -376,7 +708,16 @@ async def convert_file(index, name): ) # Create and run parse tasks with controlled concurrency + + if export_history != "": + file_export_map = await read_export_history(export_history) + 
print(f"export_history{file_export_map}") + for i, (pdf, name) in enumerate(zip(pdf_file, output_names)): + if export_history != "": + if file_export_map.get(pdf, False) is True: + results[i] = ('', '', '') + continue while len(parse_tasks) >= max_threads: done, parse_tasks = await asyncio.wait( parse_tasks, return_when=asyncio.FIRST_COMPLETED @@ -437,40 +778,47 @@ async def convert_file(index, name): return success_files, failed_files, has_error def pdf2file( - self, - pdf_file, - output_names: List[str] = None, - output_path: str = "./Output", - output_format: str = "md_dollar", - convert: bool = False, - oss_choose: str = "always", - ocr: bool = False, - ) -> Tuple[List[str], List[dict], bool]: - """Convert PDF files to the specified format. - + self, + pdf_file, + output_names: List[str] = None, + output_path: str = "./Output", + output_format: str = "md_dollar", + convert: bool = False, + oss_choose: str = "always", + model: V2ParseModelType = None, + merge_cross_page_forms: bool = False, + formula_level: FormulaLevelType = 0, + ocr: bool = False, + save_subdir: bool = False, + ) -> Tuple[List[str], List[dict], bool]: + """Convert PDF file to specified format Args: - pdf_file (str | List[str]): Path to a single PDF file or a list of PDF file paths. - output_names (List[str], optional): List of output file names. Defaults to None. - output_path (str, optional): Directory path for output files. Defaults to "./Output". - output_format (str, optional): Desired output format. Defaults to `md_dollar`. Supported formats include:`md_dollar`|`md`|`tex`|`docx`, will return the path of files, support output variable: `text`|`texts`|`detailed`(it means `string in md format`, `list of strings split by page`, `list of strings split by page (including detailed page information)`) - convert (bool, optional): Whether to convert "[" and "[[" to "$" and "$$", only valid if `output_format` is a variable format(`txt`|`txts`|`detailed`). Defaults to False. 
- oss_choose (str, optional): Now can upload files directly through API or through OSS link given by API. Acceptable values: `auto`, `always`, `never` (it means `Only >=100MB files will be uploaded to OSS`, `All files will be uploaded to OSS`, `All files will be uploaded directly`). Defaults to "always". + pdf_file (str or list): The path of the PDF file or a list of PDF file paths + output_names (List[str], optional): The output file names. Defaults to None. + output_path (str, optional): The output path. Defaults to "./Output". + output_format (str, optional): The output format. Defaults to "md_dollar". + convert (bool, optional): Convert "[" and "[[" to "$" and "$$". Defaults to False. + oss_choose (str, optional): Upload preference. The deprecated direct-upload + path is no longer used. "always" and "auto" both use preupload. + "never"/"none" is rejected because it would require the deprecated + direct-upload endpoint. Defaults to "always". + model (V2ParseModelType, optional): Upload model for v2 preupload API. Use "v3-2026" for latest model experience. Defaults to None (server default model). + merge_cross_page_forms (bool, optional): Whether to merge cross-page forms. Defaults to False. + formula_level (FormulaLevelType, optional): Formula degradation level for export body. + 0 (default, recommended): Keep original formulas. + 1: Degrade inline formulas to plain text (\\(...\\), $...$). + 2: Degrade all formulas to plain text, including inline and block formulas + (\\(...\\), $...$, \\[...\\], $$...$$). + This option only takes effect when upload model is "v3-2026". ocr (bool, optional): This option is deprecated and will not be used. - + save_subdir (bool, optional): Save the output to a subfolder under output_path. Defaults to False. Returns: Tuple[List[str], List[dict], bool]: A tuple containing: - 1. A list of successfully converted file paths or content. - 2. A list of dictionaries containing error information for failed conversions. - 3. 
A boolean indicating whether any errors occurred during the conversion process. - - Raises: - Any exceptions raised by pdf2file_back or run_async. - - Note: - This method provides a convenient synchronous interface for the asynchronous - PDF conversion functionality. It handles all the necessary setup for running - the asynchronous code in a synchronous context. + - List[str]: List of output file paths + - List[dict]: List of error messages + - bool: Whether there was an error """ + if ocr: import warnings @@ -479,7 +827,6 @@ def pdf2file( DeprecationWarning, stacklevel=2, ) - return run_async( self.pdf2file_back( pdf_file=pdf_file, @@ -488,5 +835,9 @@ def pdf2file( output_format=output_format, convert=convert, oss_choose=oss_choose, + model=model, + merge_cross_page_forms=merge_cross_page_forms, + formula_level=formula_level, + save_subdir=save_subdir, ) ) diff --git a/src/pdfdeal/doc2x_img.py b/src/pdfdeal/doc2x_img.py new file mode 100644 index 0000000..f2617f7 --- /dev/null +++ b/src/pdfdeal/doc2x_img.py @@ -0,0 +1,378 @@ +import asyncio +import logging +from collections import deque +from typing import Dict, List, Optional, Union +from .Doc2X.ConvertV2 import parse_image_layout +from .Doc2X.Exception import RateLimit, run_async +from .Doc2X.Types import OutputFormat +from .FileTools.file_tools import get_files, save_md +import os +import copy + +logger = logging.getLogger("pdfdeal.doc2x") + +async def save_md_format( + output_path: str, + output_name: str, + content: str = '', + save_subdir: bool = False, + ): + """Save the text to md file + Args: + output_path (str): The path to save the MD file + output_name(str): MD file name + content (list[dict]): The MD content to save + """ + loop = asyncio.get_running_loop() + saved_path, fail_reason = await loop.run_in_executor( + None, + save_md, + output_path, + output_name, + content, + save_subdir, + ) + + return saved_path, fail_reason + +class ImageProcessor: + """Image processor with rate limiting support""" + 
+ def __init__(self, apikey: str): + """Initialize the image processor + Args: + apikey (str): API key for authentication + """ + self.apikey = apikey + self._request_times = deque() + self._lock = None + self._loop = None + self._rate = 30 + self._period = 30 + + async def _get_lock(self) -> asyncio.Lock: + if self._lock is None: + # Get the loop from the current async context + self._loop = asyncio.get_running_loop() + self._lock = asyncio.Lock() # Pass the loop explicitly + return self._lock + + async def _check_rate_limit(self): + """Check and enforce rate limit (30 requests per 30 seconds)""" + lock = await self._get_lock() + + while True: + async with lock: + current_time = asyncio.get_event_loop().time() + # Remove requests older than 30 seconds + while self._request_times and (current_time - self._request_times[0] > self._period): + self._request_times.popleft() + if len(self._request_times) < self._rate: + # Append new timestamp to sliding window + self._request_times.append(current_time) + return + # Wait time until oldest timestamp pop out + wait_time = self._period - (current_time - self._request_times[0]) + # Sleep outside the critical section to avoid holding the lock during sleep + if wait_time > 0: + logger.warning( + f"Rate limit reached, waiting for {wait_time:.2f} seconds. " + f"Current count: {len(self._request_times)}" + ) + await asyncio.sleep(wait_time) + + + async def process_image( + self, + image_path: str, + process_type: str = "layout", + output_path: str = 'Output', + save_subdir: bool = False, + ) -> tuple[list, str, bool]: + """Process an image with layout analysis + + Args: + image_path (str): Path to the image file + process_type (str): Type of processing, can be 'layout' + output_path (str, optional): Path to save the result json and decoded base64 image zip. Defaults to Output. 
+ + Returns: + Tuple containing: + - The processing result (list of pages for layout) + - The uid of the processed image + - Boolean indicating if the processing was successful + - The failure information + + Raises: + ValueError: If process_type is invalid or file type is not supported + RateLimit: If rate limit is exceeded + """ + + + if process_type not in ["layout"]: + raise ValueError("process_type must be one of: 'layout'") + + if save_subdir: + subdir_name = os.path.basename(image_path).split('.')[0] + output_path = os.path.join(output_path, subdir_name) + + try: + logger.info(f"Starting {process_type} processing for {image_path}") + if process_type == "layout": + await self._check_rate_limit() + pages, uid, output_zip_path = await parse_image_layout(self.apikey, image_path, output_path) + logger.info( + f"Successfully completed layout analysis for {image_path} with uid {uid}" + ) + if output_zip_path != '': + logger.info(f"Layout results saved to zip file at {output_zip_path}") + + pages[0]['zip_path'] = output_zip_path + pages[0]['path'] = image_path + + return pages, uid, True, "" + else: + logger.error(f"Error process_type: {process_type}") + raise ValueError(f"Unsupported process_type: '{process_type}'") + except RateLimit as e: + logger.error(f"Rate limit exceeded while processing {image_path}: {str(e)}") + raise + except Exception as e: + logger.error(f"Error processing image {image_path}: {str(e)}") + return [], "", False, str(e) + + async def process_multiple_images( + self, + image_paths: List[str], + output_format: str = 'text', + process_type: str = "layout", + output_path: str = 'Output/', + save_subdir: bool = False, + concurrent_limit: int = 5, + ) -> tuple[List[list], Dict[str, bool]]: + """Process multiple images concurrently with rate limiting + + Args: + image_paths (List[str]): List of image file paths + process_type (str): Type of processing, can be 'layout' + output_format (str): The output format. Defaults to "text". 
Available values are 'text', 'md', ''md_dollar + output_path (str): Path to save the result json and decoded base64 image zip. Defaults to Output. + save_subdir (bool): Save the output to a subfolder under output_path. Defaults to False. + concurrent_limit (int): Maximum number of concurrent processing tasks + + Returns: + Tuple containing: + - List of processing results in order (empty list for failed items) + - Dict mapping image paths to their success status + """ + semaphore = asyncio.Semaphore(concurrent_limit) + + async def process_with_semaphore( + path: str, + index: int, + ) -> tuple[int, str, tuple[list, str, bool]]: + async with semaphore: + logger.debug(f"Processing image {index + 1}/{len(image_paths)}: {path}") + output_result, uid, is_success, fail_reasons = await self.process_image(path, process_type, output_path, save_subdir) + async def save_result_as_md( + image_path: str, + result + ): + + all_results = [] + all_errors = [] + + basename, ext = os.path.basename(image_path).split('.') + + output_formats = [] + + if isinstance(output_format, str): + if "," in output_format: + output_formats = [fmt.strip() for fmt in output_format.split(",")] + else: + output_formats = [output_format] + else: + raise ValueError("Invalid output format, should be a string.") + for fmt in output_formats: + fmt = OutputFormat(fmt) + if isinstance(fmt, OutputFormat): + fmt = fmt.value + + + if fmt in ["md", "md_dollar"]: + if fmt == 'md_dollar': + new_result = copy.deepcopy(result) + new_result[0]['md'] = \ + new_result[0]['md'].replace('\\[', '$$').replace('\\]', '$$').replace('\\(', '$').replace('\\)', '$') + + output_result, fail_reason = await save_md_format( + output_path=output_path, + output_name=f'{basename}_dollar.md', + content=new_result[0]['md'], + save_subdir=save_subdir, + ) + elif fmt == 'md': + new_result = copy.deepcopy(result) + output_result, fail_reason = await save_md_format( + output_path=output_path, + output_name=f'{basename}.md', + 
content=new_result[0]['md'], + save_subdir=save_subdir, + ) + elif fmt in ['text']: + output_result = result + # 此处只会出现 在保存中 出现的错误,text不保存,不会出现错误 + fail_reason = '' + + all_results.append(output_result) + all_errors.append(fail_reason) + + return all_results, all_errors + + # 处理阶段失败的图片 + if not is_success: + results = [] + fail_reasons = fail_reasons + else: + results, save_fail_reasons = await save_result_as_md(image_path=path, result=output_result) + if all(x != '' for x in save_fail_reasons): + fail_reasons = save_fail_reasons + is_success = False + else: + is_success = True + fail_reasons = save_fail_reasons + + + return index, path, (results, fail_reasons, uid, is_success) + + tasks = [process_with_semaphore(path, i) for i, path in enumerate(image_paths)] + results = await asyncio.gather(*tasks, return_exceptions=True) + + processed_results = [[] for _ in range(len(image_paths))] + fail_reasons = ['' for _ in range(len(image_paths))] + success_status = {} + + success_count = 0 + for result in results: + if isinstance(result, Exception): + logger.error(f"Failed to process a file: {str(result)}") + continue + + index, path, (result_list, fail_reason, _, success) = result + + processed_results[index] = result_list if success else [] + fail_reasons[index] = fail_reason if not success else '' + success_status[path] = success + if success: + success_count += 1 + + logger.info( + f"Batch processing completed. 
Successfully processed {success_count}/{len(image_paths)} images" + ) + return processed_results, fail_reasons, success_status + + async def pic2file_back( + self, + pic_file, + process_type: str = "layout", + output_format: str = 'text', + output_path: str = "./Output", + save_subdir: bool = False, + concurrent_limit: Optional[int] = None, + ) -> tuple[List[Union[list, str]], List[dict], bool]: + """Process image files with layout analysis + + Args: + pic_file (str | List[str]): Path to image file(s) or directory + process_type (str): Type of processing, can be 'layout' + output_format (str): The output format. Defaults to "text". Available values are 'text', 'md', ''md_dollar + output_path (str): Path to save the result json and decoded base64 image zip .Defaults to Output. + save_subdir (bool): Save the output to a subfolder under output_path. Defaults to False. + concurrent_limit (int, optional): Maximum number of concurrent tasks. Defaults to None. + + + Returns: + Tuple containing: + - List of results in order (empty string for failed items) + - List of dictionaries containing error information + - Boolean indicating if any errors occurred + """ + if isinstance(pic_file, str): + if os.path.isdir(pic_file): + pic_file, _ = get_files(path=pic_file, mode="img", out="zip") + else: + pic_file = [pic_file] + + + + success_results, failed_results, success_status = await self.process_multiple_images( + image_paths=pic_file, + process_type=process_type, + concurrent_limit=concurrent_limit or 5, + output_path=output_path, + output_format=output_format, + save_subdir=save_subdir, + ) + + + failed_files = [] + has_error = False + + # Convert results to final format + final_results = [] + success_count = 0 + for i, path in enumerate(pic_file): + if not success_status.get(path, False): + failed_files.append({"error": failed_results[i], "path": path}) + final_results.append([]) + has_error = True + logger.error(f"Failed to process {path}, error: {failed_results[i]}") + 
else: + failed_files.append({"error": "", "path": ""}) + final_results.append(success_results[i]) + success_count += 1 + logger.debug(f"Successfully processed {path}") + + if has_error: + logger.error( + f"Processing completed with errors: {len([f for f in failed_files if f['error']])} file(s) failed" + ) + else: + logger.info( + f"Processing completed successfully: {success_count} file(s) processed" + ) + return final_results, failed_files, has_error + + def pic2file( + self, + pic_file, + process_type: str = 'layout', + output_format: str = 'text', + output_path: str = 'Output', + save_subdir: bool = False, + concurrent_limit: Optional[int] = None, + ) -> tuple[List[Union[list, str]], List[dict], bool]: + """Synchronous wrapper for pic2file_back + + Args: + pic_file (str | List[str]): Path to image file(s) or directory + process_type (str): Type of processing, can be 'layout' + output_format (str): The output format. Defaults to "text". Available values are 'text', 'md', ''md_dollar + output_path (str): Path to save the result json and decoded base64 image zip .Defaults to Output. + save_subdir (bool): Save the output to a subfolder under output_path. Defaults to False. + concurrent_limit (int, optional): Maximum number of concurrent tasks. Defaults to None. 
+ + + Returns: + Same as pic2file_back + """ + return run_async( + self.pic2file_back( + pic_file=pic_file, + process_type=process_type, + output_format=output_format, + output_path=output_path, + save_subdir=save_subdir, + concurrent_limit=concurrent_limit, + ) + ) diff --git a/src/pdfdeal/doc2x_legacy.py b/src/pdfdeal/doc2x_legacy.py index b90f284..0f2bad7 100644 --- a/src/pdfdeal/doc2x_legacy.py +++ b/src/pdfdeal/doc2x_legacy.py @@ -43,17 +43,17 @@ async def get_key(apikey: str) -> str: async def pdf2file_v1( - apikey: str, - pdf_path: str, - output_path: str, - output_format: str, - ocr: bool, - maxretry: int, - rpm: int, - convert: bool, - translate: bool = False, - language: str = "zh", - model: str = "deepseek", + apikey: str, + pdf_path: str, + output_path: str, + output_format: str, + ocr: bool, + maxretry: int, + rpm: int, + convert: bool, + translate: bool = False, + language: str = "zh", + model: str = "deepseek", ): """ Convert pdf file to specified file, @@ -110,15 +110,15 @@ async def pdf2file_v1( async def img2file_v1( - apikey: str, - img_path: str, - output_path: str, - output_format: str, - formula: bool, - img_correction: bool, - maxretry: int, - rpm: int, - convert: bool, + apikey: str, + img_path: str, + output_path: str, + output_format: str, + formula: bool, + img_correction: bool, + maxretry: int, + rpm: int, + convert: bool, ) -> str: """ Convert image file to specified file @@ -167,10 +167,10 @@ class Doc2X: """Init the Doc2X class(V1)""" def __init__( - self, - apikey: str = None, - rpm: int = None, - thread: int = None, + self, + apikey: str = None, + rpm: int = None, + thread: int = None, ) -> None: """Init the Doc2X class @@ -201,13 +201,13 @@ def __init__( self.maxretry = None async def pic2file_back( - self, - image_file: list, - output_path: str = "./Output", - output_format: str = "md_dollar", - img_correction: bool = True, - equation=False, - convert: bool = False, + self, + image_file: list, + output_path: str = "./Output", + 
output_format: str = "md_dollar", + img_correction: bool = True, + equation=False, + convert: bool = False, ) -> str: """ Convert image file to specified file, with rate/thread limit @@ -239,14 +239,14 @@ async def limited_img2file_v1(img): return await process_status(image_file, completed_tasks) def pic2file( - self, - image_file, - output_path: str = "./Output", - output_names: list = None, - output_format: str = "md_dollar", - img_correction: bool = True, - equation: bool = False, - convert: bool = False, + self, + image_file, + output_path: str = "./Output", + output_names: list = None, + output_format: str = "md_dollar", + img_correction: bool = True, + equation: bool = False, + convert: bool = False, ) -> Tuple[list, list, bool]: """Convert image file to specified file @@ -309,15 +309,15 @@ def pic2file( return success, failed, flag async def pdf2file_back( - self, - pdf_file: list, - output_path: str = "./Output", - output_format: str = "md_dollar", - ocr: bool = True, - convert: bool = False, - translate: bool = False, - language: str = "zh", - model: str = "deepseek", + self, + pdf_file: list, + output_path: str = "./Output", + output_format: str = "md_dollar", + ocr: bool = True, + convert: bool = False, + translate: bool = False, + language: str = "zh", + model: str = "deepseek", ) -> str: """ Convert pdf file to specified file, with rate/thread limit, async version @@ -351,13 +351,13 @@ async def limited_pdf2file_v1(pdf): return await process_status(pdf_file, completed_tasks) def pdf2file( - self, - pdf_file, - output_path: str = "./Output", - output_names: list = None, - output_format: str = "md_dollar", - ocr: bool = True, - convert: bool = False, + self, + pdf_file, + output_path: str = "./Output", + output_names: list = None, + output_format: str = "md_dollar", + ocr: bool = True, + convert: bool = False, ) -> Tuple[list, list, bool]: """Convert pdf file to specified file @@ -421,12 +421,12 @@ def get_limit(self) -> int: return 
run_async(get_limit(self.apikey)) async def pdfdeal_back( - self, - input: str, - output: str, - path: str, - convert: bool, - limit: asyncio.Semaphore, + self, + input: str, + output: str, + path: str, + convert: bool, + limit: asyncio.Semaphore, ): """ Convert pdf files into recognisable pdfs, significantly improving their effectiveness in RAG systems @@ -465,11 +465,11 @@ async def pdfdeal_back( limit.release() async def pdfdeals( - self, - pdf_files: list, - output_path: str = "./Output", - output_format: str = "pdf", - convert: bool = True, + self, + pdf_files: list, + output_path: str = "./Output", + output_format: str = "pdf", + convert: bool = True, ) -> list: """ Convert pdf files into recognisable pdfs, significantly improving their effectiveness in RAG systems @@ -498,12 +498,12 @@ async def pdfdeals( return success_file, error_file, error_flag def pdfdeal( - self, - pdf_file, - output_format: str = "pdf", - output_names: list = None, - output_path: str = "./Output", - convert: bool = True, + self, + pdf_file, + output_format: str = "pdf", + output_names: list = None, + output_path: str = "./Output", + convert: bool = True, ) -> Tuple[list, list, bool]: """Deal with pdf file, convert it to specified format for RAG system @@ -518,7 +518,7 @@ def pdfdeal( tuple[list,list,str]: will return `list1`,`list2`,`bool` `list1`: list of successful files path, if some files are failed, its path will be empty string - `list2`: list of failed files's error message and its original file path, id some files are successful, its error message will be empty string + `list2`: list of failed file's error message and its original file path, id some files are successful, its error message will be empty string `bool`: True means that at least one file process failed """ output_format = RAG_OutputType(output_format) @@ -544,4 +544,4 @@ def pdfdeal( if output_names is not None: success = list_rename(success, output_names) - return success, failed, flag + return success, failed, 
def load_v3_result(json_path: str) -> Dict[str, Any]:
    """Load a Doc2X v3 JSON file and return the result object.

    Three layouts are accepted, checked in this order:
      1. a top-level object that itself has a list under ``pages``;
      2. a wrapper with an object under ``data.result`` (returned as-is);
      3. a wrapper with an object under ``result`` that has a list under ``pages``.

    Raises:
        V3ValidationError: If the payload matches none of the layouts above.
    """
    unsupported_message = (
        "Unsupported v3 JSON structure. Expected either raw `result` with `pages`, "
        "or a wrapped response containing `data.result.pages`."
    )

    payload = json.loads(Path(json_path).read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        raise V3ValidationError(unsupported_message)

    # Layout 1: the file already is the raw result.
    if isinstance(payload.get("pages"), list):
        return payload

    # Layout 2: full API response, result nested under `data`.
    wrapped = payload.get("data")
    if isinstance(wrapped, dict):
        nested = wrapped.get("result")
        if isinstance(nested, dict):
            return nested

    # Layout 3: result nested directly under `result`.
    direct = payload.get("result")
    if isinstance(direct, dict) and isinstance(direct.get("pages"), list):
        return direct

    raise V3ValidationError(unsupported_message)
def _validate_block_within_page(
    block_xyxy: Tuple[float, float, float, float],
    page_xyxy: Tuple[float, float, float, float],
    block_id: str,
) -> None:
    """Raise if any edge of the block box lies outside the page box.

    Both boxes are ``(x1, y1, x2, y2)``; touching the page edge is allowed.

    Raises:
        V3ValidationError: If the block extends past the page on any side.
    """
    block_left, block_top, block_right, block_bottom = block_xyxy
    page_left, page_top, page_right, page_bottom = page_xyxy

    overflows = (
        block_left < page_left,
        block_top < page_top,
        block_right > page_right,
        block_bottom > page_bottom,
    )
    if not any(overflows):
        return

    raise V3ValidationError(
        f"Block {block_id} bbox {list(block_xyxy)} exceeds page bounds "
        f"{list(page_xyxy)}."
    )
def _crop_box_in_pixels(
    block_xyxy: Tuple[float, float, float, float],
    page_xyxy: Tuple[float, float, float, float],
    image_width: int,
    image_height: int,
) -> Tuple[int, int, int, int]:
    """Map a block box from page coordinates to a pixel crop box.

    The block box is expressed relative to the page box origin, scaled to the
    rendered image size, rounded, and then clamped so that the result lies
    inside the image and is at least one pixel wide and one pixel tall.

    Returns:
        ``(x1, y1, x2, y2)`` in image pixels.
    """
    origin_x, origin_y, far_x, far_y = page_xyxy
    span_x = far_x - origin_x
    span_y = far_y - origin_y

    def to_pixel(coord: float, origin: float, span: float, size: int) -> int:
        # Normalise to the page extent, then scale to the rendered image.
        return int(round((coord - origin) / span * size))

    left = to_pixel(block_xyxy[0], origin_x, span_x, image_width)
    top = to_pixel(block_xyxy[1], origin_y, span_y, image_height)
    right = to_pixel(block_xyxy[2], origin_x, span_x, image_width)
    bottom = to_pixel(block_xyxy[3], origin_y, span_y, image_height)

    # The top-left corner must address an existing pixel ...
    left = max(0, min(left, image_width - 1))
    top = max(0, min(top, image_height - 1))
    # ... and the bottom-right corner must leave a non-empty crop.
    right = max(left + 1, min(right, image_width))
    bottom = max(top + 1, min(bottom, image_height))
    return left, top, right, bottom
def build_parser(target_kind: str) -> argparse.ArgumentParser:
    """Create the command-line parser shared by the crop extraction scripts.

    Args:
        target_kind: Word inserted into the description (e.g. "figure").

    Returns:
        A parser with the required options ``--pdf``, ``--v3-json``,
        ``--dpi`` (parsed as int) and ``--output-dir``.
    """
    description = "".join(
        [
            f"Extract {target_kind} crops from a PDF using Doc2X v3 JSON. ",
            "The script first validates that the v3 JSON matches the crop rules, ",
            "renders only pages containing target blocks, saves full-page PNGs ",
            "under `_pages/`, and writes cropped images plus `manifest.json`.",
        ]
    )

    # One (flag, extra keyword arguments) entry per option; all are required.
    option_table = [
        ("--pdf", {"help": "Path to the source PDF."}),
        (
            "--v3-json",
            {"help": "Path to the raw v3 JSON (`result`) or wrapped response JSON."},
        ),
        (
            "--dpi",
            {
                "type": int,
                "help": "Render DPI used for the page PNGs and final crops.",
            },
        ),
        (
            "--output-dir",
            {
                "help": "Directory where crops, page PNGs, and manifest.json are written.",
            },
        ),
    ]

    parser = argparse.ArgumentParser(description=description)
    for flag, extra in option_table:
        parser.add_argument(flag, required=True, **extra)
    return parser
def _skip_if_doc2x_internal_error(flag, failed):
    """Skip the running test when Doc2X reported a server-side ``internal_error``.

    Args:
        flag: Error flag returned by ``pdf2file``; falsy means nothing failed.
        failed: Per-file failure entries returned by ``pdf2file``. An entry is
            normally a dict whose ``error`` value is a string, or a list of
            strings when several output formats were requested.

    Every entry is inspected (not only the first one) and its error value is
    stringified first, so that a list-valued ``error`` is searched by substring
    instead of by exact list membership.
    """
    if not flag or not failed:
        return

    error_texts = [
        str(item.get("error", "")) if isinstance(item, dict) else str(item)
        for item in failed
    ]
    matches = [text for text in error_texts if "internal_error" in text]
    if matches:
        pytest.skip(f"Doc2X service internal_error: {matches[0]}")
def _count_pdf_files(path: str) -> int:
    """Return how many files below `path` (recursively) end in ``.pdf``.

    The extension check is case-insensitive.
    """
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            if filename.lower().endswith(".pdf"):
                total += 1
    return total
def test_pdf2file_invalid_formula_level():
    """A formula_level outside 0, 1, 2 must be rejected with a ValueError."""
    expected_message = "formula_level must be one of 0, 1, 2"
    request = {
        "pdf_file": "tests/pdf/formula_level.pdf",
        "output_path": "./Output/test/single/pdf2file",
        "output_format": "md",
        "formula_level": 3,
    }

    # A placeholder key is passed so that no DOC2X_APIKEY is required here.
    client = _build_client(apikey="test_apikey")
    with pytest.raises(ValueError, match=expected_message):
        client.pdf2file(**request)
def test_pdf2json_without_output_names_uses_pdf_basename(monkeypatch, tmp_path):
    """Without output_names, each saved JSON is named after its source PDF."""
    pdf_inputs = ["tests/pdf/sample.pdf", "tests/pdf/sample_bad.pdf"]
    expected_names = ["sample.json", "sample_bad.json"]

    async def fake_parse_pdf(**kwargs):
        # Stand-in for the remote parse call: one page whose text names the PDF.
        page_text = f"text for {os.path.basename(kwargs['pdf_path'])}"
        location = {
            "url": "",
            "page_idx": 0,
            "page_width": 100,
            "page_height": 200,
        }
        raw_result = {"pages": [{"md": page_text}]}
        return "uid", [page_text], [location], raw_result

    monkeypatch.setattr("pdfdeal.doc2x.parse_pdf", fake_parse_pdf)
    monkeypatch.setattr("pdfdeal.doc2x.get_pdf_page_count", lambda _: 1)

    client = _build_client(apikey="test_apikey")
    conversion = client.pdf2file_back(
        pdf_file=pdf_inputs,
        output_path=str(tmp_path),
        output_format="json",
    )
    success, failed, flag = asyncio.run(conversion)

    assert flag is False
    saved_names = [os.path.basename(path) for path in success]
    assert saved_names == expected_names
    for path in success:
        assert os.path.isfile(path)
    for item in failed:
        assert item["error"] == ""
failed[0]["error"] == "" + + +def test_pdf2text_v3_model_saves_raw_v3_json_sidecar(monkeypatch, tmp_path): + client = _build_client(apikey="test_apikey") + raw_result = { + "pages": [ + { + "page_idx": 0, + "md": "page markdown", + "layout": { + "blocks": [ + {"id": "blk_0", "type": "Text", "text": "page markdown"} + ] + }, + } + ] + } + + async def fake_parse_pdf(**kwargs): + return "uid", ["page markdown"], [{ + "url": "", + "page_idx": 0, + "page_width": 100, + "page_height": 200, + }], raw_result + + monkeypatch.setattr("pdfdeal.doc2x.parse_pdf", fake_parse_pdf) + monkeypatch.setattr("pdfdeal.doc2x.get_pdf_page_count", lambda _: 1) + + success, failed, flag = asyncio.run( + client.pdf2file_back( + pdf_file="tests/pdf/sample.pdf", + output_path=str(tmp_path), + output_format="text", + model=V2ParseModel.V3_2026, + ) + ) + + sidecar_path = os.path.join(str(tmp_path), "sample.json") + + assert flag is False + assert success[0] == "page markdown" + assert os.path.isfile(sidecar_path) + with open(sidecar_path, "r", encoding="utf-8") as f: + saved_json = json.load(f) + assert saved_json == raw_result + assert failed[0]["error"] == "" + + +def test_pdf2detailed_v3_model_saves_raw_v3_json_sidecar(monkeypatch, tmp_path): + client = _build_client(apikey="test_apikey") + raw_result = { + "pages": [ + { + "page_idx": 0, + "md": "page markdown", + "layout": { + "blocks": [ + {"id": "blk_0", "type": "Text", "text": "page markdown"} + ] + }, + } + ] + } + + async def fake_parse_pdf(**kwargs): + return "uid", ["page markdown"], [{ + "url": "", + "page_idx": 0, + "page_width": 100, + "page_height": 200, + }], raw_result + + monkeypatch.setattr("pdfdeal.doc2x.parse_pdf", fake_parse_pdf) + monkeypatch.setattr("pdfdeal.doc2x.get_pdf_page_count", lambda _: 1) + + success, failed, flag = asyncio.run( + client.pdf2file_back( + pdf_file="tests/pdf/sample.pdf", + output_path=str(tmp_path), + output_format="detailed", + model=V2ParseModel.V3_2026, + ) + ) + + sidecar_path = 
os.path.join(str(tmp_path), "sample.json") + + assert flag is False + assert success[0] == [{ + "text": "page markdown", + "location": { + "url": "", + "page_idx": 0, + "page_width": 100, + "page_height": 200, + }, + }] + assert os.path.isfile(sidecar_path) + with open(sidecar_path, "r", encoding="utf-8") as f: + saved_json = json.load(f) + assert saved_json == raw_result + assert failed[0]["error"] == "" + + +def test_pdf2file_v3_logs_sidecar_json_naming_rule(monkeypatch, tmp_path, capsys): + client = _build_client(apikey="test_apikey") + raw_result = { + "pages": [ + { + "page_idx": 0, + "md": "page markdown", + "layout": {"blocks": [{"id": "blk_0", "type": "Text"}]}, + } + ] + } + + async def fake_parse_pdf(**kwargs): + return "uid", ["page markdown"], [{ + "url": "", + "page_idx": 0, + "page_width": 100, + "page_height": 200, + }], raw_result + + monkeypatch.setattr("pdfdeal.doc2x.parse_pdf", fake_parse_pdf) + monkeypatch.setattr("pdfdeal.doc2x.get_pdf_page_count", lambda _: 1) + + success, failed, flag = asyncio.run( + client.pdf2file_back( + pdf_file="tests/pdf/sample.pdf", + output_names=[["plain.txt", "viz.data"]], + output_path=str(tmp_path), + output_format="text,json", + model=V2ParseModel.V3_2026, + ) + ) + captured = capsys.readouterr() + + assert flag is False + assert os.path.basename(success[0][1]) == "viz.json" + assert failed[0]["error"] == ["", ""] + assert "using output_names[1] because output_format includes \"json\"" in captured.err + assert "Saved to" in captured.err + +# 测试一个文件,output_format为md_dollar,tex,docx def test_single_pdf2file(): - client = Doc2X(debug=True, thread=1) + client = _build_client() + output_path = "./Output/test/single/pdf2file" filepath, failed, flag = client.pdf2file( pdf_file="tests/pdf/sample.pdf", - output_path="./Output/test/single/pdf2file", - output_names=["Test.zip"], - output_format="md_dollar", - ) - if filepath[0] != "": - assert os.path.exists(filepath[0]) - assert os.path.isfile(filepath[0]) - assert 
filepath[0].endswith(".zip") - assert os.path.basename(filepath[0]) == "Test.zip" + output_path=output_path, + output_format="md_dollar,tex,docx", + save_subdir=False, + ) print(filepath) print(failed) print(flag) + _skip_if_transient_integration_error(flag, failed) + assert flag == False + assert os.path.dirname(filepath[0][0]) == output_path + assert os.path.dirname(filepath[0][1]) == output_path + assert os.path.dirname(filepath[0][2]) == output_path +# 测试一个文件,output_format为md_dollar,tex,docx,同时保存到子文件夹下 +def test_single_pdf2file_with_subdir(): + client = _build_client() + output_path = "./Output/test/single/pdf2file" + filepath, failed, flag = client.pdf2file( + pdf_file="tests/pdf/sample.pdf", + output_path=output_path, + output_format="md_dollar,tex,docx", + save_subdir=True, + ) + print(filepath) + print(failed) + print(flag) + _skip_if_transient_integration_error(flag, failed) + assert flag == False + assert os.path.dirname(filepath[0][0]) == os.path.join(output_path, "sample") + assert os.path.dirname(filepath[0][1]) == os.path.join(output_path, "sample") + assert os.path.dirname(filepath[0][2]) == os.path.join(output_path, "sample") +# 测试非法的输出格式 def test_error_input_pdf2file(): - client = Doc2X(debug=True, thread=1) + client = _build_client() with pytest.raises(ValueError): client.pdf2file( pdf_file="tests/pdf/sample.pdf", @@ -31,53 +456,31 @@ def test_error_input_pdf2file(): output_format="md_dallar", ) - -def test_multiple_pdf2file(): - client = Doc2X(debug=True, thread=1) - success, failed, flag = client.pdf2file( +@pytest.mark.parametrize("save_subdir", [False, True], ids=["flat", "with_subdir"]) +def test_multiple_pdf2file(save_subdir: bool): + client = _build_client() + expected_count = _count_pdf_files("tests/pdf") + output_path, failed, flag = client.pdf2file( pdf_file="tests/pdf", output_path="./Output/test/multiple/pdf2file", output_format="docx", + save_subdir=save_subdir, ) - assert flag - assert len(success) == len(failed) == 3 - for s in 
success: - if s != "": - assert s.endswith("sample.docx") or s.endswith("sampleB.docx") - print(success) + print(output_path) print(failed) print(flag) + assert flag + _assert_multiple_pdf2file_result(output_path, failed, expected_count) - +# 测试格式错误或者损坏的pdf文件 def test_all_fail_pdf2file(): - client = Doc2X(debug=True, thread=1) - success, failed, flag = client.pdf2file( + client = _build_client() + output_path, failed, flag = client.pdf2file( pdf_file="tests/pdf/sample_bad.pdf", output_path="./Output/test/allfail/pdf2file", output_format="md", ) - assert flag - assert len(success) == 1 - assert len(failed) == 1 - print(success) - print(failed) - print(flag) - - -def test_multiple_outtypes(): - client = Doc2X(debug=True, thread=1) - success, failed, flag = client.pdf2file( - pdf_file="tests/pdf/sample.pdf", - output_path="./Output/test/multiple_outtypes/pdf2file", - output_names=[["sample1.docx", "sample2.zip"]], - output_format="docx,md", - ) - assert len(success) == 1 - assert len(failed) == 1 - for s in success: - if isinstance(s, list): - for i in s: - assert i.endswith("sample1.docx") or i.endswith("sample2.zip") - print(success) + print(output_path) print(failed) print(flag) + assert flag diff --git a/tests/test_pic2file.py b/tests/test_pic2file.py index 0bdabc8..883bb53 100644 --- a/tests/test_pic2file.py +++ b/tests/test_pic2file.py @@ -1,56 +1,98 @@ -# from pdfdeal import Doc2X -# from pdfdeal.file_tools import get_files -# import os -# import logging - -# httpx_logger = logging.getLogger("httpx") -# httpx_logger.setLevel(logging.WARNING) -# logging.basicConfig(level=logging.INFO) - - -# def test_single_pic2file(): -# client = Doc2X() -# filepath, _, _ = client.pic2file( -# image_file="tests/image/sample.png", -# output_path="./Output/test/single/pic2file", -# output_names=["pic_sample1.docx"], -# output_format="docx", -# ) -# if filepath[0] != "": -# assert os.path.exists(filepath[0]) -# assert os.path.isfile(filepath[0]) -# assert 
filepath[0].endswith(".docx") -# assert os.path.basename(filepath[0]) == "pic_sample1.docx" - - -# def test_multiple_pic2file(): -# client = Doc2X() -# file_list, rename = get_files("tests/image", "img", "docx") -# success, failed, flag = client.pic2file( -# image_file=file_list, -# output_path="./Output/test/multiple/pic2file", -# output_names=rename, -# output_format="docx", -# ) -# assert flag -# assert len(success) == len(failed) == 3 -# for s in success: -# if s != "": -# assert s.endswith("sample1.docx") or s.endswith("sample.docx") - - -# # def test_multiple_high_rpm(): -# # client = Doc2X() -# # file_list = ["tests/image/sample.png" for _ in range(20)] -# # success, failed, flag = client.pic2file( -# # image_file=file_list, -# # output_path="./Output/test", -# # ) -# # assert len(success) == len(failed) == 20 -# # i = 0 -# # for s in success: -# # if s != "": -# # assert s.endswith(".zip") -# # else: -# # i += 1 -# # print(f"===Failed {i} times===") +from pdfdeal import Doc2X +import os +import pytest +from typing import Optional + + +def _require_apikey() -> str: + apikey = os.getenv("DOC2X_APIKEY") + if not apikey: + pytest.skip("DOC2X_APIKEY is required for integration tests") + return apikey + + +def _build_client(apikey: Optional[str] = None) -> Doc2X: + return Doc2X(apikey=apikey or _require_apikey(), debug=True, thread=1) + + +def test_single_pic2file(): + client = _build_client() + success, failed, flag = client.piclayout( + pic_file="tests/image/sample.png", + output_path="./Output/test/single/pic2file", + ) + print(success) + print(failed) + print(flag) + assert flag is False + assert isinstance(success, list) + assert len(success) == 1 + assert isinstance(success[0], list) + assert len(success[0]) == 1 + assert isinstance(success[0][0], list) + assert len(success[0][0]) > 0 + assert isinstance(success[0][0][0], dict) + assert "md" in success[0][0][0] + assert "zip_path" in success[0][0][0] + assert "path" in success[0][0][0] + assert 
failed[0]["error"] == "" + + +def test_multiple_pic2file(): + client = _build_client() + success, failed, flag = client.piclayout( + pic_file="tests/image", + output_path="./Output/test/multiple/pic2file", + ) + print(success) + print(failed) + print(flag) + assert flag + assert isinstance(success, list) + assert len(success) == 3 + assert len(failed) == 3 + for idx, result in enumerate(success): + if not result: + assert failed[idx]["error"] != "" + assert failed[idx]["path"] != "" + else: + assert failed[idx]["error"] == "" + + +def test_multiple_high_rpm(): + client = _build_client() + file_list = ["tests/image/sample.png" for _ in range(30)] + success, failed, flag = client.piclayout( + pic_file=file_list, + output_path="./Output/test/highrpm/pic2file", + ) + print(success) + print(failed) + print(flag) + assert flag is False + assert isinstance(success, list) + assert len(success) == 30 + assert all(isinstance(item, list) and item for item in success) + assert all(item["error"] == "" for item in failed) + + +def test_piclayout(): + client = _build_client() + success, failed, flag = client.piclayout( + pic_file="tests/image/sample.png", + ) + print(success) + print(failed) + print(flag) + assert flag is False + assert isinstance(success, list) + assert len(success) == 1 + assert isinstance(success[0], list) + assert len(success[0]) == 1 + assert isinstance(success[0][0], list) + assert len(success[0][0]) > 0 + assert isinstance(success[0][0][0], dict) + assert "md" in success[0][0][0] + assert "zip_path" in success[0][0][0] + assert "path" in success[0][0][0] + assert failed[0]["error"] == "" diff --git a/tests/test_types.py b/tests/test_types.py new file mode 100644 index 0000000..6ecb6ac --- /dev/null +++ b/tests/test_types.py @@ -0,0 +1,92 @@ +import asyncio +import importlib + +import pytest + +from pdfdeal.Doc2X.Types import ( + FormulaLevel, + V2ParseModel, + normalize_formula_level, + normalize_v2_parse_model, +) + +convert_v2 = 
importlib.import_module("pdfdeal.Doc2X.ConvertV2") + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + (None, 0), + (FormulaLevel.KEEP_MARKDOWN, 0), + (FormulaLevel.INLINE_TO_TEXT, 1), + (FormulaLevel.ALL_TO_TEXT, 2), + (0, 0), + (1, 1), + (2, 2), + ("0", 0), + ("1", 1), + ("2", 2), + ], +) +def test_normalize_formula_level_valid(value, expected): + assert normalize_formula_level(value) == expected + + +@pytest.mark.parametrize("value", [-1, 3, "bad", True, False]) +def test_normalize_formula_level_invalid(value): + with pytest.raises(ValueError, match="formula_level must be one of 0, 1, 2"): + normalize_formula_level(value) + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + (None, ""), + ("", ""), + (" ", ""), + ("v2", ""), + ("V2", ""), + (V2ParseModel.V2, ""), + ("v3-2026", "v3-2026"), + ("V3-2026", "v3-2026"), + (V2ParseModel.V3_2026, "v3-2026"), + ], +) +def test_normalize_v2_parse_model_valid(value, expected): + assert normalize_v2_parse_model(value) == expected + + +def test_normalize_v2_parse_model_invalid(): + with pytest.raises(ValueError, match="is not a valid V2ParseModel"): + normalize_v2_parse_model("v3") + + +def test_upload_pdf_rejects_deprecated_direct_upload_mode(): + with pytest.raises(ValueError, match="direct upload endpoint has been deprecated"): + asyncio.run( + convert_v2.upload_pdf( + "test_apikey", "tests/pdf/sample.pdf", oss_choose="never" + ) + ) + + +def test_upload_pdf_auto_still_uses_preupload(monkeypatch): + calls = [] + + async def fake_upload_pdf_big(apikey, pdffile, model=None): + calls.append((apikey, pdffile, model)) + return "uid_123" + + monkeypatch.setattr(convert_v2, "upload_pdf_big", fake_upload_pdf_big) + + uid = asyncio.run( + convert_v2.upload_pdf( + "test_apikey", + "tests/pdf/sample.pdf", + oss_choose="auto", + model=V2ParseModel.V3_2026, + ) + ) + + assert uid == "uid_123" + assert calls == [("test_apikey", "tests/pdf/sample.pdf", V2ParseModel.V3_2026)] diff --git 
a/tests/test_v3_media_crop_scripts.py b/tests/test_v3_media_crop_scripts.py new file mode 100644 index 0000000..e8ba0f9 --- /dev/null +++ b/tests/test_v3_media_crop_scripts.py @@ -0,0 +1,89 @@ +import json +from pathlib import Path + +import fitz +from pdfdeal.v3_media import extract_v3_figure_images, extract_v3_table_images + + +def _build_test_pdf(pdf_path: Path) -> None: + doc = fitz.open() + page = doc.new_page(width=200, height=200) + page.draw_rect(fitz.Rect(20, 20, 120, 100), color=(1, 0, 0), fill=(1, 0.8, 0.8)) + page.draw_rect( + fitz.Rect(40, 120, 180, 180), color=(0, 0, 1), fill=(0.8, 0.8, 1) + ) + doc.save(pdf_path) + doc.close() + + +def _build_test_v3_json(json_path: Path) -> None: + payload = { + "pages": [ + { + "page_idx": 0, + "page_width": 1000, + "page_height": 1000, + "layout": { + "blocks": [ + { + "id": "figure_0", + "type": "Figure", + "bbox": [100, 100, 600, 500], + }, + { + "id": "table_0", + "type": "Table", + "bbox": [200, 600, 900, 900], + }, + ] + }, + } + ] + } + json_path.write_text(json.dumps(payload), encoding="utf-8") + + +def test_extract_v3_figure_crops(tmp_path): + pdf_path = tmp_path / "sample.pdf" + json_path = tmp_path / "sample.json" + output_dir = tmp_path / "figures" + + _build_test_pdf(pdf_path) + _build_test_v3_json(json_path) + + summary = extract_v3_figure_images( + pdf_path=str(pdf_path), + v3_json_path=str(json_path), + dpi=144, + output_dir=str(output_dir), + ) + + assert summary["crop_count"] == 1 + assert summary["page_count_with_targets"] == 1 + assert (output_dir / "_pages" / "page_0001.png").is_file() + assert (output_dir / "manifest.json").is_file() + crop_path = Path(summary["items"][0]["crop_path"]) + assert crop_path.is_file() + + +def test_extract_v3_table_crops(tmp_path): + pdf_path = tmp_path / "sample.pdf" + json_path = tmp_path / "sample.json" + output_dir = tmp_path / "tables" + + _build_test_pdf(pdf_path) + _build_test_v3_json(json_path) + + summary = extract_v3_table_images( + 
pdf_path=str(pdf_path), + v3_json_path=str(json_path), + dpi=144, + output_dir=str(output_dir), + ) + + assert summary["crop_count"] == 1 + assert summary["page_count_with_targets"] == 1 + assert (output_dir / "_pages" / "page_0001.png").is_file() + assert (output_dir / "manifest.json").is_file() + crop_path = Path(summary["items"][0]["crop_path"]) + assert crop_path.is_file()