NoEdgeAI · Muxv · Mar 18, 2026 · Feb 10, 2025 · Feb 13, 2025 · Feb 14, 2025
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,8 @@ node_modules
 my-docs
 Output
 self_use.py
+.history/
+tmp/
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/README.md b/README.md
@@ -104,6 +104,8 @@ success, failed, flag = client.pdf2file(
     pdf_file="tests/pdf",
     output_path="./Output",
     output_format="docx",
+    model="v3-2026",  # optional, default is server-side v2
+    formula_level=1,  # optional: 0 (default, recommended) = keep formulas; 1 = convert inline formulas to text; 2 = convert all formulas (inline and block) to text
 )
 print(success)
 print(failed)
@@ -127,4 +129,87 @@ print(failed)
 print(flag)
 ```
 
+### V3 JSON updates
+
+When `model="v3-2026"`:
+
+- `output_format="json"` now saves the raw Doc2X v3 JSON (`result.pages...`) instead of the legacy simplified `[{text, location}]` structure.
+- Raw v3 JSON is always saved as a sidecar `.json` file, even when `output_format` does not include `json` (for example `text`, `detailed`, `md`, `docx`).
+- If `output_format` includes `json`, the sidecar JSON name follows the `json` slot in `output_names`.
+- If `output_format` does not include `json`, the sidecar JSON name follows the first non-empty entry in `output_names`.
+- If `output_names` is omitted, the sidecar JSON falls back to the original PDF basename.
+- The deprecated direct upload is no longer used. `oss_choose="always"` and `oss_choose="auto"` both use the preupload API; `oss_choose="never"` and `oss_choose="none"` now raise an error.
+
+Example:
+
+```python
+from pdfdeal import Doc2X
+
+client = Doc2X(apikey="Your API key", debug=True)
+success, failed, flag = client.pdf2file(
+    pdf_file="tests/pdf/sample.pdf",
+    output_path="./Output/test/v3",
+    output_format="text,json",
+    output_names=[["plain.txt", "viz.data"]],
+    model="v3-2026",
+)
+print(success)  # ["page text...", "./Output/test/v3/viz.json"]
+print(failed)
+print(flag)
+```
+
+### Helper scripts for v3 figure/table crops
+
+Two helper scripts were added under [`scripts/`](scripts/):
+
+- [`extract_v3_figures.py`](scripts/extract_v3_figures.py): extract figure crops from a PDF using Doc2X v3 JSON
+- [`extract_v3_tables.py`](scripts/extract_v3_tables.py): extract table crops from a PDF using Doc2X v3 JSON
+
+Both scripts:
+
+- validate that the v3 JSON matches the crop rules first
+- render only pages containing target blocks with `fitz` at the requested `dpi`
+- save full-page PNGs under `_pages/`
+- crop target regions using the block `bbox/xyxy` and page coordinates from the v3 JSON
+- write `manifest.json` with crop metadata
+
+Examples:
+
+```bash
+python scripts/extract_v3_figures.py \
+  --pdf /path/to/input.pdf \
+  --v3-json /path/to/input_v3.json \
+  --dpi 200 \
+  --output-dir ./Output/figures
+```
+
+```bash
+python scripts/extract_v3_tables.py \
+  --pdf /path/to/input.pdf \
+  --v3-json /path/to/input_v3.json \
+  --dpi 200 \
+  --output-dir ./Output/tables
+```
+
+You can also import the helpers directly:
+
+```python
+from pdfdeal import extract_v3_figure_images, extract_v3_table_images
+
+figure_summary = extract_v3_figure_images(
+    pdf_path="/path/to/input.pdf",
+    v3_json_path="/path/to/input_v3.json",
+    dpi=200,
+    output_dir="./Output/figures",
+)
+table_summary = extract_v3_table_images(
+    pdf_path="/path/to/input.pdf",
+    v3_json_path="/path/to/input_v3.json",
+    dpi=200,
+    output_dir="./Output/tables",
+)
+print(figure_summary["crop_count"], figure_summary["manifest_path"])
+print(table_summary["crop_count"], table_summary["manifest_path"])
+```
+
 See the online documentation for details.
diff --git a/README_CN.md b/README_CN.md
@@ -102,6 +102,8 @@ success, failed, flag = client.pdf2file(
     pdf_file="tests/pdf",
     output_path="./Output",
     output_format="docx",
+    model="v3-2026",  # 可选，不填则使用服务端默认 v2
+    formula_level=1,  # 可选：0（默认，推荐）不降级；1 仅降级行内公式（\(...\)、$...$）；2 降级所有公式（含 \[...\]、$$...$$）
 )
 print(success)
 print(failed)
@@ -125,4 +127,87 @@ print(failed)
 print(flag)
 ```
 
-更多详细请参见在线文档。
+### V3 JSON 更新
+
+当 `model="v3-2026"` 时：
+
+- `output_format="json"` 现在会保存 Doc2X 原始 v3 JSON（`result.pages...`），不再保存旧的简化 `[{text, location}]` 结构。
+- 即使 `output_format` 不包含 `json`（例如 `text`、`detailed`、`md`、`docx`），也会额外保存一份 sidecar `.json`。
+- 如果 `output_format` 包含 `json`，sidecar JSON 的命名会跟随 `output_names` 里 `json` 这一槽位。
+- 如果 `output_format` 不包含 `json`，sidecar JSON 的命名会跟随 `output_names` 里第一个非空名字。
+- 如果没有传 `output_names`，sidecar JSON 会回退到原 PDF 文件名。
+- 已不再使用过期的小文件直传。`oss_choose="always"` 和 `oss_choose="auto"` 都会走 preupload；`oss_choose="never"` / `oss_choose="none"` 会直接报错。
+
+示例：
+
+```python
+from pdfdeal import Doc2X
+
+client = Doc2X(apikey="Your API key", debug=True)
+success, failed, flag = client.pdf2file(
+    pdf_file="tests/pdf/sample.pdf",
+    output_path="./Output/test/v3",
+    output_format="text,json",
+    output_names=[["plain.txt", "viz.data"]],
+    model="v3-2026",
+)
+print(success)  # ["页面文本...", "./Output/test/v3/viz.json"]
+print(failed)
+print(flag)
+```
+
+### V3 figure/table 裁剪辅助脚本
+
+在 [`scripts/`](scripts/) 下新增了两个辅助脚本：
+
+- [`extract_v3_figures.py`](scripts/extract_v3_figures.py)：基于 Doc2X v3 JSON 从 PDF 中裁剪 figure 图片
+- [`extract_v3_tables.py`](scripts/extract_v3_tables.py)：基于 Doc2X v3 JSON 从 PDF 中裁剪 table 图片
+
+这两个脚本都会：
+
+- 先校验 v3 JSON 是否符合裁剪规则
+- 用 `fitz` 按指定 `dpi` 只渲染包含目标 block 的页面
+- 将整页 PNG 保存到 `_pages/`
+- 根据 v3 JSON 中的 block `bbox/xyxy` 和 page 坐标裁剪出目标区域
+- 输出带裁剪元数据的 `manifest.json`
+
+示例：
+
+```bash
+python scripts/extract_v3_figures.py \
+  --pdf /path/to/input.pdf \
+  --v3-json /path/to/input_v3.json \
+  --dpi 200 \
+  --output-dir ./Output/figures
+```
+
+```bash
+python scripts/extract_v3_tables.py \
+  --pdf /path/to/input.pdf \
+  --v3-json /path/to/input_v3.json \
+  --dpi 200 \
+  --output-dir ./Output/tables
+```
+
+你也可以直接 import 这些工具函数：
+
+```python
+from pdfdeal import extract_v3_figure_images, extract_v3_table_images
+
+figure_summary = extract_v3_figure_images(
+    pdf_path="/path/to/input.pdf",
+    v3_json_path="/path/to/input_v3.json",
+    dpi=200,
+    output_dir="./Output/figures",
+)
+table_summary = extract_v3_table_images(
+    pdf_path="/path/to/input.pdf",
+    v3_json_path="/path/to/input_v3.json",
+    dpi=200,
+    output_dir="./Output/tables",
+)
+print(figure_summary["crop_count"], figure_summary["manifest_path"])
+print(table_summary["crop_count"], table_summary["manifest_path"])
+```
+
+更多详细请参见在线文档。
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,22 +1,37 @@
 [project]
 name = "pdfdeal"
-version = "1.0.2"
-authors = [{ name = "Menghuan1918", email = "menghuan@menghuan1918.com" }]
-description = "A python wrapper for the Doc2X API and comes with native texts processing (to improve texts recall in RAG)."
+version = "1.0.4"
+authors = [{ name = "noedgeai", email = "support@noedgeai.com" }]
+description = "Python SDK for Doc2X API and some native texts processing (to improve texts recall in RAG)."
 readme = "README.md"
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 classifiers = [
     "Programming Language :: Python :: 3",
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
-dependencies = ["httpx[http2]>=0.23.1, <1", "pypdf"]
+dependencies = [
+    "aiofiles>=24.1.0",
+    "cryptography>=46.0.5",
+    "h2>=4.3.0",
+    "httpx[http2]>=0.23.1, <1",
+    "pypdf>=6.8.0",
+    "pytest>=8.3.5",
+    "urllib3>=2.6.3",
+]
 
 [project.optional-dependencies]
-tools = ["emoji", "Pillow", "reportlab", "beautifulsoup4"]
+tools = [
+    "emoji",
+    "Pillow>=12.1.1; python_version>='3.10'",
+    "Pillow>=10.4.0,<12.0.0; python_version<'3.10'",
+    "reportlab",
+    "beautifulsoup4",
+]
 rag = [
     "emoji",
-    "Pillow",
+    "Pillow>=12.1.1; python_version>='3.10'",
+    "Pillow>=10.4.0,<12.0.0; python_version<'3.10'",
     "reportlab",
     "oss2",
     "boto3",
@@ -26,7 +41,8 @@ rag = [
 dev = [
     "pytest",
     "emoji",
-    "Pillow",
+    "Pillow>=12.1.1; python_version>='3.10'",
+    "Pillow>=10.4.0,<12.0.0; python_version<'3.10'",
     "reportlab",
     "oss2",
     "boto3",
@@ -35,10 +51,10 @@ dev = [
 ]
 
 [project.urls]
-Issues = "https://github.com/Menghuan1918/pdfdeal/issues"
-Documentation = "https://menghuan1918.github.io/pdfdeal-docs/"
-Source = "https://github.com/Menghuan1918/pdfdeal"
-Changelog = "https://menghuan1918.github.io/pdfdeal-docs/changes/"
+Issues = "https://github.com/NoEdgeAI/pdfdeal/issues"
+Documentation = "https://noedgeai.github.io/pdfdeal-docs"
+Source = "https://github.com/NoEdgeAI/pdfdeal"
+Changelog = "https://noedgeai.github.io/pdfdeal-docs/changes"
 
 [project.scripts]
 doc2x = "pdfdeal.CLI.doc2x:main"

diff --git a/scripts/extract_v3_figures.py b/scripts/extract_v3_figures.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+from pathlib import Path
+import sys
+
+try:
+    from pdfdeal.v3_media import run_cli
+except ImportError:  # pragma: no cover - local repo execution fallback
+    sys.modules.pop("pdfdeal", None)
+    sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src"))
+    from pdfdeal.v3_media import run_cli
+
+
+if __name__ == "__main__":
+    raise SystemExit(run_cli("figure"))
diff --git a/scripts/extract_v3_tables.py b/scripts/extract_v3_tables.py
@@ -0,0 +1,14 @@
+#!/usr/bin/env python3
+from pathlib import Path
+import sys
+
+try:
+    from pdfdeal.v3_media import run_cli
+except ImportError:  # pragma: no cover - local repo execution fallback
+    sys.modules.pop("pdfdeal", None)
+    sys.path.insert(0, str(Path(__file__).resolve().parents[1] / "src"))
+    from pdfdeal.v3_media import run_cli
+
+
+if __name__ == "__main__":
+    raise SystemExit(run_cli("table"))
diff --git a/src/pdfdeal/CLI/doc2x.py b/src/pdfdeal/CLI/doc2x.py
@@ -1,6 +1,7 @@
 import argparse
 import os
 from pdfdeal import Doc2X
+from pdfdeal.Doc2X.Types import FormulaLevel, V2ParseModel
 
 
 def main():
@@ -30,6 +31,26 @@ def main():
         help="The maximum number of pages to process at same time, default is 1000, DO NOT set if you don't know",
         required=False,
     )
+    parser.add_argument(
+        "--model",
+        help='Upload model for v2 preupload API, e.g. "v3-2026". Leave empty to use server default v2.',
+        required=False,
+        choices=[model.value for model in V2ParseModel],
+    )
+    parser.add_argument(
+        "--formula_level",
+        help=(
+            'Formula degradation level for v2 export body. '
+            '0 (default, recommended)=keep original formulas; '
+            '1=degrade inline formulas (\\(...\\), $...$); '
+            '2=degrade all formulas including block formulas (\\[...\\], $$...$$). '
+            'Only effective when --model is "v3-2026".'
+        ),
+        required=False,
+        type=int,
+        choices=[level.value for level in FormulaLevel],
+        default=FormulaLevel.KEEP_MARKDOWN.value,
+    )
     parser.add_argument(
         "-o",
         "--output",
@@ -99,6 +120,8 @@ def main():
         pdf_file=filename,
         output_path=output,
         output_format=format,
+        model=args.model,
+        formula_level=args.formula_level,
     )
 
     for file in success: