Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,25 @@ PDF_PATH = "your_pdf_file.pdf"
python pdf_ocr.py
```

> 如需对多个 PDF 文件进行**批量处理**,请使用脚本 `pdf_ocr_batch.py`。设置方法与单文件处理类似,但需要将原本填写 PDF 文件路径的位置替换为目标文件夹(和`pdf_ocr_batch.py`放在同一目录下)的名称。脚本会自动遍历该文件夹下的所有 PDF 文件并进行处理,其处理逻辑与单文件处理完全一致。

```python
# 在 pdf_ocr_batch.py 中设置
API_KEY = "your_mistral_api_key"
DIRECTORY = "your_pdf_directory"  # 指定包含 PDF 文件的文件夹名称(该文件夹必须已存在,且与 pdf_ocr_batch.py 位于同一目录)
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Consider clarifying that the DIRECTORY variable should contain the name of an existing directory, not a file path. It might also be helpful to mention that the directory should contain PDF files.

Suggested change
DIRECTORY = "your_pdf_file" # 指定包含PDF文件的文件夹路径
DIRECTORY = "your_pdf_directory" # 指定包含PDF文件的文件夹名称


# 然后运行脚本
python pdf_ocr_batch.py
```

## 输出结果

脚本将在工作目录下创建一个名为 `ocr_results_[PDF文件名]` 的文件夹,其中包含:
脚本将在工作目录下创建一个名为 `ocr_results_[PDF文件名]` 的文件夹(多文件批处理则是多个类似的文件夹),其中包含:

- `complete.md`: 包含所有页面内容的 Markdown 文件
- `images/`: 保存 PDF 中提取出的所有图像的文件夹


## 注意事项

- 请确保 PDF 文件路径正确
- API 密钥需要具有 OCR 功能的访问权限


97 changes: 97 additions & 0 deletions pdf_ocr_batch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from mistralai import Mistral
from pathlib import Path
import os
import base64
from mistralai import DocumentURLChunk
from mistralai.models import OCRResponse

def replace_images_in_markdown(markdown_str: str, images_dict: dict) -> str:
    """Point Markdown image links at their saved files.

    Every link of the exact form ``![name](name)`` whose ``name`` is a key of
    ``images_dict`` is rewritten to ``![name](path)``, where ``path`` is the
    value stored under that key. Links that do not match a key are left as is.

    Args:
        markdown_str: Markdown text containing the image links.
        images_dict: Mapping of image name to the path that should replace it.

    Returns:
        The Markdown text with all matching image links rewritten.
    """
    for img_name, img_path in images_dict.items():
        markdown_str = markdown_str.replace(
            f"![{img_name}]({img_name})",
            f"![{img_name}]({img_path})",
        )
    return markdown_str

def save_ocr_results(ocr_response: OCRResponse, output_dir: str) -> None:
    """Write an OCR response to disk as one Markdown file plus its images.

    Creates ``output_dir`` and ``output_dir/images`` when they do not exist,
    decodes the base64 payload of every image on every page into
    ``images/<image id>.png``, rewrites the image links of each page's
    Markdown to point at those files, and writes all pages, separated by a
    blank line, to ``output_dir/complete.md`` (UTF-8).

    Args:
        ocr_response: OCR result; each item of ``pages`` is read for its
            ``markdown`` text and its ``images`` (each with ``id`` and
            ``image_base64``).
        output_dir: Directory that receives ``complete.md`` and ``images/``.
    """
    # Create the output directory tree.
    os.makedirs(output_dir, exist_ok=True)
    images_dir = os.path.join(output_dir, "images")
    os.makedirs(images_dir, exist_ok=True)

    all_markdowns = []
    for page in ocr_response.pages:
        # Save the page's images and remember where each one went.
        page_images = {}
        for img in page.images:
            if not img.image_base64:
                # Nothing to decode; the original link stays in the Markdown.
                continue
            # Accept both "<prefix>,<base64>" and a bare base64 string: keep
            # what follows the first comma, or the whole string if there is
            # no comma (the old split(',')[1] raised IndexError in that case).
            _, separator, payload = img.image_base64.partition(",")
            img_data = base64.b64decode(payload if separator else img.image_base64)
            # NOTE(review): ".png" is appended whatever the real image format
            # is - confirm this is intended before changing the file naming.
            img_path = os.path.join(images_dir, f"{img.id}.png")
            with open(img_path, "wb") as f:
                f.write(img_data)
            # Path relative to output_dir, so links resolve from complete.md.
            page_images[img.id] = f"images/{img.id}.png"

        # Rewrite the page's image links to the saved files.
        page_markdown = replace_images_in_markdown(page.markdown, page_images)
        all_markdowns.append(page_markdown)

    # Write the combined Markdown document.
    with open(os.path.join(output_dir, "complete.md"), "w", encoding="utf-8") as f:
        f.write("\n\n".join(all_markdowns))

def process_pdf(pdf_path: str, api_key: str) -> str:
    """Run OCR on a single PDF and save the result next to the script.

    The PDF is uploaded with purpose ``"ocr"``, a signed URL is requested for
    the uploaded file, and that URL is sent to the ``mistral-ocr-latest``
    model with base64 images included. The response is written by
    ``save_ocr_results`` into ``ocr_results_<PDF file stem>`` (a path relative
    to the current working directory).

    Args:
        pdf_path: Path of the PDF file to process.
        api_key: API key used to create the Mistral client.

    Returns:
        The name of the output directory that was written.

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing file.
    """
    # Validate the input first so a bad path fails before any client setup.
    pdf_file = Path(pdf_path)
    if not pdf_file.is_file():
        raise FileNotFoundError(f"PDF文件不存在: {pdf_path}")

    # Initialise the API client.
    client = Mistral(api_key=api_key)

    # Output directory is named after the PDF (file name without extension).
    output_dir = f"ocr_results_{pdf_file.stem}"

    # Upload the PDF so the OCR endpoint can fetch it through a signed URL.
    uploaded_file = client.files.upload(
        file={
            "file_name": pdf_file.stem,
            "content": pdf_file.read_bytes(),
        },
        purpose="ocr",
    )

    signed_url = client.files.get_signed_url(file_id=uploaded_file.id, expiry=1)
    pdf_response = client.ocr.process(
        document=DocumentURLChunk(document_url=signed_url.url),
        model="mistral-ocr-latest",
        include_image_base64=True,
    )

    # Persist the Markdown and images.
    save_ocr_results(pdf_response, output_dir)
    print(f"OCR处理完成。结果保存在: {output_dir}")
    return output_dir

def process_pdfs(pdf_paths: list, api_key: str) -> None:
    """Run OCR on several PDFs, one after another.

    Each path is handed to ``process_pdf``. A failure on one file is reported
    on standard output and does not stop the remaining files from being
    processed.

    Args:
        pdf_paths: Paths of the PDF files to process.
        api_key: API key passed through to ``process_pdf``.
    """
    for pdf_path in pdf_paths:
        try:
            output_dir = process_pdf(pdf_path, api_key)
        except Exception as e:
            # Deliberate best-effort boundary: one bad file must not abort the
            # batch. The exception type is printed as well because some
            # exceptions carry an empty or uninformative message.
            print(f"处理文件 {pdf_path} 时出错: {type(e).__name__} - {e}")
        else:
            print(f"文件 {pdf_path} 处理完成,结果保存在: {output_dir}")
Comment on lines +76 to +77
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The error message could be more informative. Consider including the specific exception message and the file path that caused the error.

print(f"处理文件 {pdf_path} 时出错: {type(e).__name__} - {e}")


def get_pdf_files_in_directory(directory: str) -> list:
    """Return the paths of all PDF files located directly in ``directory``.

    A file counts as a PDF when its name ends with ``.pdf`` in any letter
    case, so ``scan.PDF`` is included. Sub-directories are not searched, and
    a directory whose name happens to end in ``.pdf`` is not returned. The
    result is sorted so that the processing order is the same on every run.

    Args:
        directory: Directory to look in.

    Returns:
        Sorted list of ``directory``-joined paths; empty if there are no PDFs.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``directory`` cannot be
            listed (for example ``FileNotFoundError`` if it does not exist).
    """
    candidates = (os.path.join(directory, name) for name in os.listdir(directory))
    return sorted(
        path
        for path in candidates
        if path.lower().endswith(".pdf") and os.path.isfile(path)
    )

if __name__ == "__main__":
    # Usage example: fill in these two values before running the script.
    API_KEY = "your_mistral_api_key"
    DIRECTORY = "your_pdf_file"  # name of the folder that contains the PDF files

    # Check that the folder exists BEFORE listing it: listing a missing
    # directory raises, which would show the user a traceback instead of a
    # clear message. PDF_PATHS is None when the folder does not exist.
    PDF_PATHS = (
        get_pdf_files_in_directory(DIRECTORY) if os.path.isdir(DIRECTORY) else None
    )

    if PDF_PATHS is None:
        print(f"错误: 目录 {DIRECTORY} 不存在。")
    elif not PDF_PATHS:
        print(f"目录 {DIRECTORY} 中没有找到PDF文件。")
    else:
        # Process every PDF found in the folder.
        process_pdfs(PDF_PATHS, API_KEY)