-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathpdf2file.py
More file actions
125 lines (102 loc) · 4.02 KB
/
pdf2file.py
File metadata and controls
125 lines (102 loc) · 4.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import json
import time
import requests as rq
import os
# Root URL that every API endpoint path in this script is appended to.
base_url = "https://v2.doc2x.noedgeai.com"
# API key taken from the DOC2X_APIKEY environment variable (None when unset);
# the request helpers send it as a Bearer token.
secret = os.environ.get("DOC2X_APIKEY")
def preupload(timeout: float = 30.0):
    """Ask the parse API for a pre-upload target.

    Args:
        timeout: Seconds to wait on the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The ``data`` member of the JSON response (``main`` reads its
        ``url`` and ``uid`` keys).

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        RuntimeError: If the response ``code`` is not ``"success"``.
    """
    url = f"{base_url}/api/v2/parse/preupload"
    headers = {"Authorization": f"Bearer {secret}"}
    res = rq.post(url, headers=headers, timeout=timeout)
    res.raise_for_status()  # fail fast on an HTTP error status
    data = res.json()
    if data.get("code") == "success":
        return data["data"]
    raise RuntimeError(f"get preupload url failed: {data}")
def put_file(path: str, url: str, timeout: float = 300.0):
    """Upload the file at ``path`` to ``url`` with an HTTP PUT.

    The open file object is handed to ``requests`` as the request body, so
    the file's raw bytes are what gets sent.

    Args:
        path: Local path of the file to upload.
        url: Destination URL for the PUT request.
        timeout: Seconds to wait on the server before giving up. The default
            is generous because uploads can be slow.

    Raises:
        OSError: If ``path`` cannot be opened for reading.
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    with open(path, "rb") as f:
        res = rq.put(url, data=f, timeout=timeout)  # body is the binary file stream
    res.raise_for_status()  # fail fast on an HTTP error status
def get_status(uid: str, timeout: float = 30.0):
    """Fetch the current parse status for an uploaded document.

    Args:
        uid: Identifier of the upload, as returned in the pre-upload data.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The ``data`` member of the JSON response (``main`` reads its
        ``status``, ``progress``, ``detail`` and ``result`` keys).

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        RuntimeError: If the response ``code`` is not ``"success"``.
    """
    url = f"{base_url}/api/v2/parse/status"
    headers = {"Authorization": f"Bearer {secret}"}
    # Pass uid via ``params`` so requests URL-encodes it instead of it being
    # pasted raw into the query string.
    res = rq.get(url, headers=headers, params={"uid": uid}, timeout=timeout)
    res.raise_for_status()  # fail fast on an HTTP error status
    data = res.json()
    if data.get("code") == "success":
        return data["data"]
    raise RuntimeError(f"get status failed: {data}")
#! 此上部分与pdf.py相同,是用于上传文件并等待解析完成的代码
#! The code above is the same as in pdf.py; it is used to upload the file and wait for parsing to complete.
def export_file(uid: str, to_format: str, formula_mode: str, timeout: float = 30.0):
    """Ask the API to convert an already-parsed document to another format.

    Args:
        uid: Identifier of the parsed document.
        to_format: Target format, sent as the ``to`` field (``main`` uses
            ``"docx"``).
        formula_mode: Formula rendering mode, sent as ``formula_mode``
            (``main`` uses ``"normal"``).
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The ``data`` member of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        RuntimeError: If the response ``code`` is not ``"success"``.
    """
    url = f"{base_url}/api/v2/convert/parse"
    headers = {"Authorization": f"Bearer {secret}"}
    payload = {
        "uid": uid,
        "to": to_format,
        "formula_mode": formula_mode,
    }
    res = rq.post(url, headers=headers, json=payload, timeout=timeout)
    res.raise_for_status()  # fail fast on an HTTP error status
    data = res.json()
    if data.get("code") == "success":
        return data["data"]
    raise RuntimeError(f"export file failed: {data}")
def get_export_result(uid: str, timeout: float = 30.0):
    """Fetch the current state of a previously requested export.

    Args:
        uid: Identifier of the document whose export is being polled.
        timeout: Seconds to wait on the server before giving up.

    Returns:
        The ``data`` member of the JSON response (``main`` reads its
        ``status`` and ``url`` keys).

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        RuntimeError: If the response ``code`` is not ``"success"``.
    """
    url = f"{base_url}/api/v2/convert/parse/result"
    headers = {"Authorization": f"Bearer {secret}"}
    # Pass uid via ``params`` so requests URL-encodes it instead of it being
    # pasted raw into the query string.
    res = rq.get(url, headers=headers, params={"uid": uid}, timeout=timeout)
    res.raise_for_status()  # fail fast on an HTTP error status
    data = res.json()
    if data.get("code") == "success":
        return data["data"]
    raise RuntimeError(f"get export result failed: {data}")
def download_file(file_url: str, output_path: str, timeout: float = 60.0):
    """Download ``file_url`` and write the response body to ``output_path``.

    The request is made without the API authorization header. The body is
    fetched completely before the output file is opened, so a failed download
    never leaves a partial file behind.

    Args:
        file_url: URL of the file to download.
        output_path: Local path to write; an existing file is overwritten.
        timeout: Seconds to wait on the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        requests.Timeout: If the server does not answer within ``timeout``.
        OSError: If ``output_path`` cannot be written.
    """
    res = rq.get(file_url, timeout=timeout)
    res.raise_for_status()  # fail fast on an HTTP error status
    with open(output_path, "wb") as f:
        f.write(res.content)
def main(
    file,
    to_format: str = "docx",
    formula_mode: str = "normal",
    output_path: str = "output.docx",
    max_retries: int = 100,
    poll_interval: float = 3,
):
    """Upload a PDF, wait for it to be parsed, then export and download it.

    Steps:
      1. Obtain a pre-upload URL and uid, then PUT the file to that URL.
      2. Poll the parse status; on success dump the parse result to
         ``result.json`` in the current directory.
      3. Request an export, poll its status, and download the exported file
         to ``output_path``.

    Args:
        file: Path of the local file to upload.
        to_format: Export format passed to ``export_file``.
        formula_mode: Formula mode passed to ``export_file``.
        output_path: Where the exported file is saved. Its extension should
            match ``to_format``.
        max_retries: Maximum number of status polls per phase.
        poll_interval: Seconds to sleep between polls.

    Raises:
        RuntimeError: If parsing or exporting reports ``"failed"``, or if
            either phase has not finished after ``max_retries`` polls.
    """
    # Upload the file and wait for parsing to finish.
    upload_data = preupload()
    print(upload_data)
    url, uid = upload_data["url"], upload_data["uid"]
    put_file(file, url)

    for _ in range(max_retries):
        status_data = get_status(uid)
        status = status_data.get("status")
        if status == "success":
            print("Save result to result.json")
            with open("result.json", "w", encoding="utf-8") as f:
                json.dump(status_data["result"], f)
            break
        if status == "failed":
            print(status_data)
            raise RuntimeError(f"parse failed: {status_data.get('detail')}")
        if status == "processing":
            print(status_data)
            print(f"progress: {status_data.get('progress')}")
        # Sleep on every non-terminal status (not only "processing") so an
        # unexpected status cannot burn all retries in a tight request loop.
        time.sleep(poll_interval)
    else:
        # The loop ended without a break: parsing never reached "success".
        raise RuntimeError(
            f"Fails to deal with uid: {uid} after {max_retries} retries"
        )

    # Export the parsed document.
    print("Start exporting file...")
    export_file(uid, to_format, formula_mode)

    for _ in range(max_retries):
        result_data = get_export_result(uid)
        status = result_data.get("status")
        if status == "success":
            print(f"Downloading file to {output_path}")
            download_file(result_data["url"], output_path)
            return
        if status == "failed":
            print(result_data)
            raise RuntimeError("Export failed")
        if status == "processing":
            print("Export processing...")
        time.sleep(poll_interval)

    raise RuntimeError(
        f"Export timeout with uid: {uid} after {max_retries} retries"
    )
if __name__ == "__main__":
    import sys

    # Convert the PDF named on the command line; with no argument, fall back
    # to the sample file name so running the script bare still works.
    main(sys.argv[1] if len(sys.argv) > 1 else "test.pdf")