-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtest.py
More file actions
94 lines (73 loc) · 3.25 KB
/
test.py
File metadata and controls
94 lines (73 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import fitz
import os
def is_caption(text: str) -> bool:
    """Return True when *text* looks like the start of a figure caption.

    The check is case-insensitive and ignores surrounding whitespace. A
    block is treated as a caption when it begins with ``"figure"`` or
    ``"fig."`` (for example ``"Figure 3: ..."`` or ``"Fig. 2 ..."``).
    """
    normalized = text.lower().strip()
    # str.startswith accepts a tuple, so both prefixes are tested in one call.
    return normalized.startswith(("figure", "fig."))
def _block_text(block):
    """Join the text of every span in a text block, separated by single spaces."""
    return " ".join(
        span["text"] for line in block["lines"] for span in line["spans"]
    )


def _merge_nearby_figures(cap_bbox, figure_bboxes, vertical_margin):
    """Grow a caption bbox to cover figure boxes that sit above or below it.

    A figure box is merged when it overlaps the caption horizontally and lies
    within ``vertical_margin`` units of the caption vertically. Every test is
    made against the original caption box, not the growing merged box, so the
    result does not depend on the order of ``figure_bboxes``.
    """
    cap_x0, cap_y0, cap_x1, cap_y1 = cap_bbox
    merged = [cap_x0, cap_y0, cap_x1, cap_y1]
    for fig_x0, fig_y0, fig_x1, fig_y1 in figure_bboxes:
        overlaps_horizontally = fig_x0 < cap_x1 and fig_x1 > cap_x0
        near_vertically = (
            fig_y0 < cap_y1 + vertical_margin
            and fig_y1 > cap_y0 - vertical_margin
        )
        if overlaps_horizontally and near_vertically:
            merged[0] = min(merged[0], fig_x0)
            merged[1] = min(merged[1], fig_y0)
            merged[2] = max(merged[2], fig_x1)
            merged[3] = max(merged[3], fig_y1)
    return merged


def extract_figures(pdf_path, output_folder, dpi=300, vertical_margin=200):
    """Crop each captioned figure of a PDF to a PNG and save its caption text.

    For every page, text blocks accepted by ``is_caption`` are treated as
    captions. Each caption's bounding box is expanded to include nearby
    non-text blocks, the resulting region is rendered to
    ``page_<P>_figure_<K>.png`` and the caption is written to
    ``page_<P>_figure_<K>_caption.txt`` inside ``output_folder``.

    Args:
        pdf_path: Path of the PDF to process.
        output_folder: Directory for the output files; created if missing.
        dpi: Rendering resolution of the cropped images.
        vertical_margin: How far above or below a caption (in page
            coordinate units) a non-text block may lie and still be merged
            into that caption's figure region.

    Returns:
        A dict mapping ``"page_<P>"`` (1-based page number) to a list of
        dicts with the keys ``"image"`` (PNG path), ``"caption"`` (caption
        text) and ``"bbox"`` (merged ``[x0, y0, x1, y1]`` list).

    Note:
        A caption with no qualifying non-text block nearby is still saved;
        its image is then a crop of the caption's own bounding box.
    """
    os.makedirs(output_folder, exist_ok=True)

    zoom = dpi / 72  # scale factor relative to the 72-units-per-inch base
    matrix = fitz.Matrix(zoom, zoom)
    results = {}

    # The context manager closes the document even if a page fails to render.
    with fitz.open(pdf_path) as doc:
        for page_index, page in enumerate(doc, start=1):
            print(f"\nProcessing page {page_index}")

            figure_blocks = []
            caption_blocks = []

            # Separate caption text blocks from non-text blocks.
            for block in page.get_text("dict")["blocks"]:
                if block["type"] == 0:  # text block
                    text = _block_text(block)
                    if is_caption(text):
                        caption_blocks.append((block["bbox"], text.strip()))
                else:
                    # NOTE(review): per the PyMuPDF docs the "dict" output
                    # appears to contain only text and image blocks, so
                    # vector drawings would not show up here -- confirm;
                    # collecting them would need a separate call.
                    figure_blocks.append(block["bbox"])

            # Build one merged region per caption, then crop and save it.
            page_results = []
            for cap_bbox, caption_text in caption_blocks:
                merged_bbox = _merge_nearby_figures(
                    cap_bbox, figure_blocks, vertical_margin
                )
                figure_number = len(page_results) + 1
                base_name = f"page_{page_index}_figure_{figure_number}"

                # Render only the merged region at the requested resolution.
                pix_crop = page.get_pixmap(
                    matrix=matrix, clip=fitz.Rect(merged_bbox)
                )
                fig_filename = f"{base_name}.png"
                fig_path = os.path.join(output_folder, fig_filename)
                pix_crop.save(fig_path)

                # Save the caption next to the image.
                caption_path = os.path.join(
                    output_folder, f"{base_name}_caption.txt"
                )
                with open(caption_path, "w", encoding="utf8") as f:
                    f.write(caption_text)

                print(f"Saved figure: {fig_filename}")
                print(f"Caption: {caption_text}")

                page_results.append({
                    "image": fig_path,
                    "caption": caption_text,
                    "bbox": merged_bbox,
                })

            results[f"page_{page_index}"] = page_results

    return results
if __name__ == "__main__":
    import sys

    # Usage: python test.py [PDF_PATH [OUTPUT_FOLDER]]
    # Without arguments the script falls back to the defaults below, so
    # existing invocations behave exactly as before.
    default_pdf_path = "C:\\Users\\devri\\OneDrive\\Desktop\\Agentiops\\data\\2510.02125v1.pdf"  # put your PDF file here
    default_output_folder = "figures_output"

    cli_args = sys.argv[1:]
    pdf_path = cli_args[0] if len(cli_args) > 0 else default_pdf_path
    output_folder = cli_args[1] if len(cli_args) > 1 else default_output_folder

    extract_figures(pdf_path, output_folder)