Skip to content

Commit 9474a41

Browse files
committed
Merge missed text detection boxes in format lines
When merging provider and detection lines, some boxes may be missing, but the layout check fails. This catches and merges in these boxes too.
1 parent ca5f2d2 commit 9474a41

File tree

1 file changed

+21
-2
lines changed

1 file changed

+21
-2
lines changed

marker/builders/line.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
211211

212212
# Add in the provider lines - merge ones that get broken by pdftext
213213
merged_provider_lines = self.merge_provider_lines_detected_lines(
214-
provider_lines, detection_boxes, image_size, page_size
214+
provider_lines, detection_boxes, image_size, page_size, document_page.page_id
215215
)
216216

217217
# If fixing lines, mark every line to be passed to the OCR model
@@ -391,6 +391,7 @@ def merge_provider_lines_detected_lines(
391391
text_lines: List[PolygonBox],
392392
image_size,
393393
page_size,
394+
page_id,
394395
):
395396
# When provider lines is empty or no lines detected, return provider lines
396397
if not provider_lines or not text_lines:
@@ -414,7 +415,7 @@ def merge_provider_lines_detected_lines(
414415
]
415416

416417
overlaps = matrix_intersection_area(provider_line_boxes, detected_line_boxes)
417-
418+
418419
# Find potential merges
419420
merge_lines = defaultdict(list)
420421
for i in range(len(provider_line_boxes)):
@@ -532,4 +533,22 @@ def bbox_for_merge_section(
532533
# Sort to preserve original order
533534
out_provider_lines = sorted(out_provider_lines, key=lambda x: x[0])
534535
out_provider_lines = [p for _, p in out_provider_lines]
536+
537+
# Detected lines that do not overlap with any provider lines should be output as-is
538+
LineClass: Line = get_block_class(BlockTypes.Line)
539+
for j in range(len(detected_line_boxes)):
540+
if np.max(overlaps[:, j]) == 0:
541+
detected_line_polygon = PolygonBox.from_bbox(detected_line_boxes[j])
542+
out_provider_lines.append(
543+
ProviderOutput(
544+
line=LineClass(
545+
polygon=detected_line_polygon,
546+
page_id=page_id,
547+
text_extraction_method="surya",
548+
),
549+
spans=[],
550+
chars=[],
551+
)
552+
)
553+
535554
return out_provider_lines

0 commit comments

Comments
 (0)