@@ -211,7 +211,7 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
211
211
212
212
# Add in the provider lines - merge ones that get broken by pdftext
213
213
merged_provider_lines = self .merge_provider_lines_detected_lines (
214
- provider_lines , detection_boxes , image_size , page_size
214
+ provider_lines , detection_boxes , image_size , page_size , document_page . page_id
215
215
)
216
216
217
217
# If fixing lines, mark every line to be passed to the OCR model
@@ -391,6 +391,7 @@ def merge_provider_lines_detected_lines(
391
391
text_lines : List [PolygonBox ],
392
392
image_size ,
393
393
page_size ,
394
+ page_id ,
394
395
):
395
396
# When provider lines is empty or no lines detected, return provider lines
396
397
if not provider_lines or not text_lines :
@@ -414,7 +415,7 @@ def merge_provider_lines_detected_lines(
414
415
]
415
416
416
417
overlaps = matrix_intersection_area (provider_line_boxes , detected_line_boxes )
417
-
418
+
418
419
# Find potential merges
419
420
merge_lines = defaultdict (list )
420
421
for i in range (len (provider_line_boxes )):
@@ -532,4 +533,22 @@ def bbox_for_merge_section(
532
533
# Sort to preserve original order
533
534
out_provider_lines = sorted (out_provider_lines , key = lambda x : x [0 ])
534
535
out_provider_lines = [p for _ , p in out_provider_lines ]
536
+
537
+ # Detected lines that do not overlap with any provider lines should be output as-is
538
+ LineClass : Line = get_block_class (BlockTypes .Line )
539
+ for j in range (len (detected_line_boxes )):
540
+ if np .max (overlaps [:, j ]) == 0 :
541
+ detected_line_polygon = PolygonBox .from_bbox (detected_line_boxes [j ])
542
+ out_provider_lines .append (
543
+ ProviderOutput (
544
+ line = LineClass (
545
+ polygon = detected_line_polygon ,
546
+ page_id = page_id ,
547
+ text_extraction_method = "surya" ,
548
+ ),
549
+ spans = [],
550
+ chars = [],
551
+ )
552
+ )
553
+
535
554
return out_provider_lines
0 commit comments