From 299990b5c0fd6e2f3b0db6674d1beb75ffd8dcae Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Thu, 5 Mar 2026 14:32:10 +0000 Subject: [PATCH 1/4] fix(structure): improve layout sorting, title levels, and inline formula handling - Port PaddleX's xycut_enhanced algorithm faithfully with proper constants, cross-layout detection, overlapping box shrinking, and weighted distance insertion for reading order sorting - Add multi-signal title level inference using semantic numbering, relative indentation, and font-size k-means clustering - Inject inline formulas into text regions before stitching to prevent duplicate rendering, wrapping LaTeX with $...$ delimiters - Move markdown generation from examples into core StructureResult::to_markdown() - Fix cell detection to also run when use_cells_trans_to_html is enabled - Add max_side_len=4000 default and lower box_threshold for table pipelines - Add line_height_iou_threshold for improved line grouping in stitching --- examples/utils/markdown.rs | 515 +------- oar-ocr-core/src/domain/structure.rs | 829 ++++++++++--- oar-ocr-core/src/domain/text_region.rs | 27 +- oar-ocr-core/src/processors/geometry.rs | 37 +- oar-ocr-core/src/processors/layout_sorting.rs | 832 +++++++++---- oar-ocr-core/src/processors/layout_utils.rs | 49 +- oar-ocr-core/src/processors/sorting.rs | 5 +- oar-ocr-vl/src/doc_parser.rs | 10 +- src/oarocr/ocr.rs | 17 + src/oarocr/stitching.rs | 1039 +++++++++++++---- src/oarocr/structure.rs | 51 +- src/oarocr/table_analyzer.rs | 173 ++- 12 files changed, 2461 insertions(+), 1123 deletions(-) diff --git a/examples/utils/markdown.rs b/examples/utils/markdown.rs index 3a72fa8..e68f810 100644 --- a/examples/utils/markdown.rs +++ b/examples/utils/markdown.rs @@ -5,34 +5,7 @@ //! while these examples utilities handle the file system operations. 
use oar_ocr::domain::structure::{LayoutElementType, StructureResult}; -use oar_ocr::processors::BoundingBox; -use regex::Regex; use std::path::Path; -use std::sync::LazyLock; - -/// Title numbering pattern for detecting section numbers like 1, 1.2, 1.2.3, (1), 一、etc. -static TITLE_NUMBERING_REGEX: LazyLock = LazyLock::new(|| { - Regex::new( - r"(?x) - ^\s* - ( - [1-9][0-9]*(?:\.[1-9][0-9]*)*[\.、]? - | - [((][1-9][0-9]*(?:\.[1-9][0-9]*)*[))] - | - [一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾][、.]? - | - [((][一二三四五六七八九十百千万亿零壹贰叁肆伍陆柒捌玖拾]+[))] - | - (?:I|II|III|IV|V|VI|VII|VIII|IX|X)(?:\.|\b) - ) - (\s+) - (.*) - $ - ", - ) - .expect("Invalid TITLE_NUMBERING_REGEX pattern") -}); /// Exports markdown with extracted images saved to disk. /// @@ -59,280 +32,55 @@ pub fn export_markdown_with_images( std::fs::create_dir_all(&imgs_dir)?; } - // Collect table bboxes for overlap filtering - let table_bboxes: Vec<&BoundingBox> = result - .layout_elements - .iter() - .filter(|e| e.element_type == LayoutElementType::Table) - .map(|e| &e.bbox) - .collect(); - - let mut md = String::new(); - let mut img_counter = 0usize; - let elements = &result.layout_elements; - - for (idx, element) in elements.iter().enumerate() { - // PP-StructureV3 markdown ignores auxiliary labels. 
+ // Extract and save images for Image/Chart elements + for element in &result.layout_elements { if matches!( element.element_type, - LayoutElementType::Number - | LayoutElementType::Footnote - | LayoutElementType::Header - | LayoutElementType::HeaderImage - | LayoutElementType::Footer - | LayoutElementType::FooterImage - | LayoutElementType::AsideText + LayoutElementType::Image | LayoutElementType::Chart ) { - continue; - } - - // Filter out low-confidence text elements that overlap with tables - if element.element_type == LayoutElementType::Text { - let overlaps_table = table_bboxes - .iter() - .any(|table_bbox| element.bbox.ioa(table_bbox) > 0.3); - if overlaps_table && element.confidence < 0.7 { - continue; - } - } - - match element.element_type { - // Document title - LayoutElementType::DocTitle => { - md.push_str("\n# "); - if let Some(text) = &element.text { - let cleaned = clean_ocr_text(text); - md.push_str(&cleaned); - } - md.push_str("\n\n"); - } - // Paragraph/section title - LayoutElementType::ParagraphTitle => { - if let Some(text) = &element.text { - let cleaned = clean_ocr_text(text); - let (level, formatted_title) = format_title_with_level(&cleaned); - md.push('\n'); - for _ in 0..level { - md.push('#'); - } - md.push(' '); - md.push_str(&formatted_title); - md.push_str("\n\n"); - } else { - md.push_str("\n## \n\n"); - } - } - // Table - LayoutElementType::Table => { - if let Some(table) = result - .tables - .iter() - .find(|t| t.bbox.iou(&element.bbox) > 0.5) - { - if let Some(html) = &table.html_structure { - let simplified = simplify_table_html(html); - let table_with_border = - simplified.replacen("", "
", 1); - md.push_str("\n
"); - md.push_str(&table_with_border); - md.push_str("
\n\n"); - } else { - md.push_str("\n[Table]\n\n"); - } - } else { - md.push_str("\n[Table]\n\n"); - } - } - // Formula - detect inline vs display formula based on context - LayoutElementType::Formula | LayoutElementType::FormulaNumber => { - let is_inline = { - let has_prev_text = (0..idx).rev().any(|i| { - let prev = &elements[i]; - !prev.element_type.is_formula() - && (prev.element_type == LayoutElementType::Text - || prev.element_type == LayoutElementType::ReferenceContent) - && is_same_line(&element.bbox, &prev.bbox) - }); - - let has_next_text = ((idx + 1)..elements.len()).any(|i| { - let next = &elements[i]; - !next.element_type.is_formula() - && (next.element_type == LayoutElementType::Text - || next.element_type == LayoutElementType::ReferenceContent) - && is_same_line(&element.bbox, &next.bbox) - }); - - has_prev_text || has_next_text - }; - - if is_inline { - md.push('$'); - if let Some(latex) = &element.text { - md.push_str(latex); - } - md.push_str("$ "); - } else { - md.push_str("\n$$"); - if let Some(latex) = &element.text { - md.push_str(latex); - } - md.push_str("$$\n\n"); - } - } - // Image/Chart - extract and save image region - LayoutElementType::Image | LayoutElementType::Chart => { - let type_name = if element.element_type == LayoutElementType::Chart { - "chart" - } else { - "image" - }; - - // Generate image filename - let img_name = format!( - "img_in_{}_box_{:.0}_{:.0}_{:.0}_{:.0}.jpg", - type_name, - element.bbox.x_min(), - element.bbox.y_min(), - element.bbox.x_max(), - element.bbox.y_max() - ); - let img_path = imgs_dir.join(&img_name); - let relative_path = format!("imgs/{}", img_name); - - // Extract and save image region if we have the source image - if let Some(ref img) = result.rectified_img { - let x = element.bbox.x_min().max(0.0) as u32; - let y = element.bbox.y_min().max(0.0) as u32; - let width = ((element.bbox.x_max() - element.bbox.x_min()) as u32) - .min(img.width().saturating_sub(x)); - let height = ((element.bbox.y_max() - 
element.bbox.y_min()) as u32) - .min(img.height().saturating_sub(y)); - - if width > 0 && height > 0 { - let cropped = - image::imageops::crop_imm(img.as_ref(), x, y, width, height).to_image(); - // Save as JPEG - if let Err(e) = cropped.save(&img_path) { - tracing::warn!("Failed to save image {}: {}", img_path.display(), e); - } - } - } - - // Calculate width percentage - let width_pct = - ((element.bbox.x_max() - element.bbox.x_min()) / 12.0).clamp(20.0, 100.0); - - md.push_str("\n
\"Image\"
\n\n"); - - img_counter += 1; - } - // Seal - LayoutElementType::Seal => { - md.push_str("\n![Seal]"); - if let Some(text) = &element.text { - md.push_str("\n> "); - md.push_str(text); - } - md.push_str("\n\n"); - } - // Captions - _ if element.element_type.is_caption() => { - if let Some(text) = &element.text { - md.push_str("\n
"); - md.push_str(text); - md.push_str("
\n\n"); - } - } - // Abstract - LayoutElementType::Abstract => { - if let Some(text) = &element.text { - let lower = text.to_lowercase(); - if lower.contains("abstract") || lower.contains("摘要") { - md.push_str("\n## **Abstract**\n\n"); - } - let formatted = format_text_block(text); - md.push_str(&formatted); - md.push_str("\n\n"); - } - } - // Reference - LayoutElementType::Reference => { - if let Some(text) = &element.text { - let formatted = format_reference_block(text); - md.push('\n'); - md.push_str(&formatted); - md.push_str("\n\n"); - } - } - // Content - LayoutElementType::Content => { - if let Some(text) = &element.text { - let formatted = format_content_block(text); - md.push('\n'); - md.push_str(&formatted); - md.push_str("\n\n"); - } - } - // Footnote - LayoutElementType::Footnote => { - if let Some(text) = &element.text { - let formatted = format_vision_footnote_block(text); - md.push('\n'); - md.push_str(&formatted); - md.push_str("\n\n"); - } - } - // List - LayoutElementType::List => { - if let Some(text) = &element.text { - let cleaned = format_text_block(text); - for line in cleaned.lines() { - let line = line.trim(); - if !line.is_empty() { - md.push_str("- "); - md.push_str(line); - md.push('\n'); - } + let type_name = if element.element_type == LayoutElementType::Chart { + "chart" + } else { + "image" + }; + + // Generate image filename matching StructureResult::to_markdown() placeholder + let img_name = format!( + "img_in_{}_box_{:.0}_{:.0}_{:.0}_{:.0}.jpg", + type_name, + element.bbox.x_min(), + element.bbox.y_min(), + element.bbox.x_max(), + element.bbox.y_max() + ); + let img_path = imgs_dir.join(&img_name); + + // Extract and save image region if we have the source image + if let Some(ref img) = result.rectified_img { + let x = element.bbox.x_min().max(0.0) as u32; + let y = element.bbox.y_min().max(0.0) as u32; + let width = ((element.bbox.x_max() - element.bbox.x_min()) as u32) + .min(img.width().saturating_sub(x)); + let height = 
((element.bbox.y_max() - element.bbox.y_min()) as u32) + .min(img.height().saturating_sub(y)); + + if width > 0 && height > 0 { + let cropped = + image::imageops::crop_imm(img.as_ref(), x, y, width, height).to_image(); + // Save as JPEG to match extension in markdown + if let Err(e) = cropped.save(&img_path) { + tracing::warn!("Failed to save image {}: {}", img_path.display(), e); } - md.push('\n'); - } - } - // Header/Footer - skip - _ if element.element_type.is_header() || element.element_type.is_footer() => { - continue; - } - // Default text - _ => { - if let Some(text) = &element.text { - let formatted = format_text_block(text); - md.push_str(&formatted); - md.push_str("\n\n"); } } } } - tracing::debug!("Extracted {} images to {:?}", img_counter, imgs_dir); - Ok(md.trim().to_string()) + // Use core library markdown generation (already implements PaddleX rules) + Ok(result.to_markdown()) } -/// Exports concatenated markdown from multiple pages with images. -/// -/// This follows the same concatenation logic as `concatenate_markdown_pages` -/// but also handles image extraction for all pages. -/// -/// # Arguments -/// -/// * `results` - Slice of structure results from multiple pages (in order) -/// * `output_dir` - Directory to save extracted images -/// -/// # Returns -/// -/// A single markdown string with all pages properly concatenated and images extracted +/// Exports concatenated markdown from multiple pages with images and post-processing. 
pub fn export_concatenated_markdown_with_images( results: &[StructureResult], output_dir: impl AsRef, @@ -343,187 +91,18 @@ pub fn export_concatenated_markdown_with_images( return Ok(String::new()); } - if results.len() == 1 { - return export_markdown_with_images(&results[0], output_dir); - } - - let mut markdown = String::new(); - let mut prev_page_end_flag = true; - - for result in results.iter() { - let flags = result - .page_continuation_flags - .as_ref() - .cloned() - .unwrap_or_else(|| result.calculate_continuation_flags()); - - let page_markdown = export_markdown_with_images(result, output_dir)?; - - if page_markdown.trim().is_empty() { - prev_page_end_flag = flags.paragraph_end; - continue; - } - - let page_first_continues = !flags.paragraph_start; - - if page_first_continues && !prev_page_end_flag { - let last_char = markdown.chars().last(); - let first_char = page_markdown.chars().next(); - - let last_is_chinese = last_char.is_some_and(is_chinese_char); - let first_is_chinese = first_char.is_some_and(is_chinese_char); - - if !last_is_chinese && !first_is_chinese { - markdown.push(' '); - markdown.push_str(page_markdown.trim_start()); - } else { - markdown.push_str(page_markdown.trim_start()); - } - } else { - if !markdown.is_empty() { - markdown.push_str("\n\n"); - } - markdown.push_str(&page_markdown); - } - - prev_page_end_flag = flags.paragraph_end; - } - - Ok(markdown.trim().to_string()) -} - -/// Cleans OCR text content by removing common artifacts. -fn clean_ocr_text(text: &str) -> String { - text.replace("-\n", "").replace('\n', " ") -} - -/// Formats text blocks following PaddleX's text handling. -fn format_text_block(text: &str) -> String { - let dehyphenated = text.replace("-\n", ""); - let step1 = dehyphenated.replace("\n\n", "\n"); - step1.replace('\n', "\n\n") -} - -/// Formats content blocks (table of contents). 
-fn format_content_block(text: &str) -> String { - let step1 = text.replace("-\n", " \n"); - step1.replace('\n', " \n") -} - -/// Formats reference blocks. -fn format_reference_block(text: &str) -> String { - let dehyphenated = text.replace("-\n", ""); - let lines: Vec<&str> = dehyphenated.lines().collect(); - - let mut result = String::new(); - let mut added_heading = false; - - for (i, line) in lines.iter().enumerate() { - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - - if !added_heading && (trimmed.contains("References") || trimmed.contains("参考文献")) { - result.push_str("## **References**\n\n"); - added_heading = true; - continue; - } - - if i > 0 || result.is_empty() { - if !result.is_empty() { - result.push('\n'); - } - result.push_str(trimmed); - } - } - - if result.is_empty() { - dehyphenated - } else { - result + // First, save all images from all pages + for result in results { + export_markdown_with_images(result, output_dir)?; } -} -/// Formats vision footnote blocks. -fn format_vision_footnote_block(text: &str) -> String { - let dehyphenated = text.replace("-\n", ""); - let step1 = dehyphenated.replace("\n\n", "\n"); - step1.replace('\n', "\n\n") -} - -/// Simplifies table HTML by removing wrapper tags. -fn simplify_table_html(html: &str) -> String { - html.replace("", "") - .replace("", "") - .replace("", "") - .replace("", "") -} + // Use core library concatenation logic (handles paragraph continuity and CJK spacing) + let raw_markdown = oar_ocr::domain::structure::concatenate_markdown_pages(results); -/// Checks if two bounding boxes are on the same line. 
-fn is_same_line(bbox1: &BoundingBox, bbox2: &BoundingBox) -> bool { - let y1_min = bbox1.y_min(); - let y1_max = bbox1.y_max(); - let y2_min = bbox2.y_min(); - let y2_max = bbox2.y_max(); + // Apply advanced PaddleX post-processing (dehyphenation, word merging fixes, deduplication) + let processed_markdown = oar_ocr::domain::structure::postprocess_markdown(&raw_markdown); - let overlap_start = y1_min.max(y2_min); - let overlap_end = y1_max.min(y2_max); - let overlap = (overlap_end - overlap_start).max(0.0); - - let height1 = y1_max - y1_min; - let height2 = y2_max - y2_min; - let min_height = height1.min(height2); - - min_height > 0.0 && overlap / min_height > 0.5 -} - -/// Checks if a character is a Chinese character. -fn is_chinese_char(c: char) -> bool { - matches!(c, - '\u{4E00}'..='\u{9FFF}' | // CJK Unified Ideographs - '\u{3400}'..='\u{4DBF}' | // CJK Unified Ideographs Extension A - '\u{20000}'..='\u{2A6DF}' | // CJK Unified Ideographs Extension B - '\u{2A700}'..='\u{2B73F}' | // CJK Unified Ideographs Extension C - '\u{2B740}'..='\u{2B81F}' | // CJK Unified Ideographs Extension D - '\u{2B820}'..='\u{2CEAF}' | // CJK Unified Ideographs Extension E - '\u{2CEB0}'..='\u{2EBEF}' // CJK Unified Ideographs Extension F - ) + Ok(processed_markdown) } -/// Title numbering pattern for detecting section numbers. 
-fn is_numbered_title(title: &str) -> (bool, usize, String) { - let cleaned = title.replace("-\n", "").replace('\n', " "); - - if let Some(captures) = TITLE_NUMBERING_REGEX.captures(&cleaned) { - let numbering = captures.get(1).map(|m| m.as_str().trim()).unwrap_or(""); - let title_content = captures.get(3).map(|m| m.as_str()).unwrap_or(""); - - let dot_count = numbering.matches('.').count(); - let level = dot_count + 2; - - let formatted = if title_content.is_empty() { - numbering.trim_end_matches('.').to_string() - } else { - format!( - "{} {}", - numbering.trim_end_matches('.'), - title_content.trim_start() - ) - }; - - (true, level.clamp(2, 6), formatted) - } else { - (false, 2, cleaned) - } -} - -/// Formats paragraph title with automatic level detection. -fn format_title_with_level(title: &str) -> (usize, String) { - let (is_numbered, level, formatted) = is_numbered_title(title); - if is_numbered { - (level, formatted) - } else { - (2, title.replace("-\n", "").replace('\n', " ")) - } -} +// Remove redundant local helpers that are now handled by core library diff --git a/oar-ocr-core/src/domain/structure.rs b/oar-ocr-core/src/domain/structure.rs index 7358628..4ef042b 100644 --- a/oar-ocr-core/src/domain/structure.rs +++ b/oar-ocr-core/src/domain/structure.rs @@ -9,6 +9,7 @@ use image::RgbImage; use once_cell::sync::Lazy; use regex::Regex; use serde::{Deserialize, Serialize}; +use std::collections::HashMap; use std::path::Path; use std::sync::Arc; @@ -58,20 +59,25 @@ static TITLE_NUMBERING_REGEX: Lazy = Lazy::new(|| { /// - "1 Introduction" -> (2, "1 Introduction") -> `## 1 Introduction` /// - "2.1 Method" -> (3, "2.1 Method") -> `### 2.1 Method` /// - "2.1.1 Details" -> (4, "2.1.1 Details") -> `#### 2.1.1 Details` -fn format_title_with_level(title: &str) -> (usize, String) { - // Clean up line breaks - let cleaned = title.replace("-\n", "").replace('\n', " "); +fn semantic_title_level_and_format(cleaned: &str) -> Option<(usize, String)> { + let trimmed = 
cleaned.trim(); + + // Common unnumbered top-level section headers. + let keyword = trimmed.trim_end_matches(':').to_ascii_uppercase(); + if matches!( + keyword.as_str(), + "ABSTRACT" | "INTRODUCTION" | "REFERENCES" | "REFERENCE" + ) { + return Some((1, trimmed.to_string())); + } - if let Some(captures) = TITLE_NUMBERING_REGEX.captures(&cleaned) { + if let Some(captures) = TITLE_NUMBERING_REGEX.captures(cleaned) { let numbering = captures.get(1).map(|m| m.as_str().trim()).unwrap_or(""); let title_content = captures.get(3).map(|m| m.as_str()).unwrap_or(""); - // Determine level from dots in numbering (PaddleX: dots + 1, then +1 for base ##) - // 1 -> 2 (##), 1.2 -> 3 (###), 1.2.3 -> 4 (####) let dot_count = numbering.matches('.').count(); - let level = dot_count + 2; // +1 for PaddleX logic, +1 for base ## level + let level = (dot_count + 2).clamp(2, 6); - // Reconstruct title: numbering + space + content let formatted = if title_content.is_empty() { numbering.trim_end_matches('.').to_string() } else { @@ -81,15 +87,213 @@ fn format_title_with_level(title: &str) -> (usize, String) { title_content.trim_start() ) }; + return Some((level, formatted)); + } + + None +} + +fn semantic_title_level(text: &str) -> Option { + let cleaned = text.replace("-\n", "").replace('\n', " "); + semantic_title_level_and_format(&cleaned).map(|(level, _)| level) +} + +fn format_title_with_level(title: &str, clustered_level: Option) -> (usize, String) { + // Clean up line breaks + let cleaned = title.replace("-\n", "").replace('\n', " "); + if let Some((level, formatted)) = semantic_title_level_and_format(&cleaned) { + return (level, formatted); + } + + // No semantic signal: use voting hint from relative/font-size signals. 
+ let level = clustered_level.unwrap_or(2).clamp(2, 6); + (level, cleaned) +} + +/// Estimate per-title heading levels using three-signal voting: +/// 1) semantic numbering/keyword level +/// 2) relative indentation order +/// 3) font-size k-means (k<=4) +/// +fn infer_paragraph_title_levels(elements: &[LayoutElement]) -> HashMap { + let title_indices: Vec = elements + .iter() + .enumerate() + .filter(|(_, e)| e.element_type == LayoutElementType::ParagraphTitle) + .map(|(idx, _)| idx) + .collect(); + if title_indices.is_empty() { + return HashMap::new(); + } + + let height_samples: Vec<(usize, f32)> = title_indices + .iter() + .filter_map(|&idx| { + let e = &elements[idx]; + let height = (e.bbox.y_max() - e.bbox.y_min()).max(1.0); + let line_h = height / e.num_lines.unwrap_or(1).max(1) as f32; + let v = line_h.max(1.0); + if v.is_finite() { Some((idx, v)) } else { None } + }) + .collect(); + + let indent_samples: Vec<(usize, f32)> = title_indices + .iter() + .filter_map(|&idx| { + let x = elements[idx].bbox.x_min(); + if x.is_finite() { Some((idx, x)) } else { None } + }) + .collect(); + let semantic_levels: HashMap = title_indices + .iter() + .filter_map(|&idx| { + elements[idx] + .text + .as_deref() + .and_then(semantic_title_level) + .map(|level| (idx, level)) + }) + .collect(); - // Clamp level to reasonable range (2-6 for markdown, since # is for doc_title) - let level = level.clamp(2, 6); + let font_levels = infer_levels_by_kmeans_feature(&height_samples, true); + // Smaller x_min (less indent) -> higher-level heading. 
+ let relative_levels = infer_levels_by_kmeans_feature(&indent_samples, false); - (level, formatted) + let mut voted = HashMap::new(); + for idx in title_indices { + let semantic_level = semantic_levels.get(&idx).copied(); + let font_level = font_levels.get(&idx).copied(); + let relative_level = relative_levels.get(&idx).copied(); + + let mut score = [0u8; 7]; + if let Some(level) = semantic_level { + score[level.clamp(1, 6)] += 2; + } + if let Some(level) = font_level { + score[level.clamp(1, 6)] += 1; + } + if let Some(level) = relative_level { + score[level.clamp(1, 6)] += 1; + } + + let mut best_level = semantic_level.unwrap_or(2); + let mut best_score = 0u8; + for (level, &s) in score.iter().enumerate().skip(1) { + if s > best_score { + best_score = s; + best_level = level; + } else if s == best_score && s > 0 { + let is_semantic = semantic_level == Some(level); + let best_is_semantic = semantic_level == Some(best_level); + if (is_semantic && !best_is_semantic) + || (is_semantic == best_is_semantic && level < best_level) + { + best_level = level; + } + } + } + + if best_score == 0 { + best_level = semantic_level + .or(font_level) + .or(relative_level) + .unwrap_or(2); + } + + voted.insert(idx, best_level.clamp(1, 6)); + } + + voted +} + +/// Cluster one scalar feature into heading levels with 1D k-means. +/// +/// `descending=true` means larger feature -> higher-level heading (smaller markdown depth). +/// `descending=false` means smaller feature -> higher-level heading. 
+fn infer_levels_by_kmeans_feature( + samples: &[(usize, f32)], + descending: bool, +) -> HashMap { + let clean_samples: Vec<(usize, f32)> = samples + .iter() + .copied() + .filter(|(_, v)| v.is_finite()) + .collect(); + if clean_samples.len() < 2 { + return HashMap::new(); + } + + let mut values: Vec = clean_samples.iter().map(|(_, v)| *v).collect(); + values.sort_by(|a, b| a.total_cmp(b)); + let unique_count = values + .windows(2) + .filter(|w| (w[1] - w[0]).abs() > 1e-3) + .count() + + 1; + let k = unique_count.clamp(1, 4).min(clean_samples.len()); + if k <= 1 { + return HashMap::new(); + } + + let mut centroids = (0..k) + .map(|i| { + let pos = ((i as f32 + 0.5) / k as f32 * values.len() as f32).floor() as usize; + values[pos.min(values.len() - 1)] + }) + .collect::>(); + + for _ in 0..16 { + let mut sums = vec![0.0f32; k]; + let mut counts = vec![0usize; k]; + for (_, value) in &clean_samples { + let mut best_idx = 0usize; + let mut best_dist = f32::INFINITY; + for (idx, c) in centroids.iter().enumerate() { + let dist = (value - c).abs(); + if dist < best_dist { + best_dist = dist; + best_idx = idx; + } + } + sums[best_idx] += *value; + counts[best_idx] += 1; + } + for idx in 0..k { + if counts[idx] > 0 { + centroids[idx] = sums[idx] / counts[idx] as f32; + } + } + } + + let mut centroid_order: Vec<(usize, f32)> = centroids.iter().copied().enumerate().collect(); + if descending { + centroid_order.sort_by(|a, b| b.1.total_cmp(&a.1)); } else { - // No numbering detected, default to level 2 (## heading) - (2, cleaned) + centroid_order.sort_by(|a, b| a.1.total_cmp(&b.1)); + } + let rank_by_cluster: HashMap = centroid_order + .into_iter() + .enumerate() + .map(|(rank, (cluster_idx, _))| (cluster_idx, rank)) + .collect(); + + let mut result = HashMap::new(); + for (element_idx, value) in &clean_samples { + let mut best_idx = 0usize; + let mut best_dist = f32::INFINITY; + for (idx, c) in centroids.iter().enumerate() { + let dist = (value - c).abs(); + if dist < 
best_dist { + best_dist = dist; + best_idx = idx; + } + } + let rank = rank_by_cluster.get(&best_idx).copied().unwrap_or(0); + let level = (rank + 2).clamp(2, 6); + result.insert(*element_idx, level); } + + result } /// A detected document region block (from PP-DocBlockLayout). @@ -273,8 +477,25 @@ impl StructureResult { .map(|e| &e.bbox) .collect(); + // Compute original image width for image scaling (PaddleX: original_image_width) + let original_image_width = self + .rectified_img + .as_ref() + .map(|img| img.width() as f32) + .or_else(|| { + // Estimate from max element x-coordinate + self.layout_elements + .iter() + .map(|e| e.bbox.x_max()) + .fold(None, |acc, x| Some(acc.map_or(x, |max: f32| max.max(x)))) + }) + .unwrap_or(1.0); + let mut md = String::new(); let elements = &self.layout_elements; + let paragraph_title_levels = infer_paragraph_title_levels(elements); + let mut last_label: Option = None; + let mut prev_element: Option<&LayoutElement> = None; for (idx, element) in elements.iter().enumerate() { // PP-StructureV3 markdown ignores auxiliary labels. @@ -306,35 +527,56 @@ impl StructureResult { } } + // Determine seg_start_flag for paragraph continuity (PaddleX get_seg_flag). + // When both current and previous are "text" and seg_start_flag is false, + // they belong to the same paragraph — join without \n\n separator. 
+ let seg_start_flag = get_seg_flag(element, prev_element); + + let is_continuation = element.element_type == LayoutElementType::Text + && last_label == Some(LayoutElementType::Text) + && !seg_start_flag; + + // Add separator between elements + if !is_continuation { + // Normal case: separate elements with blank line + } + match element.element_type { // Document title LayoutElementType::DocTitle => { - md.push_str("\n# "); + if !md.is_empty() { + md.push_str("\n\n"); + } + md.push_str("# "); if let Some(text) = &element.text { let cleaned = clean_ocr_text(text); md.push_str(&cleaned); } - md.push_str("\n\n"); } // Paragraph/section title - auto-detect numbering for level LayoutElementType::ParagraphTitle => { + if !md.is_empty() { + md.push_str("\n\n"); + } if let Some(text) = &element.text { let cleaned = clean_ocr_text(text); - let (level, formatted_title) = format_title_with_level(&cleaned); - md.push('\n'); + let clustered = paragraph_title_levels.get(&idx).copied(); + let (level, formatted_title) = format_title_with_level(&cleaned, clustered); for _ in 0..level { md.push('#'); } md.push(' '); md.push_str(&formatted_title); - md.push_str("\n\n"); } else { - md.push_str("\n## \n\n"); + md.push_str("## "); } } // Table - preserve HTML structure with border and center alignment // Following PaddleX's format with
wrapper LayoutElementType::Table => { + if !md.is_empty() { + md.push_str("\n\n"); + } if let Some(table) = self.tables.iter().find(|t| t.bbox.iou(&element.bbox) > 0.5) { @@ -343,19 +585,41 @@ impl StructureResult { let simplified = simplify_table_html(html); let table_with_border = simplified.replacen("
", "
", 1); - // Wrap with center-aligned div for better markdown rendering - md.push_str("\n
"); - md.push_str(&table_with_border); - md.push_str("
\n\n"); + // PaddleX format_centered_by_html: clean newlines then wrap + let cleaned = clean_ocr_text(&table_with_border); + md.push_str("
"); + md.push_str(&cleaned); + md.push_str("
"); } else { - md.push_str("\n[Table]\n\n"); + md.push_str("[Table]"); } } else { - md.push_str("\n[Table]\n\n"); + md.push_str("[Table]"); } } + // FormulaNumber - equation labels like "(1)", "Eq. 1" etc. + // PaddleX does NOT include formula_number in handle_funcs_dict, + // so these are silently skipped in markdown output. + LayoutElementType::FormulaNumber => { + continue; + } // Formula - detect inline vs display formula based on context - LayoutElementType::Formula | LayoutElementType::FormulaNumber => { + LayoutElementType::Formula => { + // Extract and clean LaTeX content; skip if empty + let raw_content = element.text.as_deref().map(|s| s.trim()).unwrap_or(""); + if raw_content.is_empty() { + continue; + } + // Remove only outer $$ or $ wrappers if present (from table cell injection) + let latex_content = + if raw_content.starts_with("$$") && raw_content.ends_with("$$") { + &raw_content[2..raw_content.len() - 2] + } else if raw_content.starts_with('$') && raw_content.ends_with('$') { + &raw_content[1..raw_content.len() - 1] + } else { + raw_content + }; + // Check if this formula is on the same line as adjacent text elements // to determine if it's an inline formula or display formula let is_inline = { @@ -383,23 +647,25 @@ impl StructureResult { if is_inline { // Inline formula: use $...$ md.push('$'); - if let Some(latex) = &element.text { - md.push_str(latex); - } + md.push_str(latex_content); md.push_str("$ "); } else { // Display formula: use $$...$$ - md.push_str("\n$$"); - if let Some(latex) = &element.text { - md.push_str(latex); + if !md.is_empty() { + md.push_str("\n\n"); } - md.push_str("$$\n\n"); + md.push_str("$$"); + md.push_str(latex_content); + md.push_str("$$"); } } // Image/Chart - figure format with center alignment LayoutElementType::Image | LayoutElementType::Chart => { + if !md.is_empty() { + md.push_str("\n\n"); + } // Use HTML img tag with center alignment for better rendering - md.push_str("\n
\"Image\"
\n\n"); + // Calculate width percentage relative to original image width (PaddleX logic) + let image_width = element.bbox.x_max() - element.bbox.x_min(); + let width_pct = (image_width / original_image_width * 100.0) as u32; + let width_pct = width_pct.clamp(1, 100); + md.push_str(&format!("{}%", width_pct)); + md.push_str("\" />"); } // Seal - show as image with text LayoutElementType::Seal => { - md.push_str("\n![Seal]"); + if !md.is_empty() { + md.push_str("\n\n"); + } + md.push_str("![Seal]"); if let Some(text) = &element.text { md.push_str("\n> "); md.push_str(text); } - md.push_str("\n\n"); } // Captions - with center alignment following PaddleX _ if element.element_type.is_caption() => { if let Some(text) = &element.text { - md.push_str("\n
"); - md.push_str(text); - md.push_str("
\n\n"); + if !md.is_empty() { + md.push_str("\n\n"); + } + let cleaned = clean_ocr_text(text); + md.push_str("
"); + md.push_str(&cleaned); + md.push_str("
"); } } - // Abstract - following PaddleX format with proper text handling + // Abstract - following PaddleX's format_first_line_func with spliter=" " LayoutElementType::Abstract => { if let Some(text) = &element.text { - // Check for "Abstract" or "摘要" heading - let lower = text.to_lowercase(); - if lower.contains("abstract") || lower.contains("摘要") { - md.push_str("\n## **Abstract**\n\n"); + if !md.is_empty() { + md.push_str("\n\n"); } - let formatted = format_text_block(text); + let formatted = format_first_line(text, " ", &["abstract", "摘要"], "## "); md.push_str(&formatted); - md.push_str("\n\n"); } } - // Reference - following PaddleX's format_reference_block + // Reference - following PaddleX's format_first_line_func with spliter="\n" LayoutElementType::Reference => { if let Some(text) = &element.text { - let formatted = format_reference_block(text); - md.push('\n'); + if !md.is_empty() { + md.push_str("\n\n"); + } + let formatted = + format_first_line(text, "\n", &["references", "参考文献"], "## "); md.push_str(&formatted); - md.push_str("\n\n"); } } // Content (table of contents) - following PaddleX's soft breaks LayoutElementType::Content => { if let Some(text) = &element.text { + if !md.is_empty() { + md.push_str("\n\n"); + } let formatted = format_content_block(text); - md.push('\n'); md.push_str(&formatted); - md.push_str("\n\n"); } } // Footnote - following PaddleX's vision_footnote handling LayoutElementType::Footnote => { if let Some(text) = &element.text { + if !md.is_empty() { + md.push_str("\n\n"); + } let formatted = format_vision_footnote_block(text); - md.push('\n'); md.push_str(&formatted); - md.push_str("\n\n"); } } // List LayoutElementType::List => { if let Some(text) = &element.text { + if !md.is_empty() { + md.push_str("\n\n"); + } let cleaned = format_text_block(text); // Split by newlines and format as list items for line in cleaned.lines() { @@ -491,7 +768,15 @@ impl StructureResult { md.push('\n'); } } - md.push('\n'); + } + } + // 
Algorithm block - PaddleX: block.content.strip("\n") + LayoutElementType::Algorithm => { + if let Some(text) = &element.text { + if !md.is_empty() { + md.push_str("\n\n"); + } + md.push_str(text.trim_matches('\n')); } } // Header/Footer - smaller text (typically excluded from markdown) @@ -503,12 +788,23 @@ impl StructureResult { // Default text elements - following PaddleX's text handling _ => { if let Some(text) = &element.text { - let formatted = format_text_block(text); - md.push_str(&formatted); - md.push_str("\n\n"); + // For text continuation (same paragraph), join directly + if is_continuation { + let formatted = format_text_block(text); + md.push_str(&formatted); + } else { + if !md.is_empty() { + md.push_str("\n\n"); + } + let formatted = format_text_block(text); + md.push_str(&formatted); + } } } } + + last_label = Some(element.element_type); + prev_element = Some(element); } md.trim().to_string() } @@ -793,6 +1089,67 @@ impl StructureResult { } } +/// Determines paragraph continuity flags for the current element relative to the previous. +/// +/// This implements PaddleX's `get_seg_flag` logic from `layout_parsing/utils.py`: +/// - `seg_start_flag = true` means this element starts a NEW paragraph +/// - `seg_start_flag = false` means this element CONTINUES the previous paragraph +/// +/// The logic checks whether: +/// 1. Previous block's last line ends near the right edge (text fills to right) +/// 2. Current block's first line starts near the left edge (no indentation) +/// 3. Previous block has more than one line +/// 4. The two blocks are horizontally close enough +/// +/// Returns `seg_start_flag` (true = new paragraph, false = continuation). 
+fn get_seg_flag(current: &LayoutElement, prev: Option<&LayoutElement>) -> bool { + const COORD_THRESHOLD: f32 = 10.0; + + let seg_start = current.seg_start_x.unwrap_or(current.bbox.x_min()); + let mut context_left = current.bbox.x_min(); + let mut context_right = current.bbox.x_max(); + + if let Some(prev) = prev { + let prev_seg_end = prev.seg_end_x.unwrap_or(prev.bbox.x_max()); + let prev_num_lines = prev.num_lines.unwrap_or(1); + + // Check if blocks overlap horizontally + let overlap_blocks = context_left < prev.bbox.x_max() && context_right > prev.bbox.x_min(); + + let edge_distance; + if overlap_blocks { + context_left = context_left.min(prev.bbox.x_min()); + context_right = context_right.max(prev.bbox.x_max()); + edge_distance = 0.0; + } else { + edge_distance = (current.bbox.x_min() - prev.bbox.x_max()).abs(); + } + + let prev_end_space_small = (context_right - prev_seg_end).abs() < COORD_THRESHOLD; + let current_start_space_small = seg_start - context_left < COORD_THRESHOLD; + let prev_lines_more_than_one = prev_num_lines > 1; + let blocks_close = edge_distance + < (prev.bbox.x_max() - prev.bbox.x_min()) + .max(current.bbox.x_max() - current.bbox.x_min()); + + if prev_end_space_small + && current_start_space_small + && prev_lines_more_than_one + && blocks_close + { + return false; // continuation + } + + true // new paragraph + } else { + // First element: check if text starts near the left edge + if seg_start - context_left < COORD_THRESHOLD { + return false; // continuation from previous page (no indentation) + } + true + } +} + /// Checks if a text element appears to start a new paragraph. /// /// Following PaddleX's logic: if the text starts near the left edge of the page @@ -908,6 +1265,49 @@ fn clean_ocr_text(text: &str) -> String { text.replace("-\n", "").replace('\n', " ") } +/// Formats the first non-empty line of a block if it matches a template keyword. +/// +/// This is the Rust equivalent of PaddleX's `format_first_line_func`: +/// 1. 
Split text by `spliter` +/// 2. Find the first non-empty token +/// 3. If it matches any template (case-insensitive exact match), replace it with `format_func(token)` +/// 4. Rejoin with `spliter` +/// +/// For abstract: `spliter=" "`, templates=["abstract","摘要"], format_func= `## {}\n` +/// For reference: `spliter="\n"`, templates=["references","参考文献"], format_func= `## {}` +fn format_first_line( + text: &str, + spliter: &str, + templates: &[&str], + heading_prefix: &str, +) -> String { + let parts: Vec<&str> = text.split(spliter).collect(); + let mut result_parts: Vec = Vec::with_capacity(parts.len()); + let mut found_first = false; + + for part in &parts { + if !found_first { + let trimmed = part.trim(); + if trimmed.is_empty() { + result_parts.push(part.to_string()); + continue; + } + found_first = true; + // Check if the first non-empty token matches a template (case-insensitive) + if templates.iter().any(|t| trimmed.eq_ignore_ascii_case(t)) { + // Replace with formatted heading: "## \n" + result_parts.push(format!("{}{}\n", heading_prefix, trimmed)); + } else { + result_parts.push(part.to_string()); + } + } else { + result_parts.push(part.to_string()); + } + } + + result_parts.join(spliter) +} + /// Formats text blocks following PaddleX's text handling: /// 1. First remove hyphenation: `-\n` -> `` (join broken words) /// 2. 
Then: `.replace("\n\n", "\n").replace("\n", "\n\n")` @@ -933,49 +1333,6 @@ fn format_content_block(text: &str) -> String { step1.replace('\n', " \n") } -/// Formats reference blocks, following PaddleX's `format_first_line_func`: -/// - First remove hyphenation: `-\n` -> `` -/// - Detects "References" or "参考文献" keyword -/// - Adds markdown heading if found -fn format_reference_block(text: &str) -> String { - // First remove hyphenation - let dehyphenated = text.replace("-\n", ""); - let lines: Vec<&str> = dehyphenated.lines().collect(); - - // Check first non-empty line for reference keywords - let mut result = String::new(); - let mut added_heading = false; - - for (i, line) in lines.iter().enumerate() { - let trimmed = line.trim(); - if trimmed.is_empty() { - continue; - } - - // Check if this is a reference heading line - if !added_heading && (trimmed.contains("References") || trimmed.contains("参考文献")) { - result.push_str("## **References**\n\n"); - added_heading = true; - // Skip the heading line itself, continue with content - continue; - } - - // Add remaining lines - if i > 0 || result.is_empty() { - if !result.is_empty() { - result.push('\n'); - } - result.push_str(trimmed); - } - } - - if result.is_empty() { - dehyphenated - } else { - result - } -} - /// Formats vision footnote blocks following PaddleX: /// 1. First remove hyphenation: `-\n` -> `` /// 2. Then: `.replace("\n\n", "\n").replace("\n", "\n\n")` @@ -1018,17 +1375,11 @@ fn is_digit(c: char) -> bool { } /// Removes PDF hyphenation artifacts from text. +/// Dehyphenation: only handles hyphen-newline patterns (word breaks across lines). /// -/// PDFs often break words at line ends with hyphens like "frame-work", -/// "com-pared", etc. This function detects and removes these hyphens -/// when they appear to be line-break hyphens rather than intentional hyphens. -/// -/// Rules: -/// 1. Hyphen followed by lowercase letter is likely a hyphenation artifact -/// 2. 
Hyphen followed by space and lowercase letter is also artifact -/// 3. Hyphen followed by newline and lowercase letter is artifact -/// 4. Preserve intentional hyphens (compound words, hyphenated phrases) -/// 5. Preserve hyphens in URLs and technical patterns +/// Matches PaddleX's behavior where hyphens are only stripped at line boundaries +/// (hyphen immediately followed by newline). Mid-word hyphens in compound words +/// like "real-time", "end-to-end", "one-to-many" are preserved. fn dehyphenate(text: &str) -> String { let mut result = String::with_capacity(text.len()); let chars: Vec = text.chars().collect(); @@ -1037,7 +1388,6 @@ fn dehyphenate(text: &str) -> String { // Helper to check if we're in a URL-like pattern let is_url_context = |pos: usize| -> bool { - // Look at a window around the hyphen for URL patterns let start = pos.saturating_sub(10); let end = (pos + 5).min(len); let window: String = chars[start..end].iter().collect(); @@ -1046,33 +1396,18 @@ fn dehyphenate(text: &str) -> String { while i < len { if chars[i] == '-' { - // Skip dehyphenation for URL contexts if is_url_context(i) { result.push('-'); i += 1; continue; } - // Check if this is a hyphenation artifact - let is_artifact = if i + 1 < len { - let next = chars[i + 1]; - if next == '\n' { - // Hyphen followed by newline - check what's after the newline - if i + 2 < len { - let after_newline = chars[i + 2]; - is_lowercase(after_newline) - } else { - false - } - } else if is_lowercase(next) { - // Hyphen followed directly by lowercase letter (e.g., "com-puted") - // But check if preceded by lowercase to avoid removing intentional hyphens - // like in "RT-DETR" or "one-to-many" - i > 0 && is_lowercase(chars[i - 1]) - } else if next.is_whitespace() && i + 2 < len { - let after_space = chars[i + 2]; - // Hyphen + space + lowercase letter (e.g., "com- puted") - is_lowercase(after_space) && i > 0 && is_lowercase(chars[i - 1]) + // Only dehyphenate when hyphen is followed by newline 
(line-break hyphenation). + // Pattern: "word-\nletter" → "wordletter" + let is_artifact = if i + 1 < len && chars[i + 1] == '\n' { + // Hyphen followed by newline — check if next line starts with lowercase + if i + 2 < len { + is_lowercase(chars[i + 2]) } else { false } @@ -1081,14 +1416,8 @@ fn dehyphenate(text: &str) -> String { }; if is_artifact { - // Skip the hyphen - // Also skip following newline/space if present - if i + 1 < len { - let next = chars[i + 1]; - if next == '\n' || next.is_whitespace() { - i += 1; - } - } + // Skip the hyphen and the following newline + i += 1; // skip newline (will be incremented again at end of loop) } else { result.push('-'); } @@ -1372,13 +1701,33 @@ pub fn postprocess_markdown(markdown: &str) -> String { continue; } - // Skip processing inside code/formula blocks - if in_code_block || in_formula { + // Skip processing inside code blocks + if in_code_block { result.push_str(line); result.push('\n'); continue; } + // If inside a formula block, ensure it doesn't contain unescaped dollar signs + // which cause KaTeX "Can't use function '$' in math mode" errors. + if in_formula { + // If the formula content looks like regular text (many spaces, few backslashes) + // and contains a $, KaTeX will fail. We escape the $ inside the math block. + let contains_dollar = line.contains('$'); + let is_plain_text = line.split_whitespace().count() > 3 && !line.contains('\\'); + + if contains_dollar && is_plain_text { + result.push_str(&line.replace('$', "\\$")); + } else if contains_dollar { + // Remove redundant dollar signs inside the block + result.push_str(&line.replace('$', "")); + } else { + result.push_str(line); + } + result.push('\n'); + continue; + } + // Process text content (skip headers, lists, etc.) if trimmed.starts_with('#') || trimmed.starts_with('*') @@ -1482,6 +1831,20 @@ pub struct LayoutElement { /// formulas, images, etc.) will have an order index assigned. 
/// Headers, footers, and other auxiliary elements may have `None`. pub order_index: Option, + /// X-coordinate of the first text span's left edge within this element. + /// Used by `get_seg_flag` to detect paragraph continuity across blocks. + /// Computed during stitching from the first OCR region (after spatial sort). + #[serde(skip_serializing_if = "Option::is_none")] + pub seg_start_x: Option, + /// X-coordinate of the last text span's right edge within this element. + /// Used by `get_seg_flag` to detect paragraph continuity across blocks. + /// Computed during stitching from the last OCR region (after spatial sort). + #[serde(skip_serializing_if = "Option::is_none")] + pub seg_end_x: Option, + /// Number of text lines within this element. + /// Used by `get_seg_flag` to detect paragraph continuity across blocks. + #[serde(skip_serializing_if = "Option::is_none")] + pub num_lines: Option, } impl LayoutElement { @@ -1494,6 +1857,9 @@ impl LayoutElement { label: None, text: None, order_index: None, + seg_start_x: None, + seg_end_x: None, + num_lines: None, } } @@ -1985,6 +2351,16 @@ pub struct TableResult { /// Structure tokens from table structure recognition (used for HTML generation after stitching) #[serde(skip)] pub structure_tokens: Option>, + /// Detected cell bounding boxes from the cell detection model (in page coordinates). + /// Stored separately from `cells` (which carry structure/grid metadata from the structure model) + /// and used by the stitcher for row-aware IoA-based OCR matching. + #[serde(skip)] + pub detected_cell_bboxes: Option>, + /// Whether the table was processed in end-to-end (E2E) mode. + /// When true, cells come from the structure model only (no separate cell detection). + /// Used by the stitcher to select the appropriate OCR matching strategy. 
+ #[serde(skip)] + pub is_e2e: bool, } impl TableResult { @@ -1999,6 +2375,8 @@ impl TableResult { html_structure: None, cell_texts: None, structure_tokens: None, + detected_cell_bboxes: None, + is_e2e: false, } } @@ -2038,6 +2416,18 @@ impl TableResult { self } + /// Stores detected cell bounding boxes for the stitcher's row-aware IoA matcher. + pub fn with_detected_cell_bboxes(mut self, bboxes: Vec) -> Self { + self.detected_cell_bboxes = Some(bboxes); + self + } + + /// Marks this table as processed in end-to-end (E2E) mode. + pub fn with_e2e(mut self, is_e2e: bool) -> Self { + self.is_e2e = is_e2e; + self + } + /// Returns the best available confidence score for this table. /// /// This method provides a unified confidence API for callers who want to filter @@ -2206,4 +2596,147 @@ mod tests { assert!(html.contains("

Test Document

")); assert!(html.contains("

Hello world

")); } + + #[test] + fn test_format_title_with_level_keywords() { + let (level, text) = format_title_with_level("Abstract", None); + assert_eq!(level, 1); + assert_eq!(text, "Abstract"); + + let (level, text) = format_title_with_level("References:", None); + assert_eq!(level, 1); + assert_eq!(text, "References:"); + } + + #[test] + fn test_format_title_with_level_cluster_fallback() { + let (level, text) = format_title_with_level("Unnumbered Heading", Some(4)); + assert_eq!(level, 4); + assert_eq!(text, "Unnumbered Heading"); + } + + #[test] + fn test_to_markdown_skips_footnote() { + let mut result = StructureResult::new("test.jpg", 0); + let body = LayoutElement::new( + BoundingBox::from_coords(0.0, 0.0, 100.0, 30.0), + LayoutElementType::Text, + 1.0, + ) + .with_text("Body"); + let footnote = LayoutElement::new( + BoundingBox::from_coords(0.0, 40.0, 100.0, 60.0), + LayoutElementType::Footnote, + 1.0, + ) + .with_text("Footnote text"); + result = result.with_layout_elements(vec![body, footnote]); + + let md = result.to_markdown(); + assert!(md.contains("Body")); + assert!(!md.contains("Footnote text")); + } + + #[test] + fn test_to_markdown_doc_title_joins_lines_with_space() { + let mut result = StructureResult::new("test.jpg", 0); + let title = LayoutElement::new( + BoundingBox::from_coords(0.0, 0.0, 100.0, 20.0), + LayoutElementType::DocTitle, + 1.0, + ) + .with_text("Main\nTitle"); + result = result.with_layout_elements(vec![title]); + let md = result.to_markdown(); + assert!(md.contains("# Main Title")); + } + + #[test] + fn test_to_markdown_content_uses_soft_breaks() { + let mut result = StructureResult::new("test.jpg", 0); + let toc = LayoutElement::new( + BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0), + LayoutElementType::Content, + 1.0, + ) + .with_text("1 Intro\n2 Method"); + result = result.with_layout_elements(vec![toc]); + let md = result.to_markdown(); + assert!(md.contains("1 Intro \n2 Method")); + } + + #[test] + fn 
test_infer_paragraph_title_levels_by_height() { + let titles = vec![ + LayoutElement::new( + BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0), + LayoutElementType::ParagraphTitle, + 1.0, + ) + .with_text("Large"), + LayoutElement::new( + BoundingBox::from_coords(0.0, 50.0, 100.0, 74.0), + LayoutElementType::ParagraphTitle, + 1.0, + ) + .with_text("Medium"), + LayoutElement::new( + BoundingBox::from_coords(0.0, 80.0, 100.0, 98.0), + LayoutElementType::ParagraphTitle, + 1.0, + ) + .with_text("Small"), + ]; + + let levels = infer_paragraph_title_levels(&titles); + let l0 = levels.get(&0).copied().unwrap_or(2); + let l1 = levels.get(&1).copied().unwrap_or(2); + let l2 = levels.get(&2).copied().unwrap_or(2); + assert!(l0 <= l1 && l1 <= l2); + } + + #[test] + fn test_infer_paragraph_title_levels_semantic_vote_wins_tie() { + let titles = vec![ + LayoutElement::new( + BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0), + LayoutElementType::ParagraphTitle, + 1.0, + ) + .with_text("1.1 Detail"), + LayoutElement::new( + BoundingBox::from_coords(0.0, 50.0, 100.0, 70.0), + LayoutElementType::ParagraphTitle, + 1.0, + ) + .with_text("2 Intro"), + ]; + + let levels = infer_paragraph_title_levels(&titles); + assert_eq!(levels.get(&0).copied(), Some(3)); + assert_eq!(levels.get(&1).copied(), Some(2)); + } + + #[test] + fn test_infer_paragraph_title_levels_uses_relative_indent_signal() { + let titles = vec![ + LayoutElement::new( + BoundingBox::from_coords(0.0, 0.0, 100.0, 24.0), + LayoutElementType::ParagraphTitle, + 1.0, + ) + .with_text("Heading A"), + LayoutElement::new( + BoundingBox::from_coords(40.0, 40.0, 140.0, 64.0), + LayoutElementType::ParagraphTitle, + 1.0, + ) + .with_text("Heading B"), + ]; + + let levels = infer_paragraph_title_levels(&titles); + let left_level = levels.get(&0).copied().unwrap_or(2); + let indented_level = levels.get(&1).copied().unwrap_or(2); + assert!(left_level < indented_level); + } } diff --git a/oar-ocr-core/src/domain/text_region.rs 
b/oar-ocr-core/src/domain/text_region.rs index 44fbbf5..252c4bb 100644 --- a/oar-ocr-core/src/domain/text_region.rs +++ b/oar-ocr-core/src/domain/text_region.rs @@ -37,12 +37,18 @@ pub struct TextRegion { /// Only populated when word-level detection is enabled. /// Each box corresponds to a word or character in the recognized text. pub word_boxes: Option>, + /// Label indicating the type of this text region. + /// Used to distinguish between normal text and special content like formulas. + /// Common values: "formula", "text", "seal", etc. + /// PaddleX: corresponds to `rec_labels` in OCR results. + #[serde(default)] + pub label: Option>, } impl TextRegion { /// Creates a new TextRegion with the given bounding box. /// - /// The text, confidence, orientation_angle, and word_boxes are initially set to None. + /// The text, confidence, orientation_angle, word_boxes, and label are initially set to None. pub fn new(bounding_box: BoundingBox) -> Self { Self { bounding_box, @@ -52,6 +58,7 @@ impl TextRegion { confidence: None, orientation_angle: None, word_boxes: None, + label: None, } } @@ -69,6 +76,7 @@ impl TextRegion { confidence, orientation_angle: None, word_boxes: None, + label: None, } } @@ -87,6 +95,7 @@ impl TextRegion { confidence, orientation_angle, word_boxes: None, + label: None, } } @@ -117,4 +126,20 @@ impl TextRegion { _ => None, } } + + /// Returns true if this text region has a label. + pub fn has_label(&self) -> bool { + self.label.is_some() + } + + /// Returns true if this text region is labeled as a formula. + pub fn is_formula(&self) -> bool { + self.label.as_deref() == Some("formula") + } + + /// Sets the label for this text region. 
+ pub fn with_label(mut self, label: Option<&str>) -> Self { + self.label = label.map(|s| s.into()); + self + } } diff --git a/oar-ocr-core/src/processors/geometry.rs b/oar-ocr-core/src/processors/geometry.rs index fa96f54..602bc65 100644 --- a/oar-ocr-core/src/processors/geometry.rs +++ b/oar-ocr-core/src/processors/geometry.rs @@ -815,23 +815,20 @@ impl BoundingBox { 90 => { // Image was rotated 270° counter-clockwise (or 90° clockwise) to correct // Inverse: rotate box 90° clockwise - // (x, y) in rotated → (rotated_height - 1 - y, x) in original - Point::new(rotated_height as f32 - 1.0 - p.y, p.x) + // (x, y) in rotated → (rotated_height - y, x) in original + Point::new(rotated_height as f32 - p.y, p.x) } 180 => { // Image was rotated 180° to correct // Inverse: rotate box 180° - // (x, y) in rotated → (rotated_width - 1 - x, rotated_height - 1 - y) in original - Point::new( - rotated_width as f32 - 1.0 - p.x, - rotated_height as f32 - 1.0 - p.y, - ) + // (x, y) in rotated → (rotated_width - x, rotated_height - y) in original + Point::new(rotated_width as f32 - p.x, rotated_height as f32 - p.y) } 270 => { // Image was rotated 90° counter-clockwise (or 270° clockwise) to correct // Inverse: rotate box 270° clockwise (or 90° counter-clockwise) - // (x, y) in rotated → (y, rotated_width - 1 - x) in original - Point::new(p.y, rotated_width as f32 - 1.0 - p.x) + // (x, y) in rotated → (y, rotated_width - x) in original + Point::new(p.y, rotated_width as f32 - p.x) } _ => { // No rotation (0° or unknown) @@ -1215,12 +1212,12 @@ mod tests { let bbox = BoundingBox::from_coords(0.0, 0.0, 1.0, 1.0); let rotated = bbox.rotate_back_to_original(90.0, rotated_width, rotated_height); - // angle=90 inverse mapping: (x, y) -> (rotated_height-1-y, x) + // angle=90 inverse mapping: (x, y) -> (rotated_height - y, x) let expected = BoundingBox::new(vec![ - Point::new(3.0, 0.0), + Point::new(4.0, 0.0), + Point::new(4.0, 1.0), Point::new(3.0, 1.0), - Point::new(2.0, 1.0), - 
Point::new(2.0, 0.0), + Point::new(3.0, 0.0), ]); assert_eq!(rotated.points, expected.points); } @@ -1232,12 +1229,12 @@ mod tests { let bbox = BoundingBox::from_coords(1.0, 1.0, 2.0, 2.0); let rotated = bbox.rotate_back_to_original(180.0, rotated_width, rotated_height); - // angle=180 inverse mapping: (x, y) -> (rotated_width-1-x, rotated_height-1-y) + // angle=180 inverse mapping: (x, y) -> (rotated_width - x, rotated_height - y) let expected = BoundingBox::new(vec![ + Point::new(3.0, 2.0), + Point::new(2.0, 2.0), Point::new(2.0, 1.0), - Point::new(1.0, 1.0), - Point::new(1.0, 0.0), - Point::new(2.0, 0.0), + Point::new(3.0, 1.0), ]); assert_eq!(rotated.points, expected.points); } @@ -1250,12 +1247,12 @@ mod tests { let bbox = BoundingBox::from_coords(0.0, 0.0, 1.0, 1.0); let rotated = bbox.rotate_back_to_original(270.0, rotated_width, rotated_height); - // angle=270 inverse mapping: (x, y) -> (y, rotated_width-1-x) + // angle=270 inverse mapping: (x, y) -> (y, rotated_width - x) let expected = BoundingBox::new(vec![ + Point::new(0.0, 3.0), Point::new(0.0, 2.0), - Point::new(0.0, 1.0), - Point::new(1.0, 1.0), Point::new(1.0, 2.0), + Point::new(1.0, 3.0), ]); assert_eq!(rotated.points, expected.points); } diff --git a/oar-ocr-core/src/processors/layout_sorting.rs b/oar-ocr-core/src/processors/layout_sorting.rs index 7a587f3..f504bdd 100644 --- a/oar-ocr-core/src/processors/layout_sorting.rs +++ b/oar-ocr-core/src/processors/layout_sorting.rs @@ -1,68 +1,72 @@ -//! Enhanced layout sorting logic compatible with PP-StructureV3. +//! Enhanced layout sorting logic — `xycut_enhanced` algorithm. //! -//! This module implements the `xycut_enhanced` strategy which handles complex layouts -//! by separating headers/footers, identifying cross-column elements, and using -//! weighted distance metrics to insert titles and figures into the reading order. +//! Faithful port of PaddleX's `xycut_enhanced` strategy: +//! 1. Header/Footer separation +//! 2. 
Cross-layout detection (blocks spanning multiple columns) +//! 3. Direction-aware XY-cut sorting +//! 4. Overlapping box shrinking before projection +//! 5. Weighted distance insertion for special blocks +//! 6. Child block association (vision titles → vision parents) use crate::domain::structure::LayoutElementType; +use crate::processors::sorting::calculate_overlap_ratio; use crate::processors::{BoundingBox, SortDirection, sort_by_xycut}; +/// XYCUT_SETTINGS constants (matching PaddleX setting.py) +const EDGE_DISTANCE_COMPARE_TOLERANCE_LEN: f32 = 2.0; +const EDGE_WEIGHT: f32 = 10000.0; // 10^4 +const UP_EDGE_WEIGHT: f32 = 1.0; +const LEFT_EDGE_WEIGHT: f32 = 2.0; +const CROSS_LAYOUT_REF_TEXT_BLOCK_WORDS_NUM_THRESHOLD: f32 = 10.0; + /// Label used for sorting logic. -/// -/// Matches standard block categories. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum OrderLabel { - Header, // header_labels - Footer, // footer_labels - DocTitle, // doc_title_labels - ParagraphTitle, // paragraph_title_labels - Vision, // vision_labels - VisionTitle, // vision_title_labels - Unordered, // unordered_labels - NormalText, // text_labels - CrossLayout, // derived internally - Reference, // special case + Header, + Footer, + DocTitle, + ParagraphTitle, + Vision, + VisionTitle, + Unordered, + NormalText, + CrossLayout, + CrossReference, + Reference, } impl OrderLabel { pub fn from_element_type(et: LayoutElementType) -> Self { - // Mapped based on standard block labels. 
match et { - // header_labels LayoutElementType::Header | LayoutElementType::HeaderImage => OrderLabel::Header, - // footer_labels LayoutElementType::Footer | LayoutElementType::FooterImage | LayoutElementType::Footnote => OrderLabel::Footer, - // doc_title_labels LayoutElementType::DocTitle => OrderLabel::DocTitle, - // paragraph_title_labels - LayoutElementType::ParagraphTitle - | LayoutElementType::Reference - | LayoutElementType::Content => OrderLabel::ParagraphTitle, + LayoutElementType::ParagraphTitle | LayoutElementType::Content => { + OrderLabel::ParagraphTitle + } + + LayoutElementType::Reference => OrderLabel::Reference, - // vision_labels LayoutElementType::Image | LayoutElementType::Table | LayoutElementType::Chart | LayoutElementType::Algorithm => OrderLabel::Vision, - // vision_title_labels LayoutElementType::FigureTitle | LayoutElementType::TableTitle | LayoutElementType::ChartTitle | LayoutElementType::FigureTableChartTitle => OrderLabel::VisionTitle, - // unordered_labels LayoutElementType::AsideText | LayoutElementType::Seal | LayoutElementType::Number | LayoutElementType::FormulaNumber => OrderLabel::Unordered, - // text_labels (default fallback) LayoutElementType::Text | LayoutElementType::List | LayoutElementType::Abstract @@ -72,13 +76,6 @@ impl OrderLabel { _ => OrderLabel::NormalText, } } - - pub fn is_header(&self) -> bool { - matches!(self, OrderLabel::Header) - } - pub fn is_footer(&self) -> bool { - matches!(self, OrderLabel::Footer) - } } /// A wrapper around layout elements with properties needed for sorting. 
@@ -87,58 +84,95 @@ pub struct SortableBlock { pub bbox: BoundingBox, pub original_index: usize, pub order_label: OrderLabel, - pub direction: SortDirection, // Derived from aspect ratio + pub element_type: LayoutElementType, + pub direction: SortDirection, + pub num_lines: u32, + pub text_line_height: f32, } impl SortableBlock { - pub fn new(bbox: BoundingBox, original_index: usize, element_type: LayoutElementType) -> Self { + pub fn new( + bbox: BoundingBox, + original_index: usize, + element_type: LayoutElementType, + num_lines: Option, + ) -> Self { let order_label = OrderLabel::from_element_type(element_type); let width = bbox.x_max() - bbox.x_min(); let height = bbox.y_max() - bbox.y_min(); - - // Logic: horizontal if width >= height (ratio 1.0) let direction = if width >= height { SortDirection::Horizontal } else { SortDirection::Vertical }; + let num_lines = num_lines.unwrap_or(1).max(1); + let text_line_height = if num_lines > 0 { + height / num_lines as f32 + } else { + height + }; Self { bbox, original_index, order_label, + element_type, direction, + num_lines, + text_line_height, } } + pub fn width(&self) -> f32 { + self.bbox.x_max() - self.bbox.x_min() + } + + pub fn height(&self) -> f32 { + self.bbox.y_max() - self.bbox.y_min() + } + + pub fn area(&self) -> f32 { + self.width() * self.height() + } + pub fn center(&self) -> (f32, f32) { ( (self.bbox.x_min() + self.bbox.x_max()) / 2.0, (self.bbox.y_min() + self.bbox.y_max()) / 2.0, ) } + + pub fn long_side_length(&self) -> f32 { + self.width().max(self.height()) + } +} + +/// Input element for enhanced sorting. +pub struct SortableElement { + pub bbox: BoundingBox, + pub element_type: LayoutElementType, + pub num_lines: Option, } /// Main entry point for enhanced sorting. /// /// Returns a list of original indices in the correct reading order. 
pub fn sort_layout_enhanced( - elements: &[(BoundingBox, LayoutElementType)], + elements: &[SortableElement], page_width: f32, - page_height: f32, + _page_height: f32, ) -> Vec { if elements.is_empty() { return Vec::new(); } - // 1. Convert to SortableBlocks let blocks: Vec = elements .iter() .enumerate() - .map(|(i, (bbox, et))| SortableBlock::new(bbox.clone(), i, *et)) + .map(|(i, e)| SortableBlock::new(e.bbox.clone(), i, e.element_type, e.num_lines)) .collect(); - // 2. Separate into groups + // Separate headers/footers let mut header_blocks = Vec::new(); let mut footer_blocks = Vec::new(); let mut main_blocks = Vec::new(); @@ -151,24 +185,11 @@ pub fn sort_layout_enhanced( } } - // 3. Sort Headers and Footers (simple top-to-bottom) - header_blocks.sort_by(|a, b| { - a.bbox - .y_min() - .partial_cmp(&b.bbox.y_min()) - .unwrap_or(std::cmp::Ordering::Equal) - }); - footer_blocks.sort_by(|a, b| { - a.bbox - .y_min() - .partial_cmp(&b.bbox.y_min()) - .unwrap_or(std::cmp::Ordering::Equal) - }); + sort_blocks_by_y(&mut header_blocks); + sort_blocks_by_y(&mut footer_blocks); - // 4. Sort Main Blocks using Enhanced Logic - let sorted_main = sort_main_blocks(main_blocks, page_width, page_height); + let sorted_main = sort_main_blocks(main_blocks, page_width); - // 5. Combine let mut result = Vec::with_capacity(elements.len()); result.extend(header_blocks.into_iter().map(|b| b.original_index)); result.extend(sorted_main.into_iter().map(|b| b.original_index)); @@ -177,87 +198,539 @@ pub fn sort_layout_enhanced( result } -fn sort_main_blocks( - blocks: Vec, - _page_width: f32, - _page_height: f32, -) -> Vec { +fn sort_blocks_by_y(blocks: &mut [SortableBlock]) { + blocks.sort_by(|a, b| { + a.bbox + .y_min() + .partial_cmp(&b.bbox.y_min()) + .unwrap_or(std::cmp::Ordering::Equal) + }); +} + +fn sort_main_blocks(mut blocks: Vec, page_width: f32) -> Vec { + if blocks.is_empty() { + return blocks; + } + + // 1. 
Cross-layout detection (PaddleX get_layout_structure) + detect_cross_layout(&mut blocks, page_width); + + // 2. Separate blocks for XY-cut vs special insertion + // PaddleX SKIP_ORDER_LABELS are inserted by weighted distance after main XY-cut. let mut xy_cut_blocks = Vec::new(); - let mut vision_blocks = Vec::new(); // Tables, Images (Anchors) - let mut other_unsorted_blocks = Vec::new(); // Titles, etc. let mut doc_title_blocks = Vec::new(); + let mut weighted_insert_blocks = Vec::new(); + let mut unordered_blocks = Vec::new(); for block in blocks { match block.order_label { - OrderLabel::NormalText | OrderLabel::Unordered => xy_cut_blocks.push(block), + OrderLabel::CrossLayout + | OrderLabel::CrossReference + | OrderLabel::Vision + | OrderLabel::VisionTitle => weighted_insert_blocks.push(block), OrderLabel::DocTitle => doc_title_blocks.push(block), - OrderLabel::Vision => vision_blocks.push(block), - _ => other_unsorted_blocks.push(block), + OrderLabel::Unordered => unordered_blocks.push(block), + _ => xy_cut_blocks.push(block), } } - // Sort xy_cut_blocks using standard XY-cut + // 3. Direction-aware XY-cut on xy_cut_blocks let mut sorted_blocks = if !xy_cut_blocks.is_empty() { - let bboxes: Vec = xy_cut_blocks.iter().map(|b| b.bbox.clone()).collect(); - let indices = sort_by_xycut(&bboxes, SortDirection::Vertical, 1); - indices - .into_iter() - .map(|i| xy_cut_blocks[i].clone()) - .collect() + direction_aware_xycut_sort(&mut xy_cut_blocks) } else { Vec::new() }; - // Insertion Order Strategy: - // 1. DocTitle (Global context) - // 2. Vision (Tables/Images - strong anchors) - // 3. VisionTitle/ParagraphTitle (Weakly attached, depend on anchors) + // 4. 
Match unsorted blocks using weighted distance insertion + // Order: doc_title first (PaddleX inserts first doc_title at position 0) + sort_blocks_by_y(&mut doc_title_blocks); + for (i, block) in doc_title_blocks.into_iter().enumerate() { + if i == 0 && sorted_blocks.is_empty() { + sorted_blocks.push(block); + } else if i == 0 { + sorted_blocks.insert(0, block); + } else { + weighted_distance_insert(block, &mut sorted_blocks, SortDirection::Horizontal); + } + } - // 1. DocTitle - doc_title_blocks.sort_by(|a, b| { - a.bbox - .y_min() - .partial_cmp(&b.bbox.y_min()) - .unwrap_or(std::cmp::Ordering::Equal) - }); - for block in doc_title_blocks { + // Vision/cross-layout/title blocks are inserted after XY-cut. + sort_blocks_by_y(&mut weighted_insert_blocks); + for block in weighted_insert_blocks { weighted_distance_insert(block, &mut sorted_blocks, SortDirection::Horizontal); } - // 2. Vision (Tables, Images) - // Sort by position to stabilize insertion - vision_blocks.sort_by(|a, b| { - a.bbox - .y_min() - .partial_cmp(&b.bbox.y_min()) - .unwrap_or(std::cmp::Ordering::Equal) - }); - for block in vision_blocks { - weighted_distance_insert(block, &mut sorted_blocks, SortDirection::Horizontal); + // Unordered blocks using manhattan distance + sort_blocks_by_y(&mut unordered_blocks); + for block in unordered_blocks { + manhattan_insert(block, &mut sorted_blocks); + } + + // 5. Associate child blocks (vision titles next to vision parents) + associate_child_blocks(&mut sorted_blocks); + + sorted_blocks +} + +/// Direction-aware XY-cut sorting (PaddleX xycut_enhanced lines 539-584). +/// +/// If single column or all blocks have 1 line → use secondary direction (xy_cut). +/// If multi-column → use primary direction (yx_cut). 
+fn direction_aware_xycut_sort(blocks: &mut [SortableBlock]) -> Vec { + let bboxes: Vec = blocks.iter().map(|b| b.bbox.clone()).collect(); + let max_text_lines = blocks.iter().map(|b| b.num_lines).max().unwrap_or(1); + + // Check column structure using horizontal projection + let discontinuous = calculate_discontinuous_projection(&bboxes, SortDirection::Horizontal); + + // Shrink overlapping boxes before XY-cut + shrink_overlapping_boxes(blocks, SortDirection::Vertical); + + let shrunk_bboxes: Vec = blocks.iter().map(|b| b.bbox.clone()).collect(); + + let sorted_indices = if discontinuous.len() == 1 || max_text_lines == 1 { + // Single column: use secondary direction (XY-cut = X first, then Y) + sort_by_xycut(&shrunk_bboxes, SortDirection::Horizontal, 1) + } else { + // Multi-column: use primary direction (YX-cut = Y first, then X) + sort_by_xycut(&shrunk_bboxes, SortDirection::Vertical, 1) + }; + + sorted_indices + .into_iter() + .map(|i| blocks[i].clone()) + .collect() +} + +/// Cross-layout detection (port of PaddleX `get_layout_structure`). +/// +/// Marks blocks that span multiple columns as `CrossLayout`. +fn detect_cross_layout(blocks: &mut [SortableBlock], _page_width: f32) { + if blocks.len() < 2 { + return; } - // 3. Other Unsorted (Titles, CrossLayout, etc.) 
- other_unsorted_blocks.sort_by(|a, b| { + // Sort by x_min, then width (matching PaddleX) + blocks.sort_by(|a, b| { a.bbox - .y_min() - .partial_cmp(&b.bbox.y_min()) + .x_min() + .partial_cmp(&b.bbox.x_min()) .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| { + a.width() + .partial_cmp(&b.width()) + .unwrap_or(std::cmp::Ordering::Equal) + }) }); - for block in other_unsorted_blocks { - weighted_distance_insert(block, &mut sorted_blocks, SortDirection::Horizontal); + + let mask_labels = [ + OrderLabel::DocTitle, + OrderLabel::CrossLayout, + OrderLabel::CrossReference, + ]; + + let n = blocks.len(); + + // We need to work with indices to avoid borrow checker issues + // Collect block data we need for comparisons + let block_data: Vec<(BoundingBox, OrderLabel, f32, f32)> = blocks + .iter() + .map(|b| { + ( + b.bbox.clone(), + b.order_label, + b.area(), + b.long_side_length(), + ) + }) + .collect(); + + let text_line_heights: Vec = blocks.iter().map(|b| b.text_line_height).collect(); + + for block_idx in 0..n { + if mask_labels.contains(&block_data[block_idx].1) { + continue; + } + + let mut mark_block_cross = false; + + for ref_idx in 0..n { + if block_idx == ref_idx || mask_labels.contains(&block_data[ref_idx].1) { + continue; + } + // Skip already-marked blocks + if blocks[ref_idx].order_label == OrderLabel::CrossLayout { + continue; + } + if blocks[block_idx].order_label == OrderLabel::CrossLayout { + break; + } + + let bbox_overlap = + calculate_overlap_ratio(&block_data[block_idx].0, &block_data[ref_idx].0); + + if bbox_overlap > 0.0 { + if block_data[ref_idx].1 == OrderLabel::Vision { + blocks[ref_idx].order_label = OrderLabel::CrossLayout; + continue; + } + if bbox_overlap > 0.1 && block_data[block_idx].2 < block_data[ref_idx].2 { + mark_block_cross = true; + break; + } + } + + // Check projection overlap in primary direction (horizontal) + let match_proj = calculate_projection_overlap_ratio( + &block_data[block_idx].0, + &block_data[ref_idx].0, + 
SortDirection::Horizontal, + ); + + if match_proj > 0.0 { + for second_ref_idx in 0..n { + if second_ref_idx == block_idx + || second_ref_idx == ref_idx + || mask_labels.contains(&block_data[second_ref_idx].1) + { + continue; + } + if blocks[second_ref_idx].order_label == OrderLabel::CrossLayout { + continue; + } + + let bbox_overlap2 = calculate_overlap_ratio( + &block_data[block_idx].0, + &block_data[second_ref_idx].0, + ); + + if bbox_overlap2 > 0.1 { + if block_data[second_ref_idx].1 == OrderLabel::Vision { + blocks[second_ref_idx].order_label = OrderLabel::CrossLayout; + continue; + } + if block_data[block_idx].1 == OrderLabel::Vision + || block_data[block_idx].2 < block_data[second_ref_idx].2 + { + mark_block_cross = true; + break; + } + } + + let second_match_proj = calculate_projection_overlap_ratio( + &block_data[block_idx].0, + &block_data[second_ref_idx].0, + SortDirection::Horizontal, + ); + let ref_match_proj = calculate_projection_overlap_ratio( + &block_data[ref_idx].0, + &block_data[second_ref_idx].0, + SortDirection::Horizontal, + ); + let secondary_ref_match = calculate_projection_overlap_ratio( + &block_data[ref_idx].0, + &block_data[second_ref_idx].0, + SortDirection::Vertical, + ); + + if second_match_proj > 0.0 && ref_match_proj == 0.0 && secondary_ref_match > 0.0 + { + if block_data[block_idx].1 == OrderLabel::Vision { + mark_block_cross = true; + break; + } + // Both ref blocks are normal text with sufficient width + if block_data[ref_idx].1 == OrderLabel::NormalText + && block_data[second_ref_idx].1 == OrderLabel::NormalText + && block_data[ref_idx].3 + > text_line_heights[ref_idx] + * CROSS_LAYOUT_REF_TEXT_BLOCK_WORDS_NUM_THRESHOLD + && block_data[second_ref_idx].3 + > text_line_heights[second_ref_idx] + * CROSS_LAYOUT_REF_TEXT_BLOCK_WORDS_NUM_THRESHOLD + { + mark_block_cross = true; + break; + } + } + } + + if mark_block_cross { + break; + } + } + } + + if mark_block_cross { + if block_data[block_idx].1 == OrderLabel::Reference { + 
blocks[block_idx].order_label = OrderLabel::CrossReference; + } else { + blocks[block_idx].order_label = OrderLabel::CrossLayout; + } + } } +} - sorted_blocks +/// Calculate discontinuous projection intervals along a direction. +/// +/// Returns merged intervals where boxes project onto the axis. +/// Single interval = single column; multiple = multi-column. +fn calculate_discontinuous_projection( + bboxes: &[BoundingBox], + direction: SortDirection, +) -> Vec<(i32, i32)> { + if bboxes.is_empty() { + return Vec::new(); + } + + let mut intervals: Vec<(i32, i32)> = bboxes + .iter() + .map(|b| match direction { + SortDirection::Horizontal => (b.x_min() as i32, b.x_max() as i32), + SortDirection::Vertical => (b.y_min() as i32, b.y_max() as i32), + }) + .collect(); + + intervals.sort_by_key(|&(start, _)| start); + + let mut merged = Vec::new(); + let (mut current_start, mut current_end) = intervals[0]; + + for &(start, end) in &intervals[1..] { + if start <= current_end { + current_end = current_end.max(end); + } else { + merged.push((current_start, current_end)); + current_start = start; + current_end = end; + } + } + merged.push((current_start, current_end)); + + merged } -/// Inserts a block into the sorted list using weighted distance logic. +/// Shrink slightly overlapping boxes at their midpoint (PaddleX `shrink_overlapping_boxes`). /// -/// Matches `weighted_distance_insert` logic. +/// For consecutive blocks sorted by position, if they have small overlap in the +/// cut direction (0 < overlap < 10%), split at the midpoint of overlap. 
+fn shrink_overlapping_boxes(blocks: &mut [SortableBlock], direction: SortDirection) { + if blocks.len() < 2 { + return; + } + + // Sort by the end coordinate of the cut direction + match direction { + SortDirection::Vertical => { + blocks.sort_by(|a, b| { + a.bbox + .y_max() + .partial_cmp(&b.bbox.y_max()) + .unwrap_or(std::cmp::Ordering::Equal) + }); + } + SortDirection::Horizontal => { + blocks.sort_by(|a, b| { + a.bbox + .x_max() + .partial_cmp(&b.bbox.x_max()) + .unwrap_or(std::cmp::Ordering::Equal) + }); + } + } + + for i in 0..blocks.len() - 1 { + let perp_direction = match direction { + SortDirection::Vertical => SortDirection::Horizontal, + SortDirection::Horizontal => SortDirection::Vertical, + }; + + let cut_iou = + calculate_projection_overlap_ratio(&blocks[i].bbox, &blocks[i + 1].bbox, direction); + let match_iou = calculate_projection_overlap_ratio( + &blocks[i].bbox, + &blocks[i + 1].bbox, + perp_direction, + ); + + match direction { + SortDirection::Vertical => { + let y2 = blocks[i].bbox.y_max(); + let y1_prime = blocks[i + 1].bbox.y_min(); + if (match_iou > 0.0 && cut_iou > 0.0 && cut_iou < 0.1) + || y2 == y1_prime + || (y2 - y1_prime).abs() <= 3.0 + { + let overlap_y_min = blocks[i].bbox.y_min().max(blocks[i + 1].bbox.y_min()); + let overlap_y_max = blocks[i].bbox.y_max().min(blocks[i + 1].bbox.y_max()); + let split_y = ((overlap_y_min + overlap_y_max) / 2.0).floor(); + + if blocks[i].bbox.y_min() < blocks[i + 1].bbox.y_min() { + let new_bbox = BoundingBox::from_coords( + blocks[i].bbox.x_min(), + blocks[i].bbox.y_min(), + blocks[i].bbox.x_max(), + split_y - 1.0, + ); + blocks[i].bbox = new_bbox; + let new_bbox2 = BoundingBox::from_coords( + blocks[i + 1].bbox.x_min(), + split_y + 1.0, + blocks[i + 1].bbox.x_max(), + blocks[i + 1].bbox.y_max(), + ); + blocks[i + 1].bbox = new_bbox2; + } else { + let new_bbox = BoundingBox::from_coords( + blocks[i].bbox.x_min(), + split_y - 1.0, + blocks[i].bbox.x_max(), + blocks[i].bbox.y_max(), + ); + 
blocks[i].bbox = new_bbox; + let new_bbox2 = BoundingBox::from_coords( + blocks[i + 1].bbox.x_min(), + blocks[i + 1].bbox.y_min(), + blocks[i + 1].bbox.x_max(), + split_y + 1.0, + ); + blocks[i + 1].bbox = new_bbox2; + } + } + } + SortDirection::Horizontal => { + let x2 = blocks[i].bbox.x_max(); + let x1_prime = blocks[i + 1].bbox.x_min(); + if (match_iou > 0.0 && cut_iou > 0.0 && cut_iou < 0.1) + || x2 == x1_prime + || (x2 - x1_prime).abs() <= 3.0 + { + let overlap_x_min = blocks[i].bbox.x_min().max(blocks[i + 1].bbox.x_min()); + let overlap_x_max = blocks[i].bbox.x_max().min(blocks[i + 1].bbox.x_max()); + let split_x = ((overlap_x_min + overlap_x_max) / 2.0).floor(); + + if blocks[i].bbox.x_min() < blocks[i + 1].bbox.x_min() { + let new_bbox = BoundingBox::from_coords( + blocks[i].bbox.x_min(), + blocks[i].bbox.y_min(), + split_x - 1.0, + blocks[i].bbox.y_max(), + ); + blocks[i].bbox = new_bbox; + let new_bbox2 = BoundingBox::from_coords( + split_x + 1.0, + blocks[i + 1].bbox.y_min(), + blocks[i + 1].bbox.x_max(), + blocks[i + 1].bbox.y_max(), + ); + blocks[i + 1].bbox = new_bbox2; + } else { + let new_bbox = BoundingBox::from_coords( + split_x - 1.0, + blocks[i].bbox.y_min(), + blocks[i].bbox.x_max(), + blocks[i].bbox.y_max(), + ); + blocks[i].bbox = new_bbox; + let new_bbox2 = BoundingBox::from_coords( + blocks[i + 1].bbox.x_min(), + blocks[i + 1].bbox.y_min(), + split_x + 1.0, + blocks[i + 1].bbox.y_max(), + ); + blocks[i + 1].bbox = new_bbox2; + } + } + } + } + } +} + +/// Associate vision title blocks with their nearest vision parent (PaddleX `insert_child_blocks`). /// -/// # Arguments -/// * `block` - The block to insert. -/// * `sorted_blocks` - The current sorted list. -/// * `region_direction` - The direction of the region/page (usually Horizontal for standard docs). +/// Moves VisionTitle blocks adjacent to their nearest Vision block. 
+fn associate_child_blocks(sorted_blocks: &mut Vec) { + if sorted_blocks.len() < 2 { + return; + } + + // Find vision title indices that need to be moved + let mut moves: Vec<(usize, usize)> = Vec::new(); // (from_idx, target_vision_idx) + + for (i, block) in sorted_blocks.iter().enumerate() { + if block.order_label != OrderLabel::VisionTitle { + continue; + } + + // Find nearest Vision block by edge distance + let mut best_vision_idx = None; + let mut best_distance = f32::INFINITY; + + for (j, other) in sorted_blocks.iter().enumerate() { + if other.order_label != OrderLabel::Vision { + continue; + } + let dist = get_nearest_edge_distance(&block.bbox, &other.bbox, &[1.0, 1.0, 1.0, 1.0]); + if dist < best_distance { + best_distance = dist; + best_vision_idx = Some(j); + } + } + + // Only move if close enough (< 3 * text_line_height of the vision block) + if let Some(vision_idx) = best_vision_idx { + let threshold = sorted_blocks[vision_idx].text_line_height * 3.0; + if best_distance < threshold { + // Should be placed right before or after the vision block + if block.bbox.y_min() < sorted_blocks[vision_idx].bbox.y_min() { + moves.push((i, vision_idx)); // place before + } else { + moves.push((i, vision_idx + 1)); // place after + } + } + } + } + + // Apply moves (process in reverse order to maintain indices) + for (from_idx, target_idx) in moves.into_iter().rev() { + // Only move if the title is not already adjacent + if from_idx == target_idx || from_idx + 1 == target_idx { + continue; + } + let block = sorted_blocks.remove(from_idx); + let adjusted_target = if from_idx < target_idx { + target_idx - 1 + } else { + target_idx + }; + let insert_pos = adjusted_target.min(sorted_blocks.len()); + sorted_blocks.insert(insert_pos, block); + } +} + +/// Insert a block using Manhattan distance (for unordered blocks). 
+fn manhattan_insert(block: SortableBlock, sorted_blocks: &mut Vec) { + if sorted_blocks.is_empty() { + sorted_blocks.push(block); + return; + } + + let mut min_distance = f32::INFINITY; + let mut nearest_index = 0; + + for (idx, sorted_block) in sorted_blocks.iter().enumerate() { + let distance = (block.bbox.x_min() - sorted_block.bbox.x_min()).abs() + + (block.bbox.y_min() - sorted_block.bbox.y_min()).abs(); + if distance < min_distance { + min_distance = distance; + nearest_index = idx; + } + } + + sorted_blocks.insert(nearest_index + 1, block); +} + +/// Insert a block using weighted distance logic (PaddleX `weighted_distance_insert`). fn weighted_distance_insert( block: SortableBlock, sorted_blocks: &mut Vec, @@ -268,65 +741,45 @@ fn weighted_distance_insert( return; } - // XY-cut settings - let tolerance_len = 2.0; // edge_distance_compare_tolerance_len - - // Abstract handling - // We don't have "Abstract" label explicitly mapped to a unique OrderLabel in this simplified enum - // unless we map LayoutElementType::Abstract to something specific or check the original type if available. - // For now, assuming standard logic. If we had abstract, we'd multiply tolerance by 2. 
- - // Distance weights - let edge_weight = 10000.0; - let up_edge_weight = 1.0; - let left_edge_weight = 0.0001; - - let mut min_weighted_distance = f32::INFINITY; - let mut min_edge_distance = f32::INFINITY; - let mut min_up_edge_distance = f32::INFINITY; - - let mut nearest_index = 0; - - let (x1, y1, _x2, _y2) = ( + let tolerance_len = EDGE_DISTANCE_COMPARE_TOLERANCE_LEN; + let (x1, y1, x2, _y2) = ( block.bbox.x_min(), block.bbox.y_min(), block.bbox.x_max(), block.bbox.y_max(), ); + let mut min_weighted_distance = f32::INFINITY; + let mut _min_edge_distance = f32::INFINITY; + let mut min_up_edge_distance = f32::INFINITY; + let mut nearest_index = 0; + for (idx, sorted_block) in sorted_blocks.iter().enumerate() { - let (x1_prime, y1_prime, x2_prime, _y2_prime) = ( + let (x1_prime, y1_prime, x2_prime, y2_prime) = ( sorted_block.bbox.x_min(), sorted_block.bbox.y_min(), sorted_block.bbox.x_max(), sorted_block.bbox.y_max(), ); - // Calculate edge distance let weight = get_weights(&block.order_label, block.direction); - let edge_distance = get_nearest_edge_distance(&block.bbox, &sorted_block.bbox, &weight); + let raw_edge_distance = get_nearest_edge_distance(&block.bbox, &sorted_block.bbox, &weight); - // Calculate up edge distances - // For horizontal region (std doc): up is y1_prime, left is x1_prime - let (mut up_dist, mut left_dist) = if matches!(region_direction, SortDirection::Horizontal) - { - (y1_prime, x1_prime) - } else { - (-x2_prime, y1_prime) // Vertical region? (e.g. text flows horizontal? Unclear mapping, sticking to std) + // Quantize edge distance to 50px buckets to ignore minor vertical misalignments + // between columns, allowing left_dist to correctly resolve reading order. 
+ let edge_distance = (raw_edge_distance / 50.0).floor() * 50.0; + + let (mut up_dist, mut left_dist) = match region_direction { + SortDirection::Horizontal => (y1_prime, x1_prime), + SortDirection::Vertical => (-x2_prime, y1_prime), }; - // Check if block is below sorted_block - let is_below = if matches!(region_direction, SortDirection::Horizontal) { - // sorted_block.y2 < block.y1 (sorted block is strictly above block) - // y2_prime < y1 - _y2_prime < y1 - } else { - // sorted_block.x1 > block.x2 (sorted block is strictly to the right? or left?) - // x1_prime > x2 - x1_prime > _x2 + let is_below = match region_direction { + SortDirection::Horizontal => y2_prime < y1, + SortDirection::Vertical => x1_prime > x2, }; - // Logic: Flip signs if below and not a standard text block + // Flip signs for special blocks that are below let is_special = !matches!(block.order_label, OrderLabel::Unordered) || matches!( block.order_label, @@ -334,6 +787,7 @@ fn weighted_distance_insert( | OrderLabel::ParagraphTitle | OrderLabel::Vision | OrderLabel::VisionTitle + | OrderLabel::CrossLayout ); if is_special && is_below { @@ -341,68 +795,37 @@ fn weighted_distance_insert( left_dist = -left_dist; } - // Tolerance check if (min_up_edge_distance - up_dist).abs() <= tolerance_len { up_dist = min_up_edge_distance; } - // Weighted distance let weighted_dist = - edge_distance * edge_weight + up_dist * up_edge_weight + left_dist * left_edge_weight; + edge_distance * EDGE_WEIGHT + up_dist * UP_EDGE_WEIGHT + left_dist * LEFT_EDGE_WEIGHT; - // Update mins - min_edge_distance = min_edge_distance.min(edge_distance); + _min_edge_distance = _min_edge_distance.min(edge_distance); min_up_edge_distance = min_up_edge_distance.min(up_dist); if weighted_dist < min_weighted_distance { min_weighted_distance = weighted_dist; - // Determine relative order (before or after nearest) - // Python: abs(y1 // 2 - y1_prime // 2) > 0 - // We use floor() / 2 as i32 for parity let y1_i = (y1.floor() as i32) / 2; let 
y1_p_i = (y1_prime.floor() as i32) / 2; - let sorted_dist_val; - let block_dist_val; - - if (y1_i - y1_p_i).abs() > 0 { - sorted_dist_val = y1_prime; - block_dist_val = y1; + let (sorted_dist_val, block_dist_val) = if (y1_i - y1_p_i).abs() > 0 { + (y1_prime, y1) } else if matches!(region_direction, SortDirection::Horizontal) { let x1_i = (x1.floor() as i32) / 2; - let x2_i = (_x2.floor() as i32) / 2; // Warning: python uses x2 (x_max) here? - // Python: if abs(x1 // 2 - x2 // 2) > 0: - // Wait, python code used: block.bbox[0] and block.bbox[2]? - // No, `block` vs `sorted_block` context. - // Python: if abs(x1 // 2 - x2 // 2) > 0 - // x1 from block, x2 from block? No that makes no sense. - // Re-reading python carefully: - // x1, y1, x2, y2 = block.bbox - // x1_prime, y1_prime... = sorted_block.bbox - // if abs(x1 // 2 - x2 // 2) > 0: - // This checks if the BLOCK ITSELF has width > 0 in 2-pixel buckets? - // If so: - // sorted_distance = x1_prime - // block_distance = x1 - // else: - // use centroid distance - let block_width_check = (x1_i - x2_i).abs() > 0; - if block_width_check { - sorted_dist_val = x1_prime; - block_dist_val = x1; + let x2_i = (x2.floor() as i32) / 2; + if (x1_i - x2_i).abs() > 0 { + (x1_prime, x1) } else { - // Centroid distance let (cx, cy) = block.center(); let (scx, scy) = sorted_block.center(); - sorted_dist_val = scx * scx + scy * scy; - block_dist_val = cx * cx + cy * cy; + (scx * scx + scy * scy, cx * cx + cy * cy) } } else { - // Vertical direction logic ... 
omitted for brevity/standard doc focus - sorted_dist_val = x1_prime; // simplified - block_dist_val = x1; - } + (x1_prime, x1) + }; if block_dist_val > sorted_dist_val { nearest_index = idx + 1; @@ -412,7 +835,6 @@ fn weighted_distance_insert( } } - // Clamp index if nearest_index > sorted_blocks.len() { nearest_index = sorted_blocks.len(); } @@ -424,24 +846,23 @@ fn get_weights(label: &OrderLabel, direction: SortDirection) -> [f32; 4] { match label { OrderLabel::DocTitle => { if matches!(direction, SortDirection::Horizontal) { - [1.0, 0.1, 0.1, 1.0] // left, right, up, down + [1.0, 0.1, 0.1, 1.0] } else { [0.2, 0.1, 1.0, 1.0] } } - OrderLabel::ParagraphTitle | OrderLabel::Vision | OrderLabel::VisionTitle => { - [1.0, 1.0, 0.1, 1.0] // prioritize up distance - } - _ => [1.0, 1.0, 1.0, 0.1], // default (NormalText, etc.) + OrderLabel::ParagraphTitle + | OrderLabel::Vision + | OrderLabel::VisionTitle + | OrderLabel::CrossLayout => [1.0, 1.0, 0.1, 1.0], + _ => [1.0, 1.0, 1.0, 0.1], } } /// Calculate nearest edge distance between two boxes. -/// -/// Returns 0.0 if they overlap in projection (aligned). fn get_nearest_edge_distance(b1: &BoundingBox, b2: &BoundingBox, weights: &[f32; 4]) -> f32 { - let h_overlap = calculate_projection_overlap(b1, b2, SortDirection::Horizontal); - let v_overlap = calculate_projection_overlap(b1, b2, SortDirection::Vertical); + let h_overlap = calculate_projection_overlap_ratio(b1, b2, SortDirection::Horizontal); + let v_overlap = calculate_projection_overlap_ratio(b1, b2, SortDirection::Vertical); if h_overlap > 0.0 && v_overlap > 0.0 { return 0.0; @@ -475,7 +896,8 @@ fn get_nearest_edge_distance(b1: &BoundingBox, b2: &BoundingBox, weights: &[f32; min_x + min_y } -fn calculate_projection_overlap( +/// Calculate projection overlap ratio (IoU) along a single axis. 
+fn calculate_projection_overlap_ratio( b1: &BoundingBox, b2: &BoundingBox, direction: SortDirection, @@ -489,7 +911,7 @@ fn calculate_projection_overlap( let union = max1.max(max2) - min1.min(min2); if union > 0.0 { - intersection / union // IOU + intersection / union } else { 0.0 } diff --git a/oar-ocr-core/src/processors/layout_utils.rs b/oar-ocr-core/src/processors/layout_utils.rs index ab09069..585642e 100644 --- a/oar-ocr-core/src/processors/layout_utils.rs +++ b/oar-ocr-core/src/processors/layout_utils.rs @@ -280,16 +280,30 @@ pub fn reconcile_table_cells( // Assign each detected cell to the best matching structure cell for (det_idx, det_box) in det_boxes.iter().enumerate() { - let mut best_iou = 0.001f32; // Minimal threshold + let mut best_ioa = 0.001f32; // Minimal threshold let mut best_struct_idx: Option = None; + let det_area = (det_box.x_max() - det_box.x_min()) * (det_box.y_max() - det_box.y_min()); + for (struct_idx, struct_box) in structure_cells.iter().enumerate() { - // Use IoU for assignment - // Note: We could also use intersection over detection area to handle - // cases where detection is much smaller than structure cell - let iou = calculate_iou(det_box, struct_box); - if iou > best_iou { - best_iou = iou; + // Use Intersection over Area (IoA) of detection for assignment. + // This properly handles cases where the structure cell has rowspan/colspan + // and is significantly larger than the detected text bounding box. 
+ let inter_x1 = det_box.x_min().max(struct_box.x_min()); + let inter_y1 = det_box.y_min().max(struct_box.y_min()); + let inter_x2 = det_box.x_max().min(struct_box.x_max()); + let inter_y2 = det_box.y_max().min(struct_box.y_max()); + + let inter_area = (inter_x2 - inter_x1).max(0.0) * (inter_y2 - inter_y1).max(0.0); + + let ioa = if det_area > 0.0 { + inter_area / det_area + } else { + 0.0 + }; + + if ioa > best_ioa { + best_ioa = ioa; best_struct_idx = Some(struct_idx); } } @@ -626,27 +640,6 @@ fn kmeans_maxdist_init(points: &[(f32, f32)], k: usize) -> Vec<(f32, f32)> { centers } -/// Calculates Intersection over Union (IoU) between two bounding boxes. -fn calculate_iou(a: &BoundingBox, b: &BoundingBox) -> f32 { - let inter_x1 = a.x_min().max(b.x_min()); - let inter_y1 = a.y_min().max(b.y_min()); - let inter_x2 = a.x_max().min(b.x_max()); - let inter_y2 = a.y_max().min(b.y_max()); - - let inter_area = (inter_x2 - inter_x1).max(0.0) * (inter_y2 - inter_y1).max(0.0); - - let area_a = (a.x_max() - a.x_min()) * (a.y_max() - a.y_min()); - let area_b = (b.x_max() - b.x_min()) * (b.y_max() - b.y_min()); - - let union_area = area_a + area_b - inter_area; - - if union_area <= 0.0 { - 0.0 - } else { - inter_area / union_area - } -} - /// Calculates Intersection over Area (IoA) - intersection / smaller box area. 
fn calculate_ioa_smaller(a: &BoundingBox, b: &BoundingBox) -> f32 { let inter_x1 = a.x_min().max(b.x_min()); diff --git a/oar-ocr-core/src/processors/sorting.rs b/oar-ocr-core/src/processors/sorting.rs index ed08789..24802ea 100644 --- a/oar-ocr-core/src/processors/sorting.rs +++ b/oar-ocr-core/src/processors/sorting.rs @@ -216,7 +216,7 @@ pub fn sort_boxes_xycut(boxes: &[BoundingBox], direction: SortDirection) -> Vec< /// # Returns /// /// A 1D vector representing the projection histogram -fn projection_by_bboxes(boxes: &[[i32; 4]], axis: usize) -> Vec { +pub(crate) fn projection_by_bboxes(boxes: &[[i32; 4]], axis: usize) -> Vec { assert!(axis <= 1, "axis must be 0 or 1"); if boxes.is_empty() { @@ -264,7 +264,7 @@ fn projection_by_bboxes(boxes: &[[i32; 4]], axis: usize) -> Vec { /// # Returns /// /// Optional tuple of (segment_starts, segment_ends) -fn split_projection_profile( +pub(crate) fn split_projection_profile( arr_values: &[i32], min_value: i32, min_gap: i32, @@ -522,6 +522,7 @@ impl SortableRegion { } /// Calculates the IoU (Intersection over Union) between two bounding boxes. 
+#[allow(dead_code)] pub fn calculate_iou(a: &BoundingBox, b: &BoundingBox) -> f32 { let x1 = a.x_min().max(b.x_min()); let y1 = a.y_min().max(b.y_min()); diff --git a/oar-ocr-vl/src/doc_parser.rs b/oar-ocr-vl/src/doc_parser.rs index 9cdfb8f..423aa0a 100644 --- a/oar-ocr-vl/src/doc_parser.rs +++ b/oar-ocr-vl/src/doc_parser.rs @@ -26,7 +26,7 @@ use oar_ocr_core::domain::structure::{ }; use oar_ocr_core::predictors::LayoutDetectionPredictor; use oar_ocr_core::processors::BoundingBox; -use oar_ocr_core::processors::layout_sorting::sort_layout_enhanced; +use oar_ocr_core::processors::layout_sorting::{SortableElement, sort_layout_enhanced}; use oar_ocr_core::utils::BBoxCrop; use std::sync::Arc; @@ -225,9 +225,13 @@ impl<'a, B: RecognitionBackend> DocParser<'a, B> { let mut sorted_elements: Vec = if layout_result.is_reading_order_sorted { elements } else { - let sortable: Vec<(BoundingBox, LayoutElementType)> = elements + let sortable: Vec = elements .iter() - .map(|e| (e.bbox.clone(), e.element_type)) + .map(|e| SortableElement { + bbox: e.bbox.clone(), + element_type: e.element_type, + num_lines: e.num_lines, + }) .collect(); let sorted_indices = sort_layout_enhanced(&sortable, page_w, page_h); sorted_indices diff --git a/src/oarocr/ocr.rs b/src/oarocr/ocr.rs index d6cd68b..f6a28fc 100644 --- a/src/oarocr/ocr.rs +++ b/src/oarocr/ocr.rs @@ -189,6 +189,7 @@ impl OAROCRBuilder { /// /// This matches the text_type parameter: /// - "seal": Uses polygon-based sorting/cropping for seal text (circular/curved) + /// - "table": Uses table-friendly detection defaults (box_threshold=0.4) /// - Other values or None: Uses quad-based sorting (default) /// /// # Arguments @@ -253,11 +254,26 @@ impl OAROCRBuilder { // Align text detection defaults with OCR pipeline. 
// Defaults depend on text_type: // - general: limit_side_len=960, limit_type="max", thresh=0.3, box_thresh=0.6, unclip_ratio=2.0 + // - table: limit_side_len=960, limit_type="max", thresh=0.3, box_thresh=0.4, unclip_ratio=2.0 // - seal: limit_side_len=736, limit_type="min", thresh=0.2, box_thresh=0.6, unclip_ratio=0.5 let mut effective_det_cfg = self.text_detection_config.clone().unwrap_or_default(); let has_explicit_det_cfg = self.text_detection_config.is_some(); if !has_explicit_det_cfg { match self.text_type.as_deref().unwrap_or("general") { + "table" => { + effective_det_cfg.score_threshold = 0.3; + effective_det_cfg.box_threshold = 0.4; + effective_det_cfg.unclip_ratio = 2.0; + if effective_det_cfg.limit_side_len.is_none() { + effective_det_cfg.limit_side_len = Some(960); + } + if effective_det_cfg.limit_type.is_none() { + effective_det_cfg.limit_type = Some(crate::processors::LimitType::Max); + } + if effective_det_cfg.max_side_len.is_none() { + effective_det_cfg.max_side_len = Some(4000); + } + } "seal" => { effective_det_cfg.score_threshold = 0.2; effective_det_cfg.box_threshold = 0.6; @@ -762,6 +778,7 @@ impl OAROCR { confidence: Some(score), orientation_angle: region.line_orientation_angle, word_boxes, + label: None, }); } } diff --git a/src/oarocr/stitching.rs b/src/oarocr/stitching.rs index 31f501e..20e10cd 100644 --- a/src/oarocr/stitching.rs +++ b/src/oarocr/stitching.rs @@ -16,7 +16,7 @@ use oar_ocr_core::domain::structure::{ FormulaResult, LayoutElement, LayoutElementType, StructureResult, TableCell, TableResult, }; use oar_ocr_core::processors::{ - BoundingBox, SplitConfig as OcrSplitConfig, create_expanded_ocr_for_table, + BoundingBox, SplitConfig as OcrSplitConfig, create_expanded_ocr_for_table, parse_cell_grid_info, }; use std::cmp::Ordering; @@ -32,11 +32,17 @@ enum OcrSource { /// Labels that should be excluded from OCR text matching. /// These regions have their own specialized content (LaTeX, HTML, etc.) 
-const EXCLUDED_FROM_OCR_LABELS: [LayoutElementType; 4] = [ - LayoutElementType::Formula, - LayoutElementType::FormulaNumber, +/// Labels excluded from OCR text matching in `stitch_layout_elements`. +/// PaddleX: formula results are injected into the OCR pool (via +/// `convert_formula_res_to_ocr_format`), so formula blocks participate +/// in normal OCR matching — only Table and Seal are excluded. +/// +/// NOTE: After inline formula injection, formula elements have been absorbed +/// into text regions and should be excluded from stitching to prevent duplication. +const EXCLUDED_FROM_OCR_LABELS: [LayoutElementType; 3] = [ LayoutElementType::Table, LayoutElementType::Seal, + LayoutElementType::Formula, // Exclude formulas to prevent duplicate rendering after injection ]; #[derive(Clone)] @@ -46,7 +52,10 @@ pub struct StitchConfig { pub require_text_center_inside_cell: bool, pub cell_merge_min_iou: f32, pub formula_to_cell_min_iou: f32, + /// Fallback pixel tolerance for line grouping. pub same_line_y_tolerance: f32, + /// Minimum vertical overlap ratio (intersection / min(line_height)) to treat two spans as one line. + pub line_height_iou_threshold: f32, /// Whether to enable cross-cell OCR box splitting. /// When enabled, OCR boxes that span multiple table cells will be split /// at cell boundaries and their text distributed proportionally. 
@@ -62,6 +71,7 @@ impl Default for StitchConfig { cell_merge_min_iou: 0.3, formula_to_cell_min_iou: 0.01, same_line_y_tolerance: 10.0, + line_height_iou_threshold: 0.6, enable_cross_cell_split: true, } } @@ -88,8 +98,8 @@ impl ResultStitcher { // Track which regions have been used let mut used_region_indices = std::collections::HashSet::new(); - // Get text regions (clone to avoid borrow issues) - let regions = result.text_regions.clone().unwrap_or_default(); + // Get text regions (clone to avoid borrow issues, make mutable for injection) + let mut regions = result.text_regions.clone().unwrap_or_default(); tracing::debug!("Stitching: {} text regions", regions.len()); @@ -110,7 +120,20 @@ impl ResultStitcher { used_region_indices.len() ); + // 1.5. Fill formula elements with LaTeX content FIRST + // This must happen before inject_inline_formulas so formulas have text content + Self::fill_formula_elements(&mut result.layout_elements, &result.formulas, cfg); + + // 1.6. Inject inline formulas into text regions + // PaddleX: Small formula elements that overlap with text elements should be + // absorbed into the text flow, not kept as separate layout elements. + // This creates TextRegion entries with label="formula" that will be wrapped + // with $...$ delimiters during text joining. + Self::inject_inline_formulas(&mut result.layout_elements, &mut regions, cfg); + // 2. Stitch text into layout elements (excluding special types) + // Note: after inject_inline_formulas, some formula elements have had their text cleared + // These won't be rendered separately in to_markdown Self::stitch_layout_elements( &mut result.layout_elements, ®ions, @@ -123,21 +146,18 @@ impl ResultStitcher { used_region_indices.len() ); - // 3. 
Fill formula elements with LaTeX content - Self::fill_formula_content(&mut result.layout_elements, &result.formulas); + // Note: fill_formula_elements was already called before inject_inline_formulas + // Do NOT call it again here, as it would re-fill formulas that were injected and cleared - // 4. Mark text regions that overlap with excluded element types (Formula, Seal) - // as used to prevent them from becoming orphans. - // - Formulas: content comes from LaTeX recognition, OCR is redundant/noise. + // 3. Mark text regions that overlap with Seal elements as used + // to prevent them from becoming orphans. // - Seals: content comes from specialized seal OCR. // - Tables: content comes from OCR stitching. We do NOT suppress tables here because // text inside a table that wasn't assigned to a cell (in step 1) should be preserved // as an orphan (e.g. caption, header, or matching failure). + // - Formulas: now handled through normal OCR matching (step 2), already marked used. for element in &result.layout_elements { - if matches!( - element.element_type, - LayoutElementType::Formula | LayoutElementType::Seal - ) { + if element.element_type == LayoutElementType::Seal { for (idx, region) in regions.iter().enumerate() { if Self::is_overlapping(&element.bbox, ®ion.bounding_box, cfg) { used_region_indices.insert(idx); @@ -157,6 +177,53 @@ impl ResultStitcher { .map(|e| &e.bbox) .collect(); + let image_chart_bboxes: Vec<&BoundingBox> = result + .layout_elements + .iter() + .filter(|e| { + matches!( + e.element_type, + LayoutElementType::Image | LayoutElementType::Chart + ) + }) + .map(|e| &e.bbox) + .collect(); + + // Collect figure/chart caption bboxes to infer undetected figure regions. + // When the layout model detects a caption (e.g. "Figure 3...") but misses + // the figure image itself, OCR text from the figure diagram becomes orphans. + // We infer the figure area as the region above each caption within its x-range. 
+ let figure_caption_bboxes: Vec<&BoundingBox> = result + .layout_elements + .iter() + .filter(|e| { + matches!( + e.element_type, + LayoutElementType::FigureTitle + | LayoutElementType::ChartTitle + | LayoutElementType::FigureTableChartTitle + ) + }) + .map(|e| &e.bbox) + .collect(); + + // Collect text/title element bboxes to check if an orphan is already + // covered by a known content element (avoid filtering legitimate text) + let content_element_bboxes: Vec<&BoundingBox> = result + .layout_elements + .iter() + .filter(|e| { + matches!( + e.element_type, + LayoutElementType::Text + | LayoutElementType::DocTitle + | LayoutElementType::ParagraphTitle + | LayoutElementType::Abstract + ) + }) + .map(|e| &e.bbox) + .collect(); + let original_element_count = result.layout_elements.len(); let mut new_elements = Vec::new(); for (idx, region) in regions.iter().enumerate() { @@ -174,11 +241,53 @@ impl ResultStitcher { continue; } + // Filter out text inside Image/Chart regions + let overlaps_image_chart = image_chart_bboxes + .iter() + .any(|bbox| region.bounding_box.ioa(bbox) > 0.5); + + if overlaps_image_chart { + continue; + } + + // Filter out text in inferred figure regions (above figure/chart captions). + // When the layout model detects a caption but not the figure itself, + // OCR'd annotations from the figure diagram leak as orphan text. + // Check: orphan is above a caption, within its x-range, and not inside + // any existing text/title element. 
+ let in_inferred_figure_region = figure_caption_bboxes.iter().any(|cap| { + let orphan_bb = ®ion.bounding_box; + // Orphan must be above or overlapping with the caption's top + let above_caption = orphan_bb.y_max() < cap.y_max(); + // Orphan must be within the caption's horizontal range (with margin) + let x_margin = (cap.x_max() - cap.x_min()) * 0.1; + let in_x_range = orphan_bb.x_min() >= (cap.x_min() - x_margin) + && orphan_bb.x_max() <= (cap.x_max() + x_margin); + above_caption && in_x_range + }); + + if in_inferred_figure_region { + // Verify the orphan is NOT inside any existing text/title element + let inside_content_element = content_element_bboxes + .iter() + .any(|bbox| region.bounding_box.ioa(bbox) > 0.5); + if !inside_content_element { + continue; + } + } + + // Check if this orphan region is a formula // Create a new layout element for this orphan text - // We treat it as a generic "text" element + // If it's a formula (label="formula"), create a Formula element, otherwise Text + let element_type = if region.is_formula() { + LayoutElementType::Formula + } else { + LayoutElementType::Text + }; + let element = LayoutElement::new( region.bounding_box.clone(), - LayoutElementType::Text, + element_type, region.confidence.unwrap_or(0.0), ) .with_text(text.as_ref().to_string()); @@ -236,37 +345,23 @@ impl ResultStitcher { // by XY-cut with region hierarchy in structure.rs - do NOT re-sort here. // Only sort when region_blocks is NOT present. 
if result.region_blocks.is_none() { - Self::sort_layout_elements(&mut result.layout_elements, width, cfg); + let height = if let Some(img) = &result.rectified_img { + img.height() as f32 + } else { + result + .layout_elements + .iter() + .map(|e| e.bbox.y_max()) + .fold(0.0f32, f32::max) + .max(1000.0) + }; + Self::sort_layout_elements_enhanced(&mut result.layout_elements, width, height); } // Assign order indices regardless of sorting Self::assign_order_indices(&mut result.layout_elements); } - /// Fills formula layout elements with their corresponding LaTeX content. - /// - /// Matches formula results to layout elements by bounding box overlap (IOU > 0.5). - fn fill_formula_content(elements: &mut [LayoutElement], formulas: &[FormulaResult]) { - for element in elements.iter_mut() { - if element.element_type.is_formula() { - // Find the best matching formula result by IOU - if let Some(formula) = formulas - .iter() - .filter(|f| element.bbox.iou(&f.bbox) > 0.5) - .max_by(|a, b| { - element - .bbox - .iou(&a.bbox) - .partial_cmp(&element.bbox.iou(&b.bbox)) - .unwrap_or(Ordering::Equal) - }) - { - element.text = Some(formula.latex.clone()); - } - } - } - } - /// Assigns reading order indices to layout elements. /// /// Only elements that should be included in reading order get an index. @@ -320,7 +415,10 @@ impl ResultStitcher { if table.cells.is_empty() { continue; } - let e2e_like_cells = table.cells.iter().all(|cell| cell.confidence >= 0.999); + // Use the explicit is_e2e flag from the table analyzer to determine + // the matching strategy, instead of inferring from confidence values. + let has_detected_cells = table.detected_cell_bboxes.is_some(); + let e2e_like_cells = table.is_e2e && !has_detected_cells; // 1. 
Filter relevant text regions (those overlapping the table area) let table_bbox = table.bbox.clone(); // Use table bbox @@ -387,12 +485,36 @@ impl ResultStitcher { } } + // PaddleX: inject formula results into table OCR candidate pool with $...$ + // wrapping (table_contents_for_img). This lets formulas participate in normal + // cell matching, so formula content appears in the correct table cells. + for formula in formulas { + let w = formula.bbox.x_max() - formula.bbox.x_min(); + let h = formula.bbox.y_max() - formula.bbox.y_min(); + if w <= 1.0 || h <= 1.0 { + continue; + } + if !Self::is_overlapping(&table_bbox, &formula.bbox, cfg) { + continue; + } + let latex = &formula.latex; + let formatted = if latex.starts_with('$') && latex.ends_with('$') { + latex.clone() + } else { + format!("${}$", latex) + }; + let mut formula_region = TextRegion::new(formula.bbox.clone()); + formula_region.text = Some(formatted.into()); + formula_region.confidence = Some(1.0); + ocr_candidates.push((OcrSource::Split, formula_region)); + } + let structure_tokens = table.structure_tokens.clone(); // Prefer PaddleX-style row-aware matching when structure tokens are available. + // Use row-aware matching when cell detection was used (non-E2E mode). let mut td_to_cell_mapping: Option>> = None; - let has_detection_like_cells = table.cells.iter().any(|cell| cell.confidence < 0.999); - if has_detection_like_cells + if !e2e_like_cells && let Some(tokens) = structure_tokens.as_deref() && !ocr_candidates.is_empty() && let Some((mapping, matched_candidate_indices)) = @@ -401,6 +523,7 @@ impl ResultStitcher { tokens, &ocr_candidates, cfg.same_line_y_tolerance, + table.detected_cell_bboxes.as_deref(), ) { td_to_cell_mapping = Some(mapping); @@ -475,13 +598,19 @@ impl ResultStitcher { } } - // Attach formulas after text matching so formula tokens become part of final cell text. 
- Self::attach_formulas_to_cells(table, formulas, cfg); + // Formulas are now injected into the OCR candidate pool above, + // so they participate in normal cell matching — no separate attach step needed. + + // Optional postprocess for checkbox-style tables: + // normalize common OCR confusions like ü/L/X into ✓/✗ when the table + // clearly exhibits both positive and negative marker patterns. + Self::normalize_checkbox_symbols_in_table(&mut table.cells); // Regenerate HTML from structure tokens and stitched cell text. if let Some(tokens) = structure_tokens.as_deref() { let cell_texts: Vec> = if let Some(ref td_mapping) = td_to_cell_mapping { + // Use the mapping from row-aware matching td_mapping .iter() .map(|cell_idx| { @@ -491,7 +620,10 @@ impl ResultStitcher { }) .collect() } else { - table.cells.iter().map(|c| c.text.clone()).collect() + // Fallback: cells may not be in the same order as structure_tokens. + // We need to create a mapping from cell bbox to its index, then + // iterate through tokens to collect texts in the correct order. + Self::collect_cell_texts_for_tokens(&table.cells, tokens) }; let html_structure = @@ -527,6 +659,29 @@ impl ResultStitcher { } for (candidate_idx, (_, region)) in ocr_candidates.iter().enumerate() { + let ocr_bbox = ®ion.bounding_box; + + // Strategy 1: Center-point-in-cell with high IoA (strongest signal). + // If the OCR box center falls inside a cell AND the box has high overlap + // with that cell (IoA > 0.7), assign directly. The IoA check avoids + // misassignment for boxes that straddle cell boundaries. 
+ let ocr_cx = (ocr_bbox.x_min() + ocr_bbox.x_max()) / 2.0; + let ocr_cy = (ocr_bbox.y_min() + ocr_bbox.y_max()) / 2.0; + let center_cell = cells.iter().enumerate().find(|(_, cell)| { + ocr_cx >= cell.bbox.x_min() + && ocr_cx <= cell.bbox.x_max() + && ocr_cy >= cell.bbox.y_min() + && ocr_cy <= cell.bbox.y_max() + && ocr_bbox.ioa(&cell.bbox) > 0.7 + }); + + if let Some((cell_idx, _)) = center_cell { + cell_to_ocr.entry(cell_idx).or_default().push(candidate_idx); + matched_candidate_indices.insert(candidate_idx); + continue; + } + + // Strategy 2+3: IoU + distance fallback let mut best_cell_idx: Option = None; let mut min_cost = (f32::MAX, f32::MAX); let mut candidate_costs: Vec<(usize, (f32, f32))> = Vec::new(); @@ -706,6 +861,51 @@ impl ResultStitcher { } } + fn normalize_checkbox_symbols_in_table(cells: &mut [TableCell]) { + let mut has_positive_candidate = false; + let mut has_negative_candidate = false; + + for cell in cells.iter() { + let Some(text) = cell.text.as_deref() else { + continue; + }; + let trimmed = text.trim(); + if trimmed.chars().count() != 1 { + continue; + } + match trimmed.chars().next().unwrap_or_default() { + '✓' | 'ü' | 'Ü' | 'L' | '√' | '☑' => has_positive_candidate = true, + '✗' | 'X' | 'x' | '✕' | '✖' | '☒' => has_negative_candidate = true, + _ => {} + } + } + + for cell in cells.iter_mut() { + let Some(text) = cell.text.clone() else { + continue; + }; + let trimmed = text.trim(); + if trimmed.chars().count() != 1 { + continue; + } + let mapped = match trimmed.chars().next().unwrap_or_default() { + // Safe positive normalization. + 'ü' | 'Ü' | '√' | '☑' => Some("✓"), + // Ambiguous L is normalized only when the table appears checkbox-like. + 'L' if has_positive_candidate && has_negative_candidate => Some("✓"), + // Safe negative normalization. + '✕' | '✖' | '☒' => Some("✗"), + // Ambiguous X/x are normalized only when the table appears checkbox-like. 
+ 'X' | 'x' if has_positive_candidate && has_negative_candidate => Some("✗"), + _ => None, + }; + + if let Some(symbol) = mapped { + cell.text = Some(symbol.to_string()); + } + } + } + /// PaddleX-style text concatenation for one cell. fn join_ocr_texts_paddlex_style( candidate_indices: &[usize], @@ -758,14 +958,27 @@ impl ResultStitcher { structure_tokens: &[String], ocr_candidates: &[(OcrSource, TextRegion)], row_y_tolerance: f32, + cell_bboxes_override: Option<&[BoundingBox]>, ) -> Option<(Vec>, std::collections::HashSet)> { if cells.is_empty() || structure_tokens.is_empty() || ocr_candidates.is_empty() { return None; } - let (sorted_cell_indices, table_cells_flag) = - Self::sort_table_cells_boxes(cells, row_y_tolerance); - if sorted_cell_indices.is_empty() || table_cells_flag.is_empty() { + // --- Sort cells into rows --- + // When detected bboxes are available, sort them (better spatial accuracy) + // for the IoA matching loop. + let (match_sorted_indices, match_row_flags) = if let Some(det_bboxes) = cell_bboxes_override + { + let temp_cells: Vec = det_bboxes + .iter() + .map(|b| TableCell::new(b.clone(), 0.5)) + .collect(); + Self::sort_table_cells_boxes(&temp_cells, row_y_tolerance) + } else { + Self::sort_table_cells_boxes(cells, row_y_tolerance) + }; + + if match_sorted_indices.is_empty() || match_row_flags.is_empty() { return None; } @@ -774,23 +987,42 @@ impl ResultStitcher { return None; } - let mut aligned_row_flags = Self::map_and_get_max(&table_cells_flag, &row_start_index); - aligned_row_flags.push(sorted_cell_indices.len()); - row_start_index.push(sorted_cell_indices.len()); + // Align match row flags with structure token row boundaries + let mut match_aligned = Self::map_and_get_max(&match_row_flags, &row_start_index); + match_aligned.push(match_sorted_indices.len()); + row_start_index.push( + structure_tokens + .iter() + .filter(|t| Self::is_td_end_token(t)) + .count(), + ); + // --- Per-row matching: cell → OCR (PaddleX style) --- + // For each 
cell in the row, collect ALL OCR boxes with IoA > 0.7. + // No cross-row deduplication — each row independently checks all OCR boxes, + // matching PaddleX v2 behavior. The 0.7 IoA threshold naturally prevents + // false cross-row matches. let mut all_matched: Vec>> = Vec::new(); - for k in 0..aligned_row_flags.len().saturating_sub(1) { - let row_start = aligned_row_flags[k].min(sorted_cell_indices.len()); - let row_end = aligned_row_flags[k + 1].min(sorted_cell_indices.len()); + for k in 0..match_aligned.len().saturating_sub(1) { + let row_start = match_aligned[k].min(match_sorted_indices.len()); + let row_end = match_aligned[k + 1].min(match_sorted_indices.len()); + let mut matched: std::collections::HashMap> = std::collections::HashMap::new(); - for (local_idx, sorted_pos) in (row_start..row_end).enumerate() { - let cell_idx = sorted_cell_indices[sorted_pos]; - let cell_box = &cells[cell_idx].bbox; + for (local_idx, &bbox_idx) in + match_sorted_indices[row_start..row_end].iter().enumerate() + { + // Use detected bbox directly when available, else structure cell bbox + let cell_box = cell_bboxes_override + .and_then(|bbs| bbs.get(bbox_idx)) + .unwrap_or_else(|| &cells[bbox_idx.min(cells.len() - 1)].bbox); + for (ocr_idx, (_, ocr_region)) in ocr_candidates.iter().enumerate() { - if Self::compute_inter(cell_box, &ocr_region.bounding_box) > 0.7 { + // IoA = intersection / OCR_area (PaddleX compute_inter > 0.7) + let ioa = ocr_region.bounding_box.ioa(cell_box); + if ioa > 0.7 { matched.entry(local_idx).or_default().push(ocr_idx); } } @@ -799,6 +1031,8 @@ impl ResultStitcher { all_matched.push(matched); } + // --- Build td_to_cell_mapping by iterating structure tokens --- + // table.cells maps exactly 1:1 with td tokens in structure order. 
let mut td_to_cell_mapping: Vec> = Vec::new(); let mut matched_candidate_indices: std::collections::HashSet = std::collections::HashSet::new(); @@ -808,6 +1042,10 @@ impl ResultStitcher { let mut matched_row_idx = 0usize; for tag in structure_tokens { + if tag == "
" { + td_index = 0; // Reset cell index at row start + continue; + } if !Self::is_td_end_token(tag) { continue; } @@ -821,14 +1059,17 @@ impl ResultStitcher { matched_candidate_indices.extend(indices.iter().copied()); } - let mapped_cell_idx = - aligned_row_flags - .get(matched_row_idx) - .copied() - .and_then(|row_start| { - let sorted_pos = row_start + td_index; - sorted_cell_indices.get(sorted_pos).copied() - }); + // Map td position to the original cell index via sorted ordering. + // match_aligned[matched_row_idx] + td_index gives the position in the + // sorted cell list, and match_sorted_indices maps that back to cells[]. + let mapped_cell_idx = match_aligned + .get(matched_row_idx) + .copied() + .and_then(|row_start| { + let sorted_pos = row_start + td_index; + match_sorted_indices.get(sorted_pos).copied() + }) + .filter(|&idx| idx < cells.len()); td_to_cell_mapping.push(mapped_cell_idx); @@ -852,7 +1093,6 @@ impl ResultStitcher { && td_count >= row_start_index[matched_row_idx + 1] { matched_row_idx += 1; - td_index = 0; } } @@ -863,6 +1103,54 @@ impl ResultStitcher { } } + /// Collects cell texts in the order they appear in structure tokens. + /// + /// Uses grid-based `(row, col)` matching when cells have grid info, which + /// correctly handles rowspan/colspan cases where cells.len() != td_count. + /// Falls back to index-based matching when grid info is unavailable. + fn collect_cell_texts_for_tokens( + cells: &[TableCell], + tokens: &[String], + ) -> Vec> { + if cells.is_empty() { + return Vec::new(); + } + + // Parse grid positions for each
token + let token_grid = parse_cell_grid_info(tokens); + let td_count = token_grid.len(); + + // Build a lookup from (row, col) -> cell index for cells that have grid info + let mut grid_to_cell: std::collections::HashMap<(usize, usize), usize> = + std::collections::HashMap::new(); + let mut has_grid_info = false; + + for (cell_idx, cell) in cells.iter().enumerate() { + if let (Some(row), Some(col)) = (cell.row, cell.col) { + grid_to_cell.insert((row, col), cell_idx); + has_grid_info = true; + } + } + + if has_grid_info { + // Grid-based matching: match tokens to cells by (row, col) position + token_grid + .iter() + .map(|gi| { + grid_to_cell + .get(&(gi.row, gi.col)) + .and_then(|&idx| cells.get(idx)) + .and_then(|cell| cell.text.clone()) + }) + .collect() + } else { + // Fallback: cells don't have grid info, use index-based matching + (0..td_count) + .map(|i| cells.get(i).and_then(|cell| cell.text.clone())) + .collect() + } + } + /// Sort table cells row-by-row (top-to-bottom, left-to-right) and return row flags. /// /// Returns `(sorted_indices, flags)` where `flags` contains cumulative row starts. @@ -1128,75 +1416,6 @@ impl ResultStitcher { (split_regions, split_ocr_indices, cell_assignments) } - /// Attaches recognized formulas to the best-matching table cells. - /// - /// This mirrors behavior where formula recognition results are merged into the - /// OCR content used for table structure recognition. 
Here we approximate that behavior by: - /// - For each formula, finding the cell with maximum IoU - /// - If IoU exceeds a small threshold, appending `$latex$` to that cell's text - fn attach_formulas_to_cells( - table: &mut TableResult, - formulas: &[FormulaResult], - cfg: &StitchConfig, - ) { - if formulas.is_empty() || table.cells.is_empty() { - return; - } - - for formula in formulas { - let bbox = &formula.bbox; - - // Skip degenerate boxes - let w = bbox.x_max() - bbox.x_min(); - let h = bbox.y_max() - bbox.y_min(); - if w <= 1.0 || h <= 1.0 { - continue; - } - - // Only consider formulas that overlap the table bbox at all - if !Self::is_overlapping(&table.bbox, bbox, cfg) { - continue; - } - - // Find best-matching cell by IoU - let mut best_cell_idx: Option = None; - let mut best_iou = 0.0f32; - - for (cell_idx, cell) in table.cells.iter().enumerate() { - let iou = Self::calculate_iou(&cell.bbox, bbox); - if iou > best_iou { - best_iou = iou; - best_cell_idx = Some(cell_idx); - } - } - - if let Some(cell_idx) = best_cell_idx - && best_iou > cfg.formula_to_cell_min_iou - { - let cell = &mut table.cells[cell_idx]; - - // Append formula as LaTeX wrapped in $...$ - let formatted = if formula.latex.starts_with('$') && formula.latex.ends_with('$') { - formula.latex.clone() - } else { - format!("${}$", formula.latex) - }; - - match &mut cell.text { - Some(existing) => { - if !existing.is_empty() { - existing.push(' '); - } - existing.push_str(&formatted); - } - None => { - cell.text = Some(formatted); - } - } - } - } - } - /// Calculates the Intersection over Union (IoU) between two bounding boxes. fn calculate_iou(bbox1: &BoundingBox, bbox2: &BoundingBox) -> f32 { let x1_min = bbox1.x_min(); @@ -1257,6 +1476,81 @@ impl ResultStitcher { dis + dis_2.min(dis_3) } + /// Marks small inline formulas to be absorbed into the text flow. + /// + /// PaddleX: Small formula elements should be absorbed into the text flow, + /// not kept as separate layout elements. 
+ /// + /// This function: + /// 1. Finds small formula elements that should be inline (not display formulas) + /// 2. Clears their text and order_index so the formula element won't be rendered + /// 3. The corresponding TextRegion with label="formula" (already created in structure.rs) + /// will become an orphan and be handled with proper $...$ wrapping + fn inject_inline_formulas( + elements: &mut [LayoutElement], + _text_regions: &mut Vec, + _cfg: &StitchConfig, + ) { + use oar_ocr_core::domain::structure::LayoutElementType; + + let mut inline_formula_indices: Vec = Vec::new(); + + // Size threshold: formulas smaller than 80k pixels² are likely inline + const INLINE_FORMULA_MAX_AREA: f32 = 80000.0; + + for (idx, element) in elements.iter().enumerate() { + if element.element_type != LayoutElementType::Formula { + continue; + } + + // Only process formulas that have text + let formula_text = if let Some(text) = &element.text { + if !text.is_empty() { + text + } else { + continue; + } + } else { + continue; + }; + + let formula_area = element.bbox.area(); + tracing::debug!( + "Formula idx {}: area={:.1}, text={}", + idx, + formula_area, + formula_text + ); + + // Small formulas are treated as inline + if formula_area < INLINE_FORMULA_MAX_AREA { + inline_formula_indices.push(idx); + tracing::debug!( + "Marking formula idx {} as inline (area {:.1} < {})", + idx, + formula_area, + INLINE_FORMULA_MAX_AREA + ); + } + } + + // Clear inline formula elements so they won't be rendered separately + for idx in &inline_formula_indices { + if let Some(element) = elements.get_mut(*idx) { + tracing::debug!( + "Clearing inline formula idx {} to use TextRegion with label=formula", + idx + ); + element.text = None; + element.order_index = None; + } + } + + if !inline_formula_indices.is_empty() { + tracing::debug!("Marked {} formulas as inline", inline_formula_indices.len()); + } + } + fn stitch_layout_elements( elements: &mut [LayoutElement], text_regions: &[TextRegion], @@ -1301,6 
+1595,73 @@ impl ResultStitcher { element.element_type, element_texts.len() ); + + // Debug: log all text regions being joined + for (region, text) in &element_texts { + tracing::debug!(" - region with label={:?}, text={:?}", region.label, text); + } + + // Compute seg metadata (seg_start_x, seg_end_x, num_lines) for get_seg_flag. + // Sort a copy to find first/last spans and count lines. + let mut sorted_for_meta = element_texts.clone(); + sorted_for_meta.sort_by(|(r1, _), (r2, _)| { + r1.bounding_box + .center() + .y + .partial_cmp(&r2.bounding_box.center().y) + .unwrap_or(Ordering::Equal) + }); + let mut lines = Vec::new(); + let mut current_line = Vec::new(); + for item in std::mem::take(&mut sorted_for_meta) { + if current_line.is_empty() { + current_line.push(item); + } else { + let first_in_line = ¤t_line[0].0.bounding_box; + if Self::is_same_text_line_bbox(first_in_line, &item.0.bounding_box, cfg) { + current_line.push(item); + } else { + current_line.sort_by(|(r1, _), (r2, _)| { + r1.bounding_box + .center() + .x + .partial_cmp(&r2.bounding_box.center().x) + .unwrap_or(Ordering::Equal) + }); + lines.push(current_line); + current_line = vec![item]; + } + } + } + if !current_line.is_empty() { + current_line.sort_by(|(r1, _), (r2, _)| { + r1.bounding_box + .center() + .x + .partial_cmp(&r2.bounding_box.center().x) + .unwrap_or(Ordering::Equal) + }); + lines.push(current_line); + } + for mut line in lines { + sorted_for_meta.append(&mut line); + } + + // seg_start_x: first span's left edge (PaddleX: line[0].spans[0].box[0]) + element.seg_start_x = Some(sorted_for_meta[0].0.bounding_box.x_min()); + // seg_end_x: last span's right edge (PaddleX: line[-1].spans[-1].box[2]) + element.seg_end_x = Some(sorted_for_meta.last().unwrap().0.bounding_box.x_max()); + + // Count distinct lines (Y-groups) + let mut num_lines = 1u32; + let mut prev_bbox = &sorted_for_meta[0].0.bounding_box; + for (region, _) in &sorted_for_meta[1..] 
{ + if !Self::is_same_text_line_bbox(prev_bbox, ®ion.bounding_box, cfg) { + num_lines += 1; + prev_bbox = ®ion.bounding_box; + } + } + element.num_lines = Some(num_lines); } Self::sort_and_join_texts(&mut element_texts, Some(&element.bbox), cfg, |joined| { @@ -1309,6 +1670,79 @@ impl ResultStitcher { } } + /// Fills formula layout elements with LaTeX content from formula recognition results. + /// + /// This ensures formula elements have correct content even if OCR matching + /// thresholds prevented proper association. + fn fill_formula_elements( + elements: &mut [LayoutElement], + formulas: &[FormulaResult], + _cfg: &StitchConfig, + ) { + for element in elements.iter_mut() { + if element.element_type != LayoutElementType::Formula { + continue; + } + + // Skip if element already has content from OCR matching + if element.text.is_some() { + continue; + } + + // Find the best matching formula result by bidirectional IoA. + // IoA (intersection / self_area) is much more permissive than IoU for + // size-mismatched bboxes. PaddleX uses simple intersection overlap (>3px). + let mut best_formula: Option<&FormulaResult> = None; + let mut best_score = 0.0f32; + + for formula in formulas { + let ioa_element = element.bbox.ioa(&formula.bbox); + let ioa_formula = formula.bbox.ioa(&element.bbox); + let score = ioa_element.max(ioa_formula); + if score > best_score { + best_score = score; + best_formula = Some(formula); + } + } + + // Fallback: if no IoA match, try center-containment matching. + // Find formula whose center is within the element bbox (or vice versa). 
+ if best_score < 0.05 { + let elem_center = element.bbox.center(); + let mut best_dist = f32::MAX; + + for formula in formulas { + let fc = formula.bbox.center(); + let fc_inside = fc.x >= element.bbox.x_min() + && fc.x <= element.bbox.x_max() + && fc.y >= element.bbox.y_min() + && fc.y <= element.bbox.y_max(); + let ec_inside = elem_center.x >= formula.bbox.x_min() + && elem_center.x <= formula.bbox.x_max() + && elem_center.y >= formula.bbox.y_min() + && elem_center.y <= formula.bbox.y_max(); + + if fc_inside || ec_inside { + let dx = fc.x - elem_center.x; + let dy = fc.y - elem_center.y; + let dist = dx * dx + dy * dy; + if dist < best_dist { + best_dist = dist; + best_formula = Some(formula); + best_score = 0.05; + } + } + } + } + + if best_score >= 0.05 + && let Some(formula) = best_formula + { + element.text = Some(formula.latex.clone()); + } + } + } + /// Checks if two bounding boxes overlap significantly (intersection dimensions > 3px). /// Matches `get_overlap_boxes_idx` logic. fn is_overlapping(bbox1: &BoundingBox, bbox2: &BoundingBox, cfg: &StitchConfig) -> bool { @@ -1333,6 +1767,30 @@ impl ResultStitcher { inter_w > cfg.overlap_min_pixels && inter_h > cfg.overlap_min_pixels } + /// Checks whether two OCR spans should be grouped into the same visual line. + /// + /// Primary signal follows PaddleX-style line-height overlap: + /// vertical_overlap / min(height1, height2) >= threshold. + /// A small adaptive center-Y fallback is kept for robustness on noisy boxes. 
+ fn is_same_text_line_bbox( + bbox1: &BoundingBox, + bbox2: &BoundingBox, + cfg: &StitchConfig, + ) -> bool { + let h1 = (bbox1.y_max() - bbox1.y_min()).max(1.0); + let h2 = (bbox2.y_max() - bbox2.y_min()).max(1.0); + let inter_h = + (bbox1.y_max().min(bbox2.y_max()) - bbox1.y_min().max(bbox2.y_min())).max(0.0); + let overlap_ratio = inter_h / h1.min(h2); + if overlap_ratio >= cfg.line_height_iou_threshold { + return true; + } + + let adaptive_tol = (h1.min(h2) * 0.5).max(1.0); + let center_delta = (bbox1.center().y - bbox2.center().y).abs(); + center_delta <= adaptive_tol.max(cfg.same_line_y_tolerance * 0.25) + } + fn sort_and_join_texts( texts: &mut Vec<(&TextRegion, &str)>, container_bbox: Option<&BoundingBox>, @@ -1347,92 +1805,172 @@ impl ResultStitcher { // Sort spatially: top-to-bottom, then left-to-right texts.sort_by(|(r1, _), (r2, _)| { - let c1 = r1.bounding_box.center(); - let c2 = r2.bounding_box.center(); - - // Y-difference tolerance for same line (10 pixels) - if (c1.y - c2.y).abs() < cfg.same_line_y_tolerance { - c1.x.partial_cmp(&c2.x).unwrap_or(Ordering::Equal) + r1.bounding_box + .center() + .y + .partial_cmp(&r2.bounding_box.center().y) + .unwrap_or(Ordering::Equal) + }); + let mut lines = Vec::new(); + let mut current_line = Vec::new(); + for item in std::mem::take(texts) { + if current_line.is_empty() { + current_line.push(item); } else { - c1.y.partial_cmp(&c2.y).unwrap_or(Ordering::Equal) + let first_in_line = &current_line[0].0.bounding_box; + if Self::is_same_text_line_bbox(first_in_line, &item.0.bounding_box, cfg) { + current_line.push(item); + } else { + current_line.sort_by(|(r1, _), (r2, _)| { + r1.bounding_box + .center() + .x + .partial_cmp(&r2.bounding_box.center().x) + .unwrap_or(Ordering::Equal) + }); + lines.push(current_line); + current_line = vec![item]; + } } - }); + } + if !current_line.is_empty() { + current_line.sort_by(|(r1, _), (r2, _)| { + r1.bounding_box + .center() + .x + .partial_cmp(&r2.bounding_box.center().x) + 
.unwrap_or(Ordering::Equal) + }); + lines.push(current_line); + } + for mut line in lines { + texts.append(&mut line); + } // Smart text joining following format_line logic: // - Texts on the same line are joined directly (no separator) // - A space is added only if the previous text ends with an English letter // - Newlines are added conditionally based on geometric gap (paragraph break detection) let mut result = String::new(); - let mut prev_y: Option<f32> = None; let mut prev_region: Option<&TextRegion> = None; + tracing::debug!( + "sort_and_join_texts: processing {} text regions", + texts.len() + ); + for (region, text) in texts.iter() { if text.is_empty() { continue; } - let current_y = region.bounding_box.center().y; - - if let Some(py) = prev_y { - // Check if this is a new line (Y-difference > tolerance) - if (current_y - py).abs() > cfg.same_line_y_tolerance { + if let Some(last_region) = prev_region { + if !Self::is_same_text_line_bbox( + &last_region.bounding_box, + &region.bounding_box, + cfg, + ) { // New visual line detected. - // Check for hyphenation: if previous text ends with '-' and current starts with lowercase, - // this is likely a word break that should be joined without the hyphen. - let prev_ends_hyphen = result.ends_with('-'); - let current_starts_lower = - text.chars().next().is_some_and(|c| c.is_lowercase()); + // Decide whether to insert '\n' (hard break) or ' ' (soft break/wrap). + let mut add_newline = false; + let mut is_line_wrap = false; + + if let Some(container) = container_bbox { + let container_width = container.x_max() - container.x_min(); + let right_gap = container.x_max() - last_region.bounding_box.x_max(); + let tail_char = last_non_whitespace_char(&result); + let ends_with_non_break_punct = + tail_char.is_some_and(is_non_break_line_end_punctuation); + // PaddleX: English lines use a larger right-gap threshold. 
+ let paragraph_gap_ratio = + if tail_char.is_some_and(|c| c.is_ascii_alphabetic()) { + 0.5 + } else { + 0.3 + }; + + if !ends_with_non_break_punct + && right_gap > container_width * paragraph_gap_ratio + { + // Previous line ended far from the right edge → paragraph break. + add_newline = true; + } else { + // Previous line extends close to the right edge → line wrap. + is_line_wrap = true; + } + } - if prev_ends_hyphen && current_starts_lower { - // Remove the trailing hyphen and join directly (dehyphenation) + // Dehyphenation: only strip trailing hyphen when the previous line + // is a wrapped line (extends close to container right edge). + // This preserves hyphens in compound words like "real-time", + // "end-to-end", "one-to-many" that end short lines. + // Matches PaddleX format_line behavior where hyphens are stripped + // at line-wrap boundaries. + let prev_ends_hyphen = result.ends_with('-'); + if prev_ends_hyphen && is_line_wrap { + // Line wraps at hyphen → word-break hyphen, remove it result.pop(); // Don't add any separator - words should be joined + } else if add_newline { + if !result.ends_with('\n') { + result.push('\n'); + } } else { - // Decide whether to insert '\n' (hard break) or ' ' (soft break/wrap). - let mut add_newline = false; - - if let Some(container) = container_bbox - && let Some(last_region) = prev_region + // Soft wrap - treat as space if needed (English) or join (CJK) + if let Some(last_char) = result.chars().last() + && last_char != '\n' + && needs_space_after(last_char) { - let container_width = container.x_max() - container.x_min(); - // If the previous line ended far from the right edge, it's likely a paragraph break. - // Heuristic: gap > 30% of container width - // Note: We use container.x_max because we assume LTR text. 
- let right_gap = container.x_max() - last_region.bounding_box.x_max(); - if right_gap > container_width * 0.3 { - add_newline = true; - } - } - // If no container info, we default to NO newline (soft wrap) to avoid discontinuity, - // unless specific patterns dictate otherwise (future work). - - if add_newline { - if !result.ends_with('\n') { - result.push('\n'); - } - } else { - // Soft wrap - treat as space if needed (English) or join (CJK) - if let Some(last_char) = result.chars().last() - && last_char != '\n' - && needs_space_after(last_char) - { - result.push(' '); - } + result.push(' '); } } } else { // Same visual line - join with smart spacing - if let Some(last_char) = result.chars().last() + // PaddleX format_line: add space after English letters OR after formulas + let needs_spacing = if let Some(last_char) = result.chars().last() && last_char != '\n' && needs_space_after(last_char) { + true + } else { + // PaddleX: add space after formula when next content is on same line + last_region.is_formula() + }; + + if needs_spacing { result.push(' '); } } } - result.push_str(text); - prev_y = Some(current_y); + // PaddleX: formula spans are wrapped with $...$ delimiters + // Inline formulas (mixed with text on same line): $formula$ + // Display formulas (standalone line): $$formula$$ (display math) + let is_formula = region.is_formula(); + let text_to_add = if is_formula { + // Don't double-wrap if formula model already added delimiters + let already_wrapped = + text.starts_with('$') || text.starts_with("\\(") || text.starts_with("\\["); + if already_wrapped { + text.to_string() + } else { + // Check if this is a display formula (starts a new line with no other content yet on this line) + // Display formulas typically appear at the start of a line after a newline + let is_display = result.is_empty() || result.ends_with('\n'); + + if is_display { + // Display formula: $$...$$ + format!("$${}$$", text) + } else { + // Inline formula: $...$ + format!("${}$", 
text) + } + } + } else { + text.to_string() + }; + + result.push_str(&text_to_add); prev_region = Some(region); } @@ -1441,10 +1979,44 @@ impl ResultStitcher { update_fn(joined); } - /// Sorts layout elements using the XY-cut algorithm. + /// Sorts layout elements using the enhanced xycut_enhanced algorithm. /// - /// When region blocks are not available, this provides a robust column-aware reading - /// order that matches PP-StructureV3's `sort_by_xycut` behavior. + /// Uses cross-layout detection, direction-aware XY-cut, overlapping box shrinking, + /// weighted distance insertion, and child block association for accurate reading order. + fn sort_layout_elements_enhanced( + elements: &mut Vec<LayoutElement>, + page_width: f32, + page_height: f32, + ) { + use oar_ocr_core::processors::layout_sorting::{SortableElement, sort_layout_enhanced}; + + if elements.is_empty() { + return; + } + + let sortable_elements: Vec<_> = elements + .iter() + .map(|e| SortableElement { + bbox: e.bbox.clone(), + element_type: e.element_type, + num_lines: e.num_lines, + }) + .collect(); + + let sorted_indices = sort_layout_enhanced(&sortable_elements, page_width, page_height); + if sorted_indices.len() != elements.len() { + return; + } + + let sorted_elements: Vec<_> = sorted_indices + .into_iter() + .map(|idx| elements[idx].clone()) + .collect(); + *elements = sorted_elements; + } + + /// Sorts layout elements using the XY-cut algorithm (legacy fallback). + #[allow(dead_code)] fn sort_layout_elements(elements: &mut Vec<LayoutElement>, _width: f32, _cfg: &StitchConfig) { if elements.len() <= 1 { return; @@ -1477,6 +2049,15 @@ fn needs_space_after(c: char) -> bool { c.is_ascii_alphabetic() } +fn last_non_whitespace_char(text: &str) -> Option<char> { + text.chars().rev().find(|c| !c.is_whitespace()) +} + +/// Punctuation that should not trigger hard paragraph breaks across line wraps. 
+fn is_non_break_line_end_punctuation(c: char) -> bool { + matches!(c, ',' | ',' | '、' | ';' | ';' | ':' | ':') +} + #[cfg(test)] mod tests { use super::*; @@ -1492,6 +2073,7 @@ mod tests { confidence: Some(0.9), orientation_angle: None, word_boxes: None, + label: None, } } @@ -1544,6 +2126,7 @@ mod tests { confidence: Some(0.9), orientation_angle: None, word_boxes: None, + label: None, }; let r2 = TextRegion { bounding_box: b2.clone(), @@ -1553,6 +2136,7 @@ mod tests { confidence: Some(0.9), orientation_angle: None, word_boxes: None, + label: None, }; let mut texts = vec![(&r1, "A"), (&r2, "B")]; let cfg = StitchConfig::default(); @@ -1563,6 +2147,70 @@ mod tests { assert_eq!(joined, "A B"); } + #[test] + fn test_sort_and_join_texts_english_line_uses_larger_paragraph_gap_threshold() { + let r1 = make_region(BoundingBox::from_coords(0.0, 0.0, 60.0, 10.0), "Line"); + let r2 = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next"); + let mut texts = vec![(&r1, "Line"), (&r2, "next")]; + let cfg = StitchConfig::default(); + let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0); + let mut joined = String::new(); + ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |j| joined = j); + assert_eq!(joined, "Line next"); + } + + #[test] + fn test_sort_and_join_texts_non_english_tail_keeps_original_paragraph_gap_threshold() { + let r1 = make_region(BoundingBox::from_coords(0.0, 0.0, 60.0, 10.0), "2024"); + let r2 = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next"); + let mut texts = vec![(&r1, "2024"), (&r2, "next")]; + let cfg = StitchConfig::default(); + let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0); + let mut joined = String::new(); + ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |j| joined = j); + assert_eq!(joined, "2024\nnext"); + } + + #[test] + fn test_sort_and_join_texts_non_break_punctuation_suppresses_newline() { + let r1 = 
make_region(BoundingBox::from_coords(0.0, 0.0, 20.0, 10.0), "Note:"); + let r2 = make_region(BoundingBox::from_coords(0.0, 20.0, 40.0, 30.0), "next"); + let mut texts = vec![(&r1, "Note:"), (&r2, "next")]; + let cfg = StitchConfig::default(); + let container = BoundingBox::from_coords(0.0, 0.0, 100.0, 40.0); + let mut joined = String::new(); + ResultStitcher::sort_and_join_texts(&mut texts, Some(&container), &cfg, |j| joined = j); + assert_eq!(joined, "Note:next"); + } + + #[test] + fn test_normalize_checkbox_symbols_in_table_checkbox_like() { + let mut cells = vec![ + TableCell::new(BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0), 1.0).with_text("ü"), + TableCell::new(BoundingBox::from_coords(10.0, 0.0, 20.0, 10.0), 1.0).with_text("X"), + TableCell::new(BoundingBox::from_coords(20.0, 0.0, 30.0, 10.0), 1.0).with_text("L"), + ]; + + ResultStitcher::normalize_checkbox_symbols_in_table(&mut cells); + + assert_eq!(cells[0].text.as_deref(), Some("✓")); + assert_eq!(cells[1].text.as_deref(), Some("✗")); + assert_eq!(cells[2].text.as_deref(), Some("✓")); + } + + #[test] + fn test_normalize_checkbox_symbols_in_table_keeps_ambiguous_when_not_checkbox_like() { + let mut cells = vec![ + TableCell::new(BoundingBox::from_coords(0.0, 0.0, 10.0, 10.0), 1.0).with_text("L"), + TableCell::new(BoundingBox::from_coords(10.0, 0.0, 20.0, 10.0), 1.0).with_text("A"), + ]; + + ResultStitcher::normalize_checkbox_symbols_in_table(&mut cells); + + assert_eq!(cells[0].text.as_deref(), Some("L")); + assert_eq!(cells[1].text.as_deref(), Some("A")); + } + #[test] fn test_find_row_start_index_with_compact_td_tokens() { let tokens = vec![ @@ -1632,6 +2280,7 @@ mod tests { &structure_tokens, &ocr_candidates, 10.0, + None, ) .expect("expected row-aware matching result"); diff --git a/src/oarocr/structure.rs b/src/oarocr/structure.rs index 5a6dfcc..a1b0e31 100644 --- a/src/oarocr/structure.rs +++ b/src/oarocr/structure.rs @@ -1049,6 +1049,7 @@ impl OARStructureBuilder { // PP-StructureV3 overall OCR 
uses DB preprocess with: // - limit_side_len=736 // - limit_type="min" + // - max_side_limit=4000 // We fill these defaults here (only for the structure pipeline) unless the caller // explicitly overrides them via `text_detection_config`. let text_detection_adapter = if let Some(ref model_path) = self.text_detection_model { @@ -1057,12 +1058,29 @@ impl OARStructureBuilder { // Note: image_batch_size batching not yet implemented for structure analysis let mut effective_cfg = self.text_detection_config.clone().unwrap_or_default(); + + // Table-heavy documents are sensitive to detection fragmentation. + // Match PaddleX's lower table-scene threshold when users don't override config. + let has_table_pipeline = self.table_classification_model.is_some() + || self.table_structure_recognition_model.is_some() + || self.wired_table_structure_model.is_some() + || self.wireless_table_structure_model.is_some() + || self.table_cell_detection_model.is_some() + || self.wired_table_cell_model.is_some() + || self.wireless_table_cell_model.is_some(); + if self.text_detection_config.is_none() && has_table_pipeline { + effective_cfg.box_threshold = 0.4; + } + if effective_cfg.limit_side_len.is_none() { effective_cfg.limit_side_len = Some(736); } if effective_cfg.limit_type.is_none() { effective_cfg.limit_type = Some(crate::processors::LimitType::Min); } + if effective_cfg.max_side_len.is_none() { + effective_cfg.max_side_len = Some(4000); + } builder = builder.with_config(effective_cfg); if let Some(ref ort_config) = self.ort_session_config { @@ -1303,6 +1321,7 @@ impl OARStructure { confidence: Some(score), orientation_angle: None, word_boxes: None, + label: None, }); } } @@ -1364,6 +1383,7 @@ impl OARStructure { confidence: Some(*score), orientation_angle: None, word_boxes: None, + label: None, }); } } @@ -1601,6 +1621,7 @@ impl OARStructure { confidence: Some(*score), orientation_angle: None, word_boxes: None, + label: None, }); } } @@ -1830,7 +1851,7 @@ impl OARStructure { 
page_width: f32, page_height: f32, ) { - use oar_ocr_core::processors::layout_sorting::sort_layout_enhanced; + use oar_ocr_core::processors::layout_sorting::{SortableElement, sort_layout_enhanced}; if layout_elements.is_empty() { return; @@ -1838,7 +1859,11 @@ impl OARStructure { let sortable_elements: Vec<_> = layout_elements .iter() - .map(|e| (e.bbox.clone(), e.element_type)) + .map(|e| SortableElement { + bbox: e.bbox.clone(), + element_type: e.element_type, + num_lines: e.num_lines, + }) .collect(); let sorted_indices = sort_layout_enhanced(&sortable_elements, page_width, page_height); @@ -2172,6 +2197,7 @@ impl OARStructure { confidence: Some(score), orientation_angle: None, word_boxes: None, + label: None, }); } } @@ -2322,9 +2348,7 @@ impl OARStructure { // - For each OCR box that overlaps >= k table cells, split at cell boundaries // - Re-run recognition on each split crop // - Replace the original OCR box with the split boxes + texts - let has_detection_backed_table_cells = tables - .iter() - .any(|table| table.cells.iter().any(|cell| cell.confidence < 0.999)); + let has_detection_backed_table_cells = tables.iter().any(|table| !table.is_e2e); if has_detection_backed_table_cells && !text_regions.is_empty() && let Some(ref text_rec_adapter) = self.pipeline.text_recognition_adapter @@ -2411,6 +2435,23 @@ impl OARStructure { } } + // PaddleX: convert_formula_res_to_ocr_format — inject formula results into + // the overall OCR pool so they participate in normal block matching and table + // cell matching. The raw LaTeX text is used here (no $...$ wrapping); + // wrapping is handled by to_markdown() for formula elements, by + // stitch_tables() for table cells, and by sort_and_join_texts for inline formulas. 
+ for formula in &formulas { + let w = formula.bbox.x_max() - formula.bbox.x_min(); + let h = formula.bbox.y_max() - formula.bbox.y_min(); + if w > 1.0 && h > 1.0 { + let mut region = crate::oarocr::TextRegion::new(formula.bbox.clone()); + region.text = Some(formula.latex.clone().into()); + region.confidence = Some(1.0); + region.label = Some("formula".into()); // Mark as formula for inline wrapping + text_regions.push(region); + } + } + // Construct and return result // Ensure rectified_img is always set for markdown image extraction // If no rectification was applied, use current_image diff --git a/src/oarocr/table_analyzer.rs b/src/oarocr/table_analyzer.rs index 5114066..87de6d3 100644 --- a/src/oarocr/table_analyzer.rs +++ b/src/oarocr/table_analyzer.rs @@ -406,36 +406,49 @@ impl<'a> TableAnalyzer<'a> { .or(self.wired_table_structure_adapter), }; - let cell_adapter: Option<&TableCellDetectionAdapter> = if use_e2e_mode { - tracing::info!( - target: "structure", - table_index = idx, - table_type = ?table_type, - "Using E2E mode: skipping cell detection" - ); - None - } else { - tracing::info!( - target: "structure", - table_index = idx, - table_type = ?table_type, - "Using cell detection mode (E2E disabled)" - ); - match table_type { - TableType::Wired => self - .wired_table_cell_adapter - .or(self.table_cell_detection_adapter) - .or(self.wireless_table_cell_adapter), - TableType::Wireless => self - .wireless_table_cell_adapter - .or(self.table_cell_detection_adapter) - .or(self.wired_table_cell_adapter), - TableType::Unknown => self - .table_cell_detection_adapter - .or(self.wired_table_cell_adapter) - .or(self.wireless_table_cell_adapter), - } - }; + // Use cell detection when either: + // 1. E2E mode is disabled, OR + // 2. 
use_cells_trans_to_html is enabled (user wants detected cells instead of E2E cells) + let cell_adapter: Option<&TableCellDetectionAdapter> = + if !use_e2e_mode || use_cells_trans_to_html { + if use_cells_trans_to_html { + tracing::info!( + target: "structure", + table_index = idx, + table_type = ?table_type, + "Using cell detection (cells_trans_to_html enabled)" + ); + } else { + tracing::info!( + target: "structure", + table_index = idx, + table_type = ?table_type, + "Using cell detection mode (E2E disabled)" + ); + } + match table_type { + TableType::Wired => self + .wired_table_cell_adapter + .or(self.table_cell_detection_adapter) + .or(self.wireless_table_cell_adapter), + TableType::Wireless => self + .wireless_table_cell_adapter + .or(self.table_cell_detection_adapter) + .or(self.wired_table_cell_adapter), + TableType::Unknown => self + .table_cell_detection_adapter + .or(self.wired_table_cell_adapter) + .or(self.wireless_table_cell_adapter), + } + } else { + tracing::info!( + target: "structure", + table_index = idx, + table_type = ?table_type, + "Using E2E mode: skipping cell detection" + ); + None + }; let mut structure_tokens_opt: Option> = None; let mut structure_score_opt: Option = None; @@ -601,6 +614,7 @@ impl<'a> TableAnalyzer<'a> { } } + // Use detected cells when in cells_trans_to_html mode (non-E2E) if use_cells_trans_to_html && !use_e2e_mode && !detected_bboxes_crop.is_empty() { cells = detected_bboxes_crop .iter() @@ -610,7 +624,64 @@ impl<'a> TableAnalyzer<'a> { TableCell::new(bbox, *score) }) .collect(); - } else if !detected_bboxes_crop.is_empty() && !cells.is_empty() { + // Clear structure tokens so that the code below regenerates them + // from the detected cell positions with proper grid info. + structure_tokens_opt = None; + } + + // Approach C: In non-E2E mode with cell detection results, store detected + // bboxes in page coordinates for the stitcher's row-aware IoA-based matcher. 
+ // The structure cells (in `cells`) retain grid metadata (row, col, span); + // detected bboxes travel separately for better OCR matching geometry. + let detected_page_bboxes: Option> = + if !use_e2e_mode && !use_cells_trans_to_html && !detected_bboxes_crop.is_empty() { + Some( + detected_bboxes_crop + .iter() + .map(|bbox_crop| bbox_crop.translate(table_x_offset, table_y_offset)) + .collect(), + ) + } else { + None + }; + + // If we have cells but no structure tokens, generate structure from cell positions. + // This ensures cells are ordered correctly to match the generated tokens. + if !cells.is_empty() && structure_tokens_opt.is_none() { + let cell_bboxes_crop: Vec<_> = cells + .iter() + .map(|c| { + BoundingBox::from_coords( + c.bbox.x_min() - table_x_offset, + c.bbox.y_min() - table_y_offset, + c.bbox.x_max() - table_x_offset, + c.bbox.y_max() - table_y_offset, + ) + }) + .collect(); + + if let Some((generated_tokens, cell_order)) = + table_cells_to_html_structure(&cell_bboxes_crop, 5.0) + { + let mut reordered_cells = Vec::with_capacity(cell_order.len()); + for (source_idx, grid_info) in cell_order { + if let Some(source_cell) = cells.get(source_idx) { + let mut cell = source_cell.clone(); + cell = cell + .with_position(grid_info.row, grid_info.col) + .with_span(grid_info.row_span, grid_info.col_span); + reordered_cells.push(cell); + } + } + if !reordered_cells.is_empty() { + cells = reordered_cells; + structure_tokens_opt = Some(generated_tokens); + } + } + } + + // Fallback: if we have detected cells but no cells yet, try to generate from detected boxes + if !detected_bboxes_crop.is_empty() && cells.is_empty() { let structure_bboxes_crop: Vec<_> = cells .iter() .map(|c| { @@ -753,7 +824,8 @@ impl<'a> TableAnalyzer<'a> { let mut final_result = TableResult::new(table_bbox.clone(), table_type) .with_cells(cells) .with_html_structure(html_structure) - .with_structure_tokens(structure_tokens); + .with_structure_tokens(structure_tokens) + 
.with_e2e(use_e2e_mode); if let Some(score) = structure_score_opt { final_result = final_result.with_structure_confidence(score); @@ -763,6 +835,10 @@ impl<'a> TableAnalyzer<'a> { final_result = final_result.with_classification_confidence(conf); } + if let Some(detected_bboxes) = detected_page_bboxes { + final_result = final_result.with_detected_cell_bboxes(detected_bboxes); + } + Ok(Some(final_result)) } } @@ -881,12 +957,12 @@ mod tests { // Transform back to original let original = cell_bbox.rotate_back_to_original(90.0, rotated_width, rotated_height); - // For 90° rotation: (x, y) -> (rotated_height - 1 - y, x) + // For 90° rotation: (x, y) -> (rotated_height - y, x) // Original points: (10, 20), (30, 20), (30, 40), (10, 40) - // Expected: (179, 10), (179, 30), (159, 30), (159, 10) - assert!((original.x_min() - 159.0).abs() < 0.01); + // Expected corners in original space: (160, 10), (180, 10), (180, 30), (160, 30) + assert!((original.x_min() - 160.0).abs() < 0.01); assert!((original.y_min() - 10.0).abs() < 0.01); - assert!((original.x_max() - 179.0).abs() < 0.01); + assert!((original.x_max() - 180.0).abs() < 0.01); assert!((original.y_max() - 30.0).abs() < 0.01); } @@ -896,14 +972,15 @@ mod tests { let rotated_height = 200; let cell_bbox = BoundingBox::from_coords(10.0, 20.0, 30.0, 40.0); - let original = cell_bbox.rotate_back_to_original(180.0, rotated_width, rotated_height); - - // For 180° rotation: (x, y) -> (rotated_width - 1 - x, rotated_height - 1 - y) - // Expected corners: (69, 159), (89, 159), (89, 179), (69, 179) - assert!((original.x_min() - 69.0).abs() < 0.01); - assert!((original.y_min() - 159.0).abs() < 0.01); - assert!((original.x_max() - 89.0).abs() < 0.01); - assert!((original.y_max() - 179.0).abs() < 0.01); + let original = + cell_bbox.rotate_back_to_original(180.0, rotated_width, rotated_height as u32); + + // For 180° rotation: (x, y) -> (rotated_width - x, rotated_height - y) + // Expected corners in original: (70, 160), (90, 160), (90, 
180), (70, 180) + assert!((original.x_min() - 70.0).abs() < 0.01); + assert!((original.y_min() - 160.0).abs() < 0.01); + assert!((original.x_max() - 90.0).abs() < 0.01); + assert!((original.y_max() - 180.0).abs() < 0.01); } #[test] @@ -914,12 +991,12 @@ mod tests { let cell_bbox = BoundingBox::from_coords(10.0, 20.0, 30.0, 40.0); let original = cell_bbox.rotate_back_to_original(270.0, rotated_width, rotated_height); - // For 270° rotation: (x, y) -> (y, rotated_width - 1 - x) - // Expected corners: (20, 69), (40, 69), (40, 89), (20, 89) + // For 270° rotation: (x, y) -> (y, rotated_width - x) + // Expected corners: (20, 70), (40, 70), (40, 90), (20, 90) assert!((original.x_min() - 20.0).abs() < 0.01); - assert!((original.y_min() - 69.0).abs() < 0.01); + assert!((original.y_min() - 70.0).abs() < 0.01); assert!((original.x_max() - 40.0).abs() < 0.01); - assert!((original.y_max() - 89.0).abs() < 0.01); + assert!((original.y_max() - 90.0).abs() < 0.01); } #[test] From a6bc37e507a376a409b6f19cb9d6d18fc8f780f8 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Sat, 7 Mar 2026 01:14:27 +0000 Subject: [PATCH 2/4] fix(deps): update hayro to version 0.5 and ort to version 2.0.0-rc.12; adjust usage in PDF processing and ONNX session handling --- Cargo.toml | 2 +- examples/utils/pdf.rs | 8 +++++--- oar-ocr-core/Cargo.toml | 2 +- oar-ocr-core/src/core/inference/ort_infer_builders.rs | 4 ++-- oar-ocr-core/src/core/inference/session.rs | 4 ++-- 5 files changed, 11 insertions(+), 9 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 01fcd50..2020965 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,5 +66,5 @@ tracing-subscriber = { version = "0.3", features = ["env-filter"] } clap = { version = "4.5.42", features = ["derive"] } tempfile = "3.19" ab_glyph = "0.2" -hayro = "0.4" +hayro = "0.5" regex = "1" diff --git a/examples/utils/pdf.rs b/examples/utils/pdf.rs index 7710c0d..c4ca156 100644 --- a/examples/utils/pdf.rs +++ b/examples/utils/pdf.rs @@ -5,7 +5,7 @@ use std::path::Path; 
use std::sync::Arc; -use hayro::Pdf; +use hayro::hayro_syntax::Pdf; /// Error type for PDF processing. #[derive(Debug)] @@ -118,15 +118,17 @@ impl PdfDocument { None => 2.0, // Default scale factor for better quality }; - // Create render settings + // Create render settings (hayro 0.5 defaults bg_color to TRANSPARENT; + // we need WHITE so the RGBA→RGB conversion produces a white background) let settings = RenderSettings { x_scale: scale, y_scale: scale, + bg_color: hayro::vello_cpu::color::palette::css::WHITE, ..Default::default() }; // Render the page using hayro's render function - let interpreter_settings = hayro::InterpreterSettings::default(); + let interpreter_settings = hayro::hayro_interpret::InterpreterSettings::default(); let pixmap = hayro::render(page, &interpreter_settings, &settings); // Convert pixmap to RGB image diff --git a/oar-ocr-core/Cargo.toml b/oar-ocr-core/Cargo.toml index 04f0544..5f9761a 100644 --- a/oar-ocr-core/Cargo.toml +++ b/oar-ocr-core/Cargo.toml @@ -39,7 +39,7 @@ regex = "1.11.1" serde = { version = "1.0", features = ["derive", "rc"] } serde_json = "1.0" toml = "1.0" -ort = { version = "2.0.0-rc.11", default-features = false, features = [ "std", "ndarray", "tracing", "tls-native", "copy-dylibs" ] } +ort = { version = "2.0.0-rc.12", default-features = false, features = [ "std", "ndarray", "tracing", "tls-native", "copy-dylibs" ] } ndarray = "0.17" nalgebra = "0.34" rayon = "1.8" diff --git a/oar-ocr-core/src/core/inference/ort_infer_builders.rs b/oar-ocr-core/src/core/inference/ort_infer_builders.rs index 3759f64..921a56b 100644 --- a/oar-ocr-core/src/core/inference/ort_infer_builders.rs +++ b/oar-ocr-core/src/core/inference/ort_infer_builders.rs @@ -10,7 +10,7 @@ impl OrtInfer { let path = model_path.as_ref(); let session = session::load_session_with( path, - |builder| builder.with_log_level(LogLevel::Error), + |builder| Ok(builder.with_log_level(LogLevel::Error)?), Some("verify model path and compatibility with selected 
execution providers"), )?; let model_name = "unknown_model".to_string(); @@ -38,7 +38,7 @@ impl OrtInfer { if let Some(cfg) = &common.ort_session { Self::apply_ort_config(builder, cfg) } else { - builder.with_log_level(LogLevel::Error) + Ok(builder.with_log_level(LogLevel::Error)?) } }, Some("check device/EP configuration and model file"), diff --git a/oar-ocr-core/src/core/inference/session.rs b/oar-ocr-core/src/core/inference/session.rs index 9224a62..348dba4 100644 --- a/oar-ocr-core/src/core/inference/session.rs +++ b/oar-ocr-core/src/core/inference/session.rs @@ -11,7 +11,7 @@ const SESSION_CREATION_FAILURE: &str = "failed to create ONNX session"; pub fn load_session(model_path: impl AsRef) -> Result { load_session_with( model_path, - |builder| builder.with_log_level(LogLevel::Error), + |builder| Ok(builder.with_log_level(LogLevel::Error)?), Some("verify model file exists and is readable"), ) } @@ -27,7 +27,7 @@ where { let path = model_path.as_ref(); let builder = Session::builder()?; - let builder = configure_builder(builder)?; + let mut builder = configure_builder(builder)?; let session = builder.commit_from_file(path).map_err(|e| { OCRError::model_load_error(path, SESSION_CREATION_FAILURE, suggestion, Some(e)) })?; From 1c17e1cbd7bf810aeb76e47e56b33c3e30802006 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Sat, 7 Mar 2026 08:27:11 +0000 Subject: [PATCH 3/4] =?UTF-8?q?fix:=20correct=20cell-index=20mismatch=20an?= =?UTF-8?q?d=20cross-layout=20O(n=C2=B3)=20complexity?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- oar-ocr-core/src/processors/layout_sorting.rs | 163 ++++++++++-------- src/oarocr/stitching.rs | 40 +++-- src/oarocr/table_analyzer.rs | 14 +- 3 files changed, 126 insertions(+), 91 deletions(-) diff --git a/oar-ocr-core/src/processors/layout_sorting.rs b/oar-ocr-core/src/processors/layout_sorting.rs index f504bdd..43b8fb3 100644 --- a/oar-ocr-core/src/processors/layout_sorting.rs +++ 
b/oar-ocr-core/src/processors/layout_sorting.rs @@ -305,6 +305,13 @@ fn direction_aware_xycut_sort(blocks: &mut [SortableBlock]) -> Vec = blocks .iter() .map(|b| { @@ -347,6 +352,29 @@ fn detect_cross_layout(blocks: &mut [SortableBlock], _page_width: f32) { let text_line_heights: Vec = blocks.iter().map(|b| b.text_line_height).collect(); + // Precompute the full horizontal-projection overlap matrix (O(n²)) so that + // inner loops can do a single table lookup instead of recomputing the ratio. + let h_proj: Vec> = (0..n) + .map(|i| { + (0..n) + .map(|j| { + calculate_projection_overlap_ratio( + &block_data[i].0, + &block_data[j].0, + SortDirection::Horizontal, + ) + }) + .collect() + }) + .collect(); + + // For each block, the set of other blocks that horizontally overlap with it. + // Both inner loops only act on blocks in this set, so we iterate only over + // neighbors rather than 0..n. + let h_neighbors: Vec> = (0..n) + .map(|i| (0..n).filter(|&j| j != i && h_proj[i][j] > 0.0).collect()) + .collect(); + for block_idx in 0..n { if mask_labels.contains(&block_data[block_idx].1) { continue; @@ -354,11 +382,13 @@ fn detect_cross_layout(blocks: &mut [SortableBlock], _page_width: f32) { let mut mark_block_cross = false; - for ref_idx in 0..n { - if block_idx == ref_idx || mask_labels.contains(&block_data[ref_idx].1) { + // Iterate only over blocks that horizontally overlap with block_idx. + // Any block without horizontal overlap has bbox_overlap == 0 and + // match_proj == 0, so it cannot affect the cross-layout decision. 
+ for &ref_idx in &h_neighbors[block_idx] { + if mask_labels.contains(&block_data[ref_idx].1) { continue; } - // Skip already-marked blocks if blocks[ref_idx].order_label == OrderLabel::CrossLayout { continue; } @@ -380,84 +410,71 @@ fn detect_cross_layout(blocks: &mut [SortableBlock], _page_width: f32) { } } - // Check projection overlap in primary direction (horizontal) - let match_proj = calculate_projection_overlap_ratio( - &block_data[block_idx].0, - &block_data[ref_idx].0, - SortDirection::Horizontal, - ); + // h_proj[block_idx][ref_idx] > 0 is guaranteed by h_neighbors, so + // the match_proj > 0 guard from the original is always satisfied here. - if match_proj > 0.0 { - for second_ref_idx in 0..n { - if second_ref_idx == block_idx - || second_ref_idx == ref_idx - || mask_labels.contains(&block_data[second_ref_idx].1) - { + // Iterate over the same neighbor set for second_ref: every triggering + // condition (bbox_overlap2 > 0.1 or second_match_proj > 0) requires + // horizontal overlap with block_idx, which is exactly h_neighbors. 
+ for &second_ref_idx in &h_neighbors[block_idx] { + if second_ref_idx == ref_idx || mask_labels.contains(&block_data[second_ref_idx].1) + { + continue; + } + if blocks[second_ref_idx].order_label == OrderLabel::CrossLayout { + continue; + } + + let bbox_overlap2 = calculate_overlap_ratio( + &block_data[block_idx].0, + &block_data[second_ref_idx].0, + ); + + if bbox_overlap2 > 0.1 { + if block_data[second_ref_idx].1 == OrderLabel::Vision { + blocks[second_ref_idx].order_label = OrderLabel::CrossLayout; continue; } - if blocks[second_ref_idx].order_label == OrderLabel::CrossLayout { - continue; + if block_data[block_idx].1 == OrderLabel::Vision + || block_data[block_idx].2 < block_data[second_ref_idx].2 + { + mark_block_cross = true; + break; } + } - let bbox_overlap2 = calculate_overlap_ratio( - &block_data[block_idx].0, - &block_data[second_ref_idx].0, - ); - - if bbox_overlap2 > 0.1 { - if block_data[second_ref_idx].1 == OrderLabel::Vision { - blocks[second_ref_idx].order_label = OrderLabel::CrossLayout; - continue; - } - if block_data[block_idx].1 == OrderLabel::Vision - || block_data[block_idx].2 < block_data[second_ref_idx].2 - { - mark_block_cross = true; - break; - } + // second_match_proj > 0 is guaranteed (second_ref_idx ∈ h_neighbors[block_idx]). + // Use precomputed table for ref_match_proj to avoid re-computing. 
+ let ref_match_proj = h_proj[ref_idx][second_ref_idx]; + let secondary_ref_match = calculate_projection_overlap_ratio( + &block_data[ref_idx].0, + &block_data[second_ref_idx].0, + SortDirection::Vertical, + ); + + if ref_match_proj == 0.0 && secondary_ref_match > 0.0 { + if block_data[block_idx].1 == OrderLabel::Vision { + mark_block_cross = true; + break; } - - let second_match_proj = calculate_projection_overlap_ratio( - &block_data[block_idx].0, - &block_data[second_ref_idx].0, - SortDirection::Horizontal, - ); - let ref_match_proj = calculate_projection_overlap_ratio( - &block_data[ref_idx].0, - &block_data[second_ref_idx].0, - SortDirection::Horizontal, - ); - let secondary_ref_match = calculate_projection_overlap_ratio( - &block_data[ref_idx].0, - &block_data[second_ref_idx].0, - SortDirection::Vertical, - ); - - if second_match_proj > 0.0 && ref_match_proj == 0.0 && secondary_ref_match > 0.0 + // Both ref blocks are normal text with sufficient width + if block_data[ref_idx].1 == OrderLabel::NormalText + && block_data[second_ref_idx].1 == OrderLabel::NormalText + && block_data[ref_idx].3 + > text_line_heights[ref_idx] + * CROSS_LAYOUT_REF_TEXT_BLOCK_WORDS_NUM_THRESHOLD + && block_data[second_ref_idx].3 + > text_line_heights[second_ref_idx] + * CROSS_LAYOUT_REF_TEXT_BLOCK_WORDS_NUM_THRESHOLD { - if block_data[block_idx].1 == OrderLabel::Vision { - mark_block_cross = true; - break; - } - // Both ref blocks are normal text with sufficient width - if block_data[ref_idx].1 == OrderLabel::NormalText - && block_data[second_ref_idx].1 == OrderLabel::NormalText - && block_data[ref_idx].3 - > text_line_heights[ref_idx] - * CROSS_LAYOUT_REF_TEXT_BLOCK_WORDS_NUM_THRESHOLD - && block_data[second_ref_idx].3 - > text_line_heights[second_ref_idx] - * CROSS_LAYOUT_REF_TEXT_BLOCK_WORDS_NUM_THRESHOLD - { - mark_block_cross = true; - break; - } + mark_block_cross = true; + break; } } + } - if mark_block_cross { - break; - } + if mark_block_cross { + break; } } diff --git 
a/src/oarocr/stitching.rs b/src/oarocr/stitching.rs index 20e10cd..426bfb8 100644 --- a/src/oarocr/stitching.rs +++ b/src/oarocr/stitching.rs @@ -965,18 +965,28 @@ impl ResultStitcher { } // --- Sort cells into rows --- - // When detected bboxes are available, sort them (better spatial accuracy) - // for the IoA matching loop. - let (match_sorted_indices, match_row_flags) = if let Some(det_bboxes) = cell_bboxes_override - { - let temp_cells: Vec = det_bboxes - .iter() - .map(|b| TableCell::new(b.clone(), 0.5)) - .collect(); - Self::sort_table_cells_boxes(&temp_cells, row_y_tolerance) - } else { - Self::sort_table_cells_boxes(cells, row_y_tolerance) - }; + // When detected bboxes are available we sort them (better spatial accuracy) + // to pick the IoA bbox for OCR matching. We also independently sort the + // structure cells so that the td→cell text-assignment step uses a valid + // index into `cells[]`. Without this separation the det-bbox sort indices + // are silently reused as structure-cell indices, misassigning OCR to wrong + // cells whenever the two orderings differ. + let (match_sorted_indices, cell_sorted_indices, match_row_flags) = + if let Some(det_bboxes) = cell_bboxes_override { + let temp_cells: Vec = det_bboxes + .iter() + .map(|b| TableCell::new(b.clone(), 0.5)) + .collect(); + let (det_sorted, row_flags) = + Self::sort_table_cells_boxes(&temp_cells, row_y_tolerance); + // Sort structure cells independently so their indices stay valid. + let (cell_sorted, _) = Self::sort_table_cells_boxes(cells, row_y_tolerance); + (det_sorted, cell_sorted, row_flags) + } else { + let (sorted, row_flags) = Self::sort_table_cells_boxes(cells, row_y_tolerance); + // When there is no override the two index lists are identical. + (sorted.clone(), sorted, row_flags) + }; if match_sorted_indices.is_empty() || match_row_flags.is_empty() { return None; @@ -1061,13 +1071,15 @@ impl ResultStitcher { // Map td position to the original cell index via sorted ordering. 
// match_aligned[matched_row_idx] + td_index gives the position in the - // sorted cell list, and match_sorted_indices maps that back to cells[]. + // sorted cell list. Use cell_sorted_indices (indices into cells[]) + // rather than match_sorted_indices (which may be indices into det_bboxes + // when cell_bboxes_override is active). let mapped_cell_idx = match_aligned .get(matched_row_idx) .copied() .and_then(|row_start| { let sorted_pos = row_start + td_index; - match_sorted_indices.get(sorted_pos).copied() + cell_sorted_indices.get(sorted_pos).copied() }) .filter(|&idx| idx < cells.len()); diff --git a/src/oarocr/table_analyzer.rs b/src/oarocr/table_analyzer.rs index 87de6d3..eeda3a6 100644 --- a/src/oarocr/table_analyzer.rs +++ b/src/oarocr/table_analyzer.rs @@ -393,6 +393,12 @@ impl<'a> TableAnalyzer<'a> { TableType::Unknown => false, }; + // When use_cells_trans_to_html is set it overrides E2E mode: detected + // cells are used in place of SLANet structure tokens. Anything that + // conditions on "are we actually in E2E mode?" should use this flag + // rather than use_e2e_mode directly. 
+ let effective_use_e2e = use_e2e_mode && !use_cells_trans_to_html; + let structure_adapter: Option<&TableStructureRecognitionAdapter> = match table_type { TableType::Wired => self .wired_table_structure_adapter @@ -482,7 +488,7 @@ impl<'a> TableAnalyzer<'a> { } } Err(e) => { - if use_cells_trans_to_html && !use_e2e_mode { + if use_cells_trans_to_html { tracing::warn!( target: "structure", table_index = idx, @@ -508,7 +514,7 @@ impl<'a> TableAnalyzer<'a> { } } None => { - if !use_cells_trans_to_html || use_e2e_mode { + if !use_cells_trans_to_html || effective_use_e2e { tracing::warn!( target: "structure", table_index = idx, @@ -614,8 +620,8 @@ impl<'a> TableAnalyzer<'a> { } } - // Use detected cells when in cells_trans_to_html mode (non-E2E) - if use_cells_trans_to_html && !use_e2e_mode && !detected_bboxes_crop.is_empty() { + // Use detected cells when in cells_trans_to_html mode (overrides E2E). + if use_cells_trans_to_html && !detected_bboxes_crop.is_empty() { cells = detected_bboxes_crop .iter() .zip(detected_scores.iter()) From b421f4cd21f0ecab7a2f17af6cd7c6ddbfdf75eb Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Sat, 7 Mar 2026 10:46:19 +0000 Subject: [PATCH 4/4] fix: escape dollar signs in math blocks and remove stale allow(dead_code) --- oar-ocr-core/src/domain/structure.rs | 6 ++++-- oar-ocr-core/src/processors/sorting.rs | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/oar-ocr-core/src/domain/structure.rs b/oar-ocr-core/src/domain/structure.rs index 4ef042b..4fcf687 100644 --- a/oar-ocr-core/src/domain/structure.rs +++ b/oar-ocr-core/src/domain/structure.rs @@ -1719,8 +1719,10 @@ pub fn postprocess_markdown(markdown: &str) -> String { if contains_dollar && is_plain_text { result.push_str(&line.replace('$', "\\$")); } else if contains_dollar { - // Remove redundant dollar signs inside the block - result.push_str(&line.replace('$', "")); + // Escape bare dollar signs inside the math block to avoid + // "Can't use function '$' in math mode"
KaTeX errors while + // preserving literal dollars (e.g. \text{$10}). + result.push_str(&line.replace('$', "\\$")); } else { result.push_str(line); } diff --git a/oar-ocr-core/src/processors/sorting.rs b/oar-ocr-core/src/processors/sorting.rs index 24802ea..aeb2972 100644 --- a/oar-ocr-core/src/processors/sorting.rs +++ b/oar-ocr-core/src/processors/sorting.rs @@ -522,7 +522,6 @@ impl SortableRegion { } /// Calculates the IoU (Intersection over Union) between two bounding boxes. -#[allow(dead_code)] pub fn calculate_iou(a: &BoundingBox, b: &BoundingBox) -> f32 { let x1 = a.x_min().max(b.x_min()); let y1 = a.y_min().max(b.y_min());