Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 44 additions & 31 deletions examples/structure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -674,49 +674,62 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// Collect all results for potential concatenation
let mut all_results: Vec<oar_ocr::domain::structure::StructureResult> = Vec::new();

// Process each input source
for (idx, source) in std::mem::take(&mut input_sources).into_iter().enumerate() {
// Collect images and metadata for batch processing (cross-page formula batching)
let mut images: Vec<image::RgbImage> = Vec::new();
let mut source_meta: Vec<(String, String)> = Vec::new(); // (source_path, source_stem)

for source in std::mem::take(&mut input_sources) {
let source_path = source.path();
let source_stem = {
match &source {
InputSource::ImageFile(p) => p
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("result")
.to_string(),
InputSource::PdfPage {
pdf_path,
page_number,
..
} => {
format!(
"{}_page_{:03}",
pdf_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("pdf"),
page_number
)
}
let source_stem = match &source {
InputSource::ImageFile(p) => p
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("result")
.to_string(),
InputSource::PdfPage {
pdf_path,
page_number,
..
} => {
format!(
"{}_page_{:03}",
pdf_path
.file_stem()
.and_then(|s| s.to_str())
.unwrap_or("pdf"),
page_number
)
}
};
info!("\nProcessing input {}: {}", idx + 1, source_path);

let image = match source.into_image() {
Ok(img) => img,
match source.into_image() {
Ok(img) => {
images.push(img);
source_meta.push((source_path, source_stem));
}
Err(err) => {
error!("Failed to load image: {}", err);
continue;
error!("Failed to load image {}: {}", source_path, err);
}
};
}
}

info!(
"Batch processing {} image(s) with cross-page formula batching",
images.len()
);
let batch_results = analyzer.predict_images(images);

let mut result = match analyzer.predict_image(image) {
// Process each result: assign metadata, save, visualize, log
for (idx, (page_result, (source_path, source_stem))) in
batch_results.into_iter().zip(source_meta).enumerate()
{
let mut result = match page_result {
Ok(res) => res,
Err(err) => {
error!("Failed to analyze {}: {}", source_path, err);
continue;
}
};
info!("\nProcessed input {}: {}", idx + 1, source_path);
result.input_path = std::sync::Arc::from(source_path.clone());

// Always collect results for potential concatenation
Expand Down
112 changes: 83 additions & 29 deletions oar-ocr-core/src/domain/structure.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ fn semantic_title_level_and_format(cleaned: &str) -> Option<(usize, String)> {
keyword.as_str(),
"ABSTRACT" | "INTRODUCTION" | "REFERENCES" | "REFERENCE"
) {
return Some((1, trimmed.to_string()));
return Some((2, trimmed.to_string()));
}

if let Some(captures) = TITLE_NUMBERING_REGEX.captures(cleaned) {
Expand Down Expand Up @@ -494,8 +494,9 @@ impl StructureResult {
let mut md = String::new();
let elements = &self.layout_elements;
let paragraph_title_levels = infer_paragraph_title_levels(elements);
let mut last_label: Option<LayoutElementType> = None;
let mut prev_element: Option<&LayoutElement> = None;
// Track the most recent Text/ReferenceContent element so paragraph
// continuation works across intervening figures/tables.
let mut prev_text_element: Option<&LayoutElement> = None;

for (idx, element) in elements.iter().enumerate() {
// PP-StructureV3 markdown ignores auxiliary labels.
Expand Down Expand Up @@ -530,10 +531,10 @@ impl StructureResult {
// Determine seg_start_flag for paragraph continuity (PaddleX get_seg_flag).
// When both current and previous are "text" and seg_start_flag is false,
// they belong to the same paragraph — join without \n\n separator.
let seg_start_flag = get_seg_flag(element, prev_element);
let seg_start_flag = get_seg_flag(element, prev_text_element);

let is_continuation = element.element_type == LayoutElementType::Text
&& last_label == Some(LayoutElementType::Text)
&& prev_text_element.is_some()
&& !seg_start_flag;

// Add separator between elements
Expand All @@ -547,9 +548,18 @@ impl StructureResult {
if !md.is_empty() {
md.push_str("\n\n");
}
md.push_str("# ");
if let Some(text) = &element.text {
let cleaned = clean_ocr_text(text);
// Downgrade section-level keywords to ## when misclassified as DocTitle
let keyword = cleaned.trim().trim_end_matches(':').to_ascii_uppercase();
if matches!(
keyword.as_str(),
"ABSTRACT" | "INTRODUCTION" | "REFERENCES" | "REFERENCE"
) {
md.push_str("## ");
} else {
md.push_str("# ");
}
md.push_str(&cleaned);
}
}
Expand Down Expand Up @@ -621,27 +631,40 @@ impl StructureResult {
};

// Check if this formula is on the same line as adjacent text elements
// to determine if it's an inline formula or display formula
// to determine if it's an inline formula or display formula.
// Only consider the nearest non-formula/non-formula-number neighbor
// on each side, and require BOTH sides to have text on the same line.
// This prevents display formulas from being misclassified as inline
// when they happen to be vertically aligned with a distant text block.
let is_inline = {
// Look for previous non-formula text element on the same line
let has_prev_text = (0..idx).rev().any(|i| {
let prev = &elements[i];
!prev.element_type.is_formula()
&& (prev.element_type == LayoutElementType::Text
let has_prev_text = (0..idx)
.rev()
.find(|&i| {
let t = elements[i].element_type;
!t.is_formula() && t != LayoutElementType::FormulaNumber
})
.is_some_and(|i| {
let prev = &elements[i];
(prev.element_type == LayoutElementType::Text
|| prev.element_type == LayoutElementType::ReferenceContent)
&& is_same_line(&element.bbox, &prev.bbox)
});

// Look for next non-formula text element on the same line
let has_next_text = ((idx + 1)..elements.len()).any(|i| {
let next = &elements[i];
!next.element_type.is_formula()
&& (next.element_type == LayoutElementType::Text
&& is_same_line(&element.bbox, &prev.bbox)
});

let has_next_text = ((idx + 1)..elements.len())
.find(|&i| {
let t = elements[i].element_type;
!t.is_formula() && t != LayoutElementType::FormulaNumber
})
.is_some_and(|i| {
let next = &elements[i];
(next.element_type == LayoutElementType::Text
|| next.element_type == LayoutElementType::ReferenceContent)
&& is_same_line(&element.bbox, &next.bbox)
});
&& is_same_line(&element.bbox, &next.bbox)
});

has_prev_text || has_next_text
// Require text on BOTH sides for inline — a formula with text
// only on one side is almost always a display equation.
has_prev_text && has_next_text
};

if is_inline {
Expand Down Expand Up @@ -788,8 +811,13 @@ impl StructureResult {
// Default text elements - following PaddleX's text handling
_ => {
if let Some(text) = &element.text {
// For text continuation (same paragraph), join directly
if is_continuation {
let cleaned = clean_ocr_text(text);
if has_bullet_markers(&cleaned) {
if !md.is_empty() {
md.push_str("\n\n");
}
format_as_bullet_list(&cleaned, &mut md);
} else if is_continuation {
let formatted = format_text_block(text);
md.push_str(&formatted);
} else {
Expand All @@ -803,8 +831,11 @@ impl StructureResult {
}
}

last_label = Some(element.element_type);
prev_element = Some(element);
if element.element_type == LayoutElementType::Text
|| element.element_type == LayoutElementType::ReferenceContent
{
prev_text_element = Some(element);
}
}
md.trim().to_string()
}
Expand Down Expand Up @@ -1342,6 +1373,29 @@ fn format_vision_footnote_block(text: &str) -> String {
step1.replace('\n', "\n\n")
}

/// Bullet marker characters commonly found in OCR text.
const BULLET_MARKERS: &[char] = &['•', '●', '◦', '▪', '◆'];

/// Checks if text contains bullet markers that should be formatted as a list.
///
/// Returns `true` when at least one character of `text` is one of
/// [`BULLET_MARKERS`], regardless of where in the text it appears.
fn has_bullet_markers(text: &str) -> bool {
    // A `&[char]` pattern matches any one of the listed characters.
    text.contains(BULLET_MARKERS)
}

/// Formats text with bullet markers as a markdown list, appending to `md`.
///
/// Splits on any bullet marker character so mixed markers
/// (e.g. `• item1 ▪ item2`) are all handled correctly. Each non-empty,
/// trimmed segment that follows a marker becomes one `- item` line
/// terminated by `\n`.
///
/// Text that appears *before* the first marker (e.g. the `Key features:` in
/// `Key features: • fast • safe`) is not a list item: it is written as a
/// plain paragraph followed by a blank line, and the list starts after it.
/// Consequently, text without any marker is emitted as a plain paragraph.
///
/// Segments that are empty after trimming (consecutive or trailing markers)
/// are skipped. Nothing is written for input that is empty or consists only
/// of markers and whitespace.
fn format_as_bullet_list(text: &str, md: &mut String) {
    let mut segments = text.split(BULLET_MARKERS);

    // `split` always yields at least one segment; the first one is whatever
    // precedes the first marker and therefore is not itself a bullet item.
    let lead_in = segments.next().map(str::trim).unwrap_or_default();
    if !lead_in.is_empty() {
        md.push_str(lead_in);
        // Blank line so the list is not parsed as part of the paragraph.
        md.push_str("\n\n");
    }

    for item in segments.map(str::trim).filter(|item| !item.is_empty()) {
        md.push_str("- ");
        md.push_str(item);
        md.push('\n');
    }
}

/// Checks if a character is a Chinese character.
///
/// Used to determine spacing rules when concatenating pages.
Expand Down Expand Up @@ -2602,11 +2656,11 @@ mod tests {
#[test]
fn test_format_title_with_level_keywords() {
let (level, text) = format_title_with_level("Abstract", None);
assert_eq!(level, 1);
assert_eq!(level, 2);
assert_eq!(text, "Abstract");

let (level, text) = format_title_with_level("References:", None);
assert_eq!(level, 1);
assert_eq!(level, 2);
assert_eq!(text, "References:");
}

Expand Down
2 changes: 1 addition & 1 deletion oar-ocr-core/src/processors/formula_preprocess.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ impl FormulaPreprocessor {
let final_width = new_width.min(target_width);
let final_height = new_height.min(target_height);

let resized = resize(img, final_width, final_height, FilterType::Lanczos3);
let resized = resize(img, final_width, final_height, FilterType::Triangle);

// Calculate padding to center the image
let delta_width = target_width - final_width;
Expand Down
Loading