From 605ff3543216f528793ca4244cc1f2d8fb3ae449 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 22 Sep 2025 13:02:40 +0200 Subject: [PATCH 01/14] added the field Signed-off-by: Peter Staar --- docling_core/types/doc/document.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 07693b88..932fdc40 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -952,6 +952,8 @@ class NodeItem(BaseModel): model_config = ConfigDict(extra="forbid") + summary: Optional[str] = None # + def get_ref(self) -> RefItem: """get_ref.""" return RefItem(cref=self.self_ref) From 79a13470a7502d107e1935a7938171b995b6722e Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Mon, 22 Sep 2025 13:18:50 +0200 Subject: [PATCH 02/14] added the MD-summary serializer Signed-off-by: Peter Staar --- .../transforms/serializer/markdown_summary.py | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 docling_core/transforms/serializer/markdown_summary.py diff --git a/docling_core/transforms/serializer/markdown_summary.py b/docling_core/transforms/serializer/markdown_summary.py new file mode 100644 index 00000000..852b1bbc --- /dev/null +++ b/docling_core/transforms/serializer/markdown_summary.py @@ -0,0 +1,104 @@ +from docling_core.types.doc import ( + ContentLayer, + DocItemLabel, + DoclingDocument, + NodeItem, + GroupItem, + GroupLabel, + DocItem, + LevelNumber, + ListItem, + SectionHeaderItem, + TableItem, + TextItem, + TitleItem, + RefItem, + PictureItem, +) + +class MarkdownSummaryParams(CommonParams): + """Markdown-specific serialization parameters.""" + + use_markdown_headers: bool = False + +class MarkdownSummarySerializer(DocSerializer): + """Markdown-specific document summary serializer.""" + + params: MarkdownParams = MarkdownParams() + + @override + def serialize_bold(self, text: str, **kwargs: Any): + """Apply Markdown-specific bold serialization.""" + return f"**{text}**" + + @override + def serialize_italic(self, text: str, **kwargs: Any): + """Apply Markdown-specific italic serialization.""" + return f"*{text}*" + + @override + def serialize_strikethrough(self, text: str, **kwargs: Any): + """Apply Markdown-specific strikethrough serialization.""" + return f"~~{text}~~" + + @override + def serialize_hyperlink( + self, + text: str, + hyperlink: Union[AnyUrl, Path], + **kwargs: Any, + ): + """Apply Markdown-specific hyperlink serialization.""" + return f"[{text}]({str(hyperlink)})" + + @override + def serialize_doc( + self, + *, + parts: list[SerializationResult], + **kwargs: Any, + ) -> SerializationResult: + """Serialize a document out of its parts.""" + text_res = "\n\n".join([p.text for p in parts if p.text]) + + return create_ser_result(text=text_res, span_source=parts) + + def _create_document_outline(self, doc: DoclingDocument) -> str: + label_counter: dict[DocItemLabel, int] = { + DocItemLabel.TABLE: 0, + DocItemLabel.PICTURE: 0, + DocItemLabel.TEXT: 0, + } + + lines = [] + for item, level in doc.iterate_items(with_groups=True): + if isinstance(item, TitleItem): + lines.append(f"title (reference={item.self_ref}): {item.text}") + + elif isinstance(item, SectionHeaderItem): + lines.append( + f"section-header (level={item.level}, reference={item.self_ref}): {item.text}" + ) + + elif isinstance(item, ListItem): + continue + + elif isinstance(item, TextItem): + lines.append(f"{item.label} (reference={item.self_ref})") + + elif isinstance(item, TableItem): + label_counter[item.label] += 1 + lines.append( + f"{item.label} {label_counter[item.label]} (reference={item.self_ref})" + ) + + elif isinstance(item, PictureItem): + label_counter[item.label] += 1 + lines.append( + f"{item.label} {label_counter[item.label]} (reference={item.self_ref})" + ) + + outline = "\n\n".join(lines) + + return outline + From 003883bae0fffdbe7d22df7a873aac53461b24aa Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 23 Sep 2025 09:58:29 +0200 Subject: [PATCH 03/14] added the first attempt at a markdown_summary Signed-off-by: Peter Staar --- .../transforms/serializer/markdown_summary.py | 168 +++++++++++++----- 1 file changed, 120 insertions(+), 48 deletions(-) diff --git a/docling_core/transforms/serializer/markdown_summary.py b/docling_core/transforms/serializer/markdown_summary.py index 852b1bbc..4ba0379f 100644 --- a/docling_core/transforms/serializer/markdown_summary.py +++ b/docling_core/transforms/serializer/markdown_summary.py @@ -1,44 +1,86 @@ +from typing import Any, Optional, Union +from pathlib import Path + +from pydantic import AnyUrl +from typing_extensions import override + +from docling_core.transforms.serializer.base import ( + BaseAnnotationSerializer, + BaseDocSerializer, + BaseFallbackSerializer, + BaseFormSerializer, + BaseInlineSerializer, + BaseKeyValueSerializer, + BaseListSerializer, + BasePictureSerializer, + BaseTableSerializer, + BaseTextSerializer, + SerializationResult, +) +from docling_core.transforms.serializer.common import ( + CommonParams, + DocSerializer, + create_ser_result, +) +from docling_core.transforms.serializer.markdown import ( + MarkdownAnnotationSerializer, + MarkdownFallbackSerializer, + MarkdownFormSerializer, + MarkdownInlineSerializer, + MarkdownKeyValueSerializer, + MarkdownListSerializer, + MarkdownPictureSerializer, + MarkdownTableSerializer, + MarkdownTextSerializer, +) from docling_core.types.doc import ( - ContentLayer, + DocItem, DocItemLabel, DoclingDocument, - NodeItem, - GroupItem, - GroupLabel, - DocItem, - LevelNumber, ListItem, + NodeItem, + PictureItem, SectionHeaderItem, TableItem, TextItem, TitleItem, - RefItem, - PictureItem, ) + class MarkdownSummaryParams(CommonParams): - """Markdown-specific serialization parameters.""" + """Markdown-specific serialization parameters for outline.""" use_markdown_headers: bool = False - + + class MarkdownSummarySerializer(DocSerializer): """Markdown-specific document summary serializer.""" - params: MarkdownParams = MarkdownParams() + # Provide required serializer attributes to satisfy DocSerializer’s model + text_serializer: BaseTextSerializer = MarkdownTextSerializer() + table_serializer: BaseTableSerializer = MarkdownTableSerializer() + picture_serializer: BasePictureSerializer = MarkdownPictureSerializer() + key_value_serializer: BaseKeyValueSerializer = MarkdownKeyValueSerializer() + form_serializer: BaseFormSerializer = MarkdownFormSerializer() + fallback_serializer: BaseFallbackSerializer = MarkdownFallbackSerializer() + + list_serializer: BaseListSerializer = MarkdownListSerializer() + inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer() + + annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer() + + params: MarkdownSummaryParams = MarkdownSummaryParams() @override - def serialize_bold(self, text: str, **kwargs: Any): - """Apply Markdown-specific bold serialization.""" + def serialize_bold(self, text: str, **kwargs: Any) -> str: return f"**{text}**" @override - def serialize_italic(self, text: str, **kwargs: Any): - """Apply Markdown-specific italic serialization.""" + def serialize_italic(self, text: str, **kwargs: Any) -> str: return f"*{text}*" @override - def serialize_strikethrough(self, text: str, **kwargs: Any): - """Apply Markdown-specific strikethrough serialization.""" + def serialize_strikethrough(self, text: str, **kwargs: Any) -> str: return f"~~{text}~~" @override @@ -47,10 +89,19 @@ def serialize_hyperlink( text: str, hyperlink: Union[AnyUrl, Path], **kwargs: Any, - ): - """Apply Markdown-specific hyperlink serialization.""" + ) -> str: return f"[{text}]({str(hyperlink)})" + @override + def get_parts( + self, + item: Optional[NodeItem] = None, + **kwargs: Any, + ) -> list[SerializationResult]: + """Return a single part containing the document (or subtree) outline.""" + outline = self._create_document_outline(root=item, **kwargs) + return [create_ser_result(text=outline, span_source=[])] if outline else [] + @override def serialize_doc( self, @@ -58,47 +109,68 @@ def serialize_doc( parts: list[SerializationResult], **kwargs: Any, ) -> SerializationResult: - """Serialize a document out of its parts.""" text_res = "\n\n".join([p.text for p in parts if p.text]) - return create_ser_result(text=text_res, span_source=parts) - - def _create_document_outline(self, doc: DoclingDocument) -> str: + + def _create_document_outline( + self, + *, + root: Optional[NodeItem] = None, + **kwargs: Any, + ) -> str: + """Create an outline, respecting params and recursive traversal.""" + params = self.params.merge_with_patch(patch=kwargs) + excluded = self.get_excluded_refs(**kwargs) + label_counter: dict[DocItemLabel, int] = { DocItemLabel.TABLE: 0, DocItemLabel.PICTURE: 0, DocItemLabel.TEXT: 0, } + lines: list[str] = [] + visited: set[str] = set() - lines = [] - for item, level in doc.iterate_items(with_groups=True): - if isinstance(item, TitleItem): - lines.append(f"title (reference={item.self_ref}): {item.text}") - - elif isinstance(item, SectionHeaderItem): - lines.append( - f"section-header (level={item.level}, reference={item.self_ref}): {item.text}" - ) + # Iterate depth-first with groups, similar to MarkdownSerializer + for node, level in self.doc.iterate_items(root=root, with_groups=True): + if node.self_ref in visited: + continue + visited.add(node.self_ref) - elif isinstance(item, ListItem): + # Skip list items in outline + if isinstance(node, ListItem): continue - - elif isinstance(item, TextItem): - lines.append(f"{item.label} (reference={item.self_ref})") - - elif isinstance(item, TableItem): - label_counter[item.label] += 1 + + # Respect excluded refs and skip caption text items + if isinstance(node, DocItem): + if node.self_ref in excluded: + continue + if isinstance(node, TextItem) and node.self_ref in self._captions_of_some_item: + continue + + if isinstance(node, TitleItem): + if params.use_markdown_headers: + lines.append(f"# {node.text}") + else: + lines.append(f"title (reference={node.self_ref}): {node.text}") + elif isinstance(node, SectionHeaderItem): + if params.use_markdown_headers: + hashes = "#" * (node.level + 1) + lines.append(f"{hashes} {node.text}") + else: + lines.append( + f"section-header (level={node.level}, reference={node.self_ref}): {node.text}" + ) + elif isinstance(node, TextItem): + lines.append(f"{node.label} (reference={node.self_ref})") + elif isinstance(node, TableItem): + label_counter[DocItemLabel.TABLE] += 1 lines.append( - f"{item.label} {label_counter[item.label]} (reference={item.self_ref})" + f"{node.label} {label_counter[DocItemLabel.TABLE]} (reference={node.self_ref})" ) - - elif isinstance(item, PictureItem): - label_counter[item.label] += 1 + elif isinstance(node, PictureItem): + label_counter[DocItemLabel.PICTURE] += 1 lines.append( - f"{item.label} {label_counter[item.label]} (reference={item.self_ref})" + f"{node.label} {label_counter[DocItemLabel.PICTURE]} (reference={node.self_ref})" ) - outline = "\n\n".join(lines) - - return outline - + return "\n\n".join(lines) From e64b83d5aec3c7f469c33c53571908a09df6f056 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 23 Sep 2025 10:21:20 +0200 Subject: [PATCH 04/14] added a test for markdown_summary Signed-off-by: Peter Staar --- .../transforms/serializer/markdown_summary.py | 5 ++ test/test_markdown_summary.py | 56 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 test/test_markdown_summary.py diff --git a/docling_core/transforms/serializer/markdown_summary.py b/docling_core/transforms/serializer/markdown_summary.py index 4ba0379f..e511b4a5 100644 --- a/docling_core/transforms/serializer/markdown_summary.py +++ b/docling_core/transforms/serializer/markdown_summary.py @@ -92,6 +92,11 @@ def serialize_hyperlink( ) -> str: return f"[{text}]({str(hyperlink)})" + @override + def requires_page_break(self) -> bool: + """Whether to add page breaks.""" + return False + @override def get_parts( self, diff --git a/test/test_markdown_summary.py b/test/test_markdown_summary.py new file mode 100644 index 00000000..6c3b72dc --- /dev/null +++ b/test/test_markdown_summary.py @@ -0,0 +1,56 @@ +"""Tests for MarkdownSummarySerializer (document outline).""" + +from pathlib import Path + +import pytest + +from docling_core.transforms.serializer.markdown_summary import ( + MarkdownSummaryParams, + MarkdownSummarySerializer, +) + +from .test_docling_doc import _construct_doc + + +@pytest.mark.parametrize("use_md_headers", [False, True]) +def test_markdown_summary_outline(use_md_headers: bool): + # Build a representative document with title, headers, text, lists, table, and pictures + doc = _construct_doc() + + ser = MarkdownSummarySerializer( + doc=doc, + params=MarkdownSummaryParams(use_markdown_headers=use_md_headers), + ) + + outline = ser.serialize().text + + print(outline) + + # Leading list items should not appear in the outline + assert "item of leading list" not in outline + + # Captions should be excluded from outline + assert "This is the caption of table 1." not in outline + assert "This is the caption of figure 1." not in outline + assert "This is the caption of figure 2." not in outline + + # Title and section header formatting based on params + if use_md_headers: + # Markdown-style headers + assert "# Title of the Document" in outline + assert "## 1. Introduction" in outline + # Ensure we don't get the verbose label style when using MD headers + assert "title (reference=" not in outline.splitlines()[0] + else: + # Verbose outline lines with references + first_line = outline.splitlines()[0] + assert first_line.startswith("title (reference=") and first_line.endswith( + "): Title of the Document" + ) + # Section header line contains level and reference + assert "section-header (level=1, reference=" in outline + + # Tables and pictures should be numbered and listed with references + assert "table 1 (reference=" in outline + assert "picture 1 (reference=" in outline + From da4e775c206ce8591503faaec169072cf562d92c Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 23 Sep 2025 13:55:41 +0200 Subject: [PATCH 05/14] refactored the markdown summary serializer Signed-off-by: Peter Staar --- .../transforms/serializer/markdown_summary.py | 172 +++++++++++++++--- docling_core/types/doc/document.py | 2 +- ...ndent_mdhdr_false_indent_true_size_2.gt.md | 59 ++++++ ...indent_mdhdr_true_indent_true_size_2.gt.md | 59 ++++++ ...sum_outline_mdhdr_false_indent_false.gt.md | 59 ++++++ ...dsum_outline_mdhdr_false_indent_true.gt.md | 59 ++++++ ...dsum_outline_mdhdr_true_indent_false.gt.md | 59 ++++++ ...mdsum_outline_mdhdr_true_indent_true.gt.md | 59 ++++++ ...of_contents_mdhdr_false_indent_false.gt.md | 3 + ..._of_contents_mdhdr_false_indent_true.gt.md | 3 + ..._of_contents_mdhdr_true_indent_false.gt.md | 3 + ...e_of_contents_mdhdr_true_indent_true.gt.md | 3 + test/test_markdown_summary.py | 89 +++++---- 13 files changed, 570 insertions(+), 59 deletions(-) create mode 100644 test/data/doc/constructed_mdsum_indent_mdhdr_false_indent_true_size_2.gt.md create mode 100644 test/data/doc/constructed_mdsum_indent_mdhdr_true_indent_true_size_2.gt.md create mode 100644 test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_false.gt.md create mode 100644 test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_true.gt.md create mode 100644 test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_false.gt.md create mode 100644 test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_true.gt.md create mode 100644 test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_false.gt.md create mode 100644 test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_true.gt.md create mode 100644 test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_false.gt.md create mode 100644 test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_true.gt.md diff --git a/docling_core/transforms/serializer/markdown_summary.py b/docling_core/transforms/serializer/markdown_summary.py index e511b4a5..bbfd3ac6 100644 --- a/docling_core/transforms/serializer/markdown_summary.py +++ b/docling_core/transforms/serializer/markdown_summary.py @@ -1,5 +1,6 @@ from typing import Any, Optional, Union from pathlib import Path +from enum import Enum from pydantic import AnyUrl from typing_extensions import override @@ -34,9 +35,12 @@ MarkdownTextSerializer, ) from docling_core.types.doc import ( + CodeItem, DocItem, DocItemLabel, DoclingDocument, + FormItem, + ListGroup, ListItem, NodeItem, PictureItem, @@ -46,12 +50,29 @@ TitleItem, ) - +class MarkdownSummaryMode(str, Enum): + + OUTLINE = "outline" + TABLE_OF_CONTENTS = "table_of_contents" + class MarkdownSummaryParams(CommonParams): """Markdown-specific serialization parameters for outline.""" + mode: MarkdownSummaryMode = MarkdownSummaryMode.OUTLINE + use_markdown_headers: bool = False + add_label_counter: bool = False + add_references: bool = True + add_summary: bool = True + + # Indentation control: when enabled, indent each line according to + # the latest encountered section-header level (title treated as level 0). + indent_by_section_level: bool = False + indent_size: int = 2 + + toc_labels: list[DocItemLabel] = [DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER] + class MarkdownSummarySerializer(DocSerializer): """Markdown-specific document summary serializer.""" @@ -104,8 +125,8 @@ def get_parts( **kwargs: Any, ) -> list[SerializationResult]: """Return a single part containing the document (or subtree) outline.""" - outline = self._create_document_outline(root=item, **kwargs) - return [create_ser_result(text=outline, span_source=[])] if outline else [] + return self._create_document_outline(root=item, **kwargs) + #return [create_ser_result(text=outline, span_source=[])] if outline else [] @override def serialize_doc( @@ -122,60 +143,157 @@ def _create_document_outline( *, root: Optional[NodeItem] = None, **kwargs: Any, - ) -> str: + ) -> list[SerializationResult]: """Create an outline, respecting params and recursive traversal.""" params = self.params.merge_with_patch(patch=kwargs) excluded = self.get_excluded_refs(**kwargs) - label_counter: dict[DocItemLabel, int] = { - DocItemLabel.TABLE: 0, - DocItemLabel.PICTURE: 0, - DocItemLabel.TEXT: 0, - } + # Per-label counters; used consistently when params.add_label_counter is True + # and always for table/picture numbering. + label_counter: dict[DocItemLabel, int] = {} lines: list[str] = [] visited: set[str] = set() + result: list[SerializationResult] = [] + + # Track latest section header level for indentation + current_section_level: int = 0 + + # Helper to increment and fetch the counter for a given label + def _next_idx(lbl: DocItemLabel) -> int: + label_counter[lbl] = label_counter.get(lbl, 0) + 1 + return label_counter[lbl] + + # Helper to identify if the label should be included in the table-of-contents + def _include(lbl: DocItemLabel) -> int: + if params.mode==MarkdownSummaryMode.TABLE_OF_CONTENTS and \ + (lbl not in params.toc_labels): + return False + + return True + # Iterate depth-first with groups, similar to MarkdownSerializer for node, level in self.doc.iterate_items(root=root, with_groups=True): if node.self_ref in visited: continue + visited.add(node.self_ref) + if not _include(lbl=node.label): + continue + + summary = "" + if params.add_summary and \ + (node.summary is not None) and \ + isinstance(node.summary, str): + summary = node.summary + # Skip list items in outline if isinstance(node, ListItem): continue - # Respect excluded refs and skip caption text items + # Respect excluded refs if isinstance(node, DocItem): if node.self_ref in excluded: continue if isinstance(node, TextItem) and node.self_ref in self._captions_of_some_item: continue + line:str = "" + + # Base label string (normalize underscores to hyphens) + node_label = str(node.label).replace("_", "-") + if params.add_label_counter and not isinstance(node, (TableItem, PictureItem)): + # Apply generic counters to non-table/picture items + node_label = f"{node_label} {_next_idx(node.label)}" + + # Build optional reference snippet only when enabled + ref_part = f" (reference={node.self_ref})" if params.add_references else "" + if isinstance(node, TitleItem): + + raw_text = self.text_serializer.serialize( + item=node, doc_serializer=self, doc=self.doc + ).text + if params.use_markdown_headers: - lines.append(f"# {node.text}") + # raw_text already includes the heading marker + text = raw_text.lstrip() + line = f"{text}{ref_part}" else: - lines.append(f"title (reference={node.self_ref}): {node.text}") + # strip leading markdown header markers for verbose representation + text = raw_text.lstrip().lstrip("# ") if raw_text.startswith("#") else raw_text + if params.add_references: + line = f"{node_label}{ref_part}: {text}" + else: + line = f"{node_label}: {text}" + elif isinstance(node, SectionHeaderItem): + + raw_text = self.text_serializer.serialize( + item=node, doc_serializer=self, doc=self.doc + ).text + if params.use_markdown_headers: - hashes = "#" * (node.level + 1) - lines.append(f"{hashes} {node.text}") + # raw_text already includes the correct number of '#' + text = raw_text.lstrip() + if params.add_references: + line = f"{text} (level={node.level}, reference={node.self_ref})" + else: + line = f"{text} (level={node.level})" else: - lines.append( - f"section-header (level={node.level}, reference={node.self_ref}): {node.text}" - ) + # strip leading markdown header markers for verbose representation + stripped = raw_text.lstrip() + while stripped.startswith("#"): + stripped = stripped.lstrip("#").lstrip() + text = stripped + if params.add_references: + line = f"{node_label} (level={node.level}, reference={node.self_ref}): {text}" + else: + line = f"{node_label} (level={node.level}): {text}" + + # Update current section level for subsequent items + current_section_level = node.level + + elif isinstance(node, ListGroup): + # Skip listing list groups in summary to avoid leading list noise + line = "" + elif isinstance(node, TextItem): - lines.append(f"{node.label} (reference={node.self_ref})") + line = f"{node_label}{ref_part}" + + elif isinstance(node, FormItem): + line = f"{node_label}{ref_part}" + + elif isinstance(node, CodeItem): + line = f"{node_label}{ref_part}" + elif isinstance(node, TableItem): - label_counter[DocItemLabel.TABLE] += 1 - lines.append( - f"{node.label} {label_counter[DocItemLabel.TABLE]} (reference={node.self_ref})" - ) + # Tables are always numbered in the summary + line = f"{node_label} {_next_idx(DocItemLabel.TABLE)}{ref_part}" + elif isinstance(node, PictureItem): - label_counter[DocItemLabel.PICTURE] += 1 - lines.append( - f"{node.label} {label_counter[DocItemLabel.PICTURE]} (reference={node.self_ref})" - ) + # Pictures are always numbered in the summary + line = f"{node_label} {_next_idx(DocItemLabel.PICTURE)}{ref_part}" + + if len(summary)>0: + line += f" (summary={summary})" - return "\n\n".join(lines) + # Apply indentation based on latest section level if enabled + if params.indent_by_section_level: + indent_level = current_section_level + # For a section-header, indent by its own level + if isinstance(node, SectionHeaderItem): + indent_level = node.level + indent = " " * (params.indent_size * indent_level) + line = f"{indent}{line}" if line else line + + if line: + result.append( + create_ser_result( + text=line, + span_source=node if isinstance(node, DocItem) else [], + ) + ) + + return result diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 932fdc40..32d7c533 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -952,7 +952,7 @@ class NodeItem(BaseModel): model_config = ConfigDict(extra="forbid") - summary: Optional[str] = None # + summary: Optional[str] = Field(default=None, exclude=True) # optional, not serialized def get_ref(self) -> RefItem: """get_ref.""" diff --git a/test/data/doc/constructed_mdsum_indent_mdhdr_false_indent_true_size_2.gt.md b/test/data/doc/constructed_mdsum_indent_mdhdr_false_indent_true_size_2.gt.md new file mode 100644 index 00000000..da6a4cb7 --- /dev/null +++ b/test/data/doc/constructed_mdsum_indent_mdhdr_false_indent_true_size_2.gt.md @@ -0,0 +1,59 @@ +title (reference=#/texts/1): Title of the Document + +text (reference=#/texts/2) + +text (reference=#/texts/3) + + section-header (level=1, reference=#/texts/4): 1. Introduction + + text (reference=#/texts/5) + + table 1 (reference=#/tables/0) + + picture 1 (reference=#/pictures/0) + + picture 2 (reference=#/pictures/1) + + text (reference=#/texts/24) + + code (reference=#/texts/25) + + text (reference=#/texts/26) + + text (reference=#/texts/28) + + formula (reference=#/texts/29) + + text (reference=#/texts/30) + + text (reference=#/texts/31) + + code (reference=#/texts/32) + + text (reference=#/texts/33) + + formula (reference=#/texts/34) + + form (reference=#/form_items/0) + + text (reference=#/texts/35) + + text (reference=#/texts/36) + + text (reference=#/texts/37) + + text (reference=#/texts/38) + + text (reference=#/texts/39) + + text (reference=#/texts/40) + + text (reference=#/texts/41) + + text (reference=#/texts/42) + + text (reference=#/texts/43) + + text (reference=#/texts/44) + + text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_indent_mdhdr_true_indent_true_size_2.gt.md b/test/data/doc/constructed_mdsum_indent_mdhdr_true_indent_true_size_2.gt.md new file mode 100644 index 00000000..68f8efc7 --- /dev/null +++ b/test/data/doc/constructed_mdsum_indent_mdhdr_true_indent_true_size_2.gt.md @@ -0,0 +1,59 @@ +# Title of the Document (reference=#/texts/1) + +text (reference=#/texts/2) + +text (reference=#/texts/3) + + ## 1. Introduction (level=1, reference=#/texts/4) + + text (reference=#/texts/5) + + table 1 (reference=#/tables/0) + + picture 1 (reference=#/pictures/0) + + picture 2 (reference=#/pictures/1) + + text (reference=#/texts/24) + + code (reference=#/texts/25) + + text (reference=#/texts/26) + + text (reference=#/texts/28) + + formula (reference=#/texts/29) + + text (reference=#/texts/30) + + text (reference=#/texts/31) + + code (reference=#/texts/32) + + text (reference=#/texts/33) + + formula (reference=#/texts/34) + + form (reference=#/form_items/0) + + text (reference=#/texts/35) + + text (reference=#/texts/36) + + text (reference=#/texts/37) + + text (reference=#/texts/38) + + text (reference=#/texts/39) + + text (reference=#/texts/40) + + text (reference=#/texts/41) + + text (reference=#/texts/42) + + text (reference=#/texts/43) + + text (reference=#/texts/44) + + text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_false.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_false.gt.md new file mode 100644 index 00000000..da005563 --- /dev/null +++ b/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_false.gt.md @@ -0,0 +1,59 @@ +title (reference=#/texts/1): Title of the Document + +text (reference=#/texts/2) + +text (reference=#/texts/3) + +section-header (level=1, reference=#/texts/4): 1. Introduction + +text (reference=#/texts/5) + +table 1 (reference=#/tables/0) + +picture 1 (reference=#/pictures/0) + +picture 2 (reference=#/pictures/1) + +text (reference=#/texts/24) + +code (reference=#/texts/25) + +text (reference=#/texts/26) + +text (reference=#/texts/28) + +formula (reference=#/texts/29) + +text (reference=#/texts/30) + +text (reference=#/texts/31) + +code (reference=#/texts/32) + +text (reference=#/texts/33) + +formula (reference=#/texts/34) + +form (reference=#/form_items/0) + +text (reference=#/texts/35) + +text (reference=#/texts/36) + +text (reference=#/texts/37) + +text (reference=#/texts/38) + +text (reference=#/texts/39) + +text (reference=#/texts/40) + +text (reference=#/texts/41) + +text (reference=#/texts/42) + +text (reference=#/texts/43) + +text (reference=#/texts/44) + +text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_true.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_true.gt.md new file mode 100644 index 00000000..da6a4cb7 --- /dev/null +++ b/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_true.gt.md @@ -0,0 +1,59 @@ +title (reference=#/texts/1): Title of the Document + +text (reference=#/texts/2) + +text (reference=#/texts/3) + + section-header (level=1, reference=#/texts/4): 1. Introduction + + text (reference=#/texts/5) + + table 1 (reference=#/tables/0) + + picture 1 (reference=#/pictures/0) + + picture 2 (reference=#/pictures/1) + + text (reference=#/texts/24) + + code (reference=#/texts/25) + + text (reference=#/texts/26) + + text (reference=#/texts/28) + + formula (reference=#/texts/29) + + text (reference=#/texts/30) + + text (reference=#/texts/31) + + code (reference=#/texts/32) + + text (reference=#/texts/33) + + formula (reference=#/texts/34) + + form (reference=#/form_items/0) + + text (reference=#/texts/35) + + text (reference=#/texts/36) + + text (reference=#/texts/37) + + text (reference=#/texts/38) + + text (reference=#/texts/39) + + text (reference=#/texts/40) + + text (reference=#/texts/41) + + text (reference=#/texts/42) + + text (reference=#/texts/43) + + text (reference=#/texts/44) + + text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_false.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_false.gt.md new file mode 100644 index 00000000..8e03ee91 --- /dev/null +++ b/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_false.gt.md @@ -0,0 +1,59 @@ +# Title of the Document (reference=#/texts/1) + +text (reference=#/texts/2) + +text (reference=#/texts/3) + +## 1. Introduction (level=1, reference=#/texts/4) + +text (reference=#/texts/5) + +table 1 (reference=#/tables/0) + +picture 1 (reference=#/pictures/0) + +picture 2 (reference=#/pictures/1) + +text (reference=#/texts/24) + +code (reference=#/texts/25) + +text (reference=#/texts/26) + +text (reference=#/texts/28) + +formula (reference=#/texts/29) + +text (reference=#/texts/30) + +text (reference=#/texts/31) + +code (reference=#/texts/32) + +text (reference=#/texts/33) + +formula (reference=#/texts/34) + +form (reference=#/form_items/0) + +text (reference=#/texts/35) + +text (reference=#/texts/36) + +text (reference=#/texts/37) + +text (reference=#/texts/38) + +text (reference=#/texts/39) + +text (reference=#/texts/40) + +text (reference=#/texts/41) + +text (reference=#/texts/42) + +text (reference=#/texts/43) + +text (reference=#/texts/44) + +text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_true.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_true.gt.md new file mode 100644 index 00000000..68f8efc7 --- /dev/null +++ b/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_true.gt.md @@ -0,0 +1,59 @@ +# Title of the Document (reference=#/texts/1) + +text (reference=#/texts/2) + +text (reference=#/texts/3) + + ## 1. Introduction (level=1, reference=#/texts/4) + + text (reference=#/texts/5) + + table 1 (reference=#/tables/0) + + picture 1 (reference=#/pictures/0) + + picture 2 (reference=#/pictures/1) + + text (reference=#/texts/24) + + code (reference=#/texts/25) + + text (reference=#/texts/26) + + text (reference=#/texts/28) + + formula (reference=#/texts/29) + + text (reference=#/texts/30) + + text (reference=#/texts/31) + + code (reference=#/texts/32) + + text (reference=#/texts/33) + + formula (reference=#/texts/34) + + form (reference=#/form_items/0) + + text (reference=#/texts/35) + + text (reference=#/texts/36) + + text (reference=#/texts/37) + + text (reference=#/texts/38) + + text (reference=#/texts/39) + + text (reference=#/texts/40) + + text (reference=#/texts/41) + + text (reference=#/texts/42) + + text (reference=#/texts/43) + + text (reference=#/texts/44) + + text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_false.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_false.gt.md new file mode 100644 index 00000000..4d406b13 --- /dev/null +++ b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_false.gt.md @@ -0,0 +1,3 @@ +title (reference=#/texts/1): Title of the Document + +section-header (level=1, reference=#/texts/4): 1. Introduction diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_true.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_true.gt.md new file mode 100644 index 00000000..fdb5b964 --- /dev/null +++ b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_true.gt.md @@ -0,0 +1,3 @@ +title (reference=#/texts/1): Title of the Document + + section-header (level=1, reference=#/texts/4): 1. Introduction diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_false.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_false.gt.md new file mode 100644 index 00000000..628c2a04 --- /dev/null +++ b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_false.gt.md @@ -0,0 +1,3 @@ +# Title of the Document (reference=#/texts/1) + +## 1. Introduction (level=1, reference=#/texts/4) diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_true.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_true.gt.md new file mode 100644 index 00000000..341caca5 --- /dev/null +++ b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_true.gt.md @@ -0,0 +1,3 @@ +# Title of the Document (reference=#/texts/1) + + ## 1. Introduction (level=1, reference=#/texts/4) diff --git a/test/test_markdown_summary.py b/test/test_markdown_summary.py index 6c3b72dc..372ff6c6 100644 --- a/test/test_markdown_summary.py +++ b/test/test_markdown_summary.py @@ -5,52 +5,79 @@ import pytest from docling_core.transforms.serializer.markdown_summary import ( + MarkdownSummaryMode, MarkdownSummaryParams, MarkdownSummarySerializer, ) from .test_docling_doc import _construct_doc +from .test_data_gen_flag import GEN_TEST_DATA + +def verify(exp_file: Path, actual: str): + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(f"{actual}\n") + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read().rstrip() + assert expected == actual + +@pytest.mark.parametrize( + "mode", + [ + MarkdownSummaryMode.OUTLINE, + MarkdownSummaryMode.TABLE_OF_CONTENTS, + ], +) @pytest.mark.parametrize("use_md_headers", [False, True]) -def test_markdown_summary_outline(use_md_headers: bool): +@pytest.mark.parametrize("indent_by_section_level", [False, True]) +def test_markdown_summary_outline( + mode: MarkdownSummaryMode, use_md_headers: bool, indent_by_section_level: bool +): # Build a representative document with title, headers, text, lists, table, and pictures doc = _construct_doc() ser = MarkdownSummarySerializer( doc=doc, - params=MarkdownSummaryParams(use_markdown_headers=use_md_headers), + params=MarkdownSummaryParams( + use_markdown_headers=use_md_headers, + mode=mode, + indent_by_section_level=indent_by_section_level, + ), ) outline = ser.serialize().text - print(outline) - - # Leading list items should not appear in the outline - assert "item of leading list" not in outline - - # Captions should be excluded from outline - assert "This is the caption of table 1." not in outline - assert "This is the caption of figure 1." not in outline - assert "This is the caption of figure 2." not in outline - - # Title and section header formatting based on params - if use_md_headers: - # Markdown-style headers - assert "# Title of the Document" in outline - assert "## 1. Introduction" in outline - # Ensure we don't get the verbose label style when using MD headers - assert "title (reference=" not in outline.splitlines()[0] - else: - # Verbose outline lines with references - first_line = outline.splitlines()[0] - assert first_line.startswith("title (reference=") and first_line.endswith( - "): Title of the Document" - ) - # Section header line contains level and reference - assert "section-header (level=1, reference=" in outline - - # Tables and pictures should be numbered and listed with references - assert "table 1 (reference=" in outline - assert "picture 1 (reference=" in outline + # Compare with or generate ground-truth output + root_dir = Path("./test/data/doc") + exp_path = ( + root_dir + / f"constructed_mdsum_{mode.value}_mdhdr_{str(use_md_headers).lower()}_indent_{str(indent_by_section_level).lower()}.gt.md" + ) + verify(exp_file=exp_path, actual=outline) + +@pytest.mark.parametrize("use_md_headers", [False, True]) +def test_markdown_summary_indentation(use_md_headers: bool): + # Build a representative document + doc = _construct_doc() + + ser = MarkdownSummarySerializer( + doc=doc, + params=MarkdownSummaryParams( + use_markdown_headers=use_md_headers, + indent_by_section_level=True, + indent_size=2, + ), + ) + + outline = ser.serialize().text + # Compare with or generate ground-truth output for indentation-specific case + root_dir = Path("./test/data/doc") + exp_path = ( + root_dir + / f"constructed_mdsum_indent_mdhdr_{str(use_md_headers).lower()}_indent_true_size_2.gt.md" + ) + verify(exp_file=exp_path, actual=outline) From c6ace9a84ec7cc08019a2b062cd26622be61c008 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 23 Sep 2025 14:30:19 +0200 Subject: [PATCH 06/14] passed all the pre-commit hooks Signed-off-by: Peter Staar --- .../transforms/serializer/markdown_summary.py | 354 +++++++++++------- docling_core/types/doc/document.py | 6 +- test/test_markdown_summary.py | 5 +- 3 files changed, 234 insertions(+), 131 deletions(-) diff --git a/docling_core/transforms/serializer/markdown_summary.py b/docling_core/transforms/serializer/markdown_summary.py index bbfd3ac6..60403441 100644 --- a/docling_core/transforms/serializer/markdown_summary.py +++ b/docling_core/transforms/serializer/markdown_summary.py @@ -1,13 +1,18 @@ -from typing import Any, Optional, Union -from pathlib import Path +"""Markdown document summary serializers (outline and TOC). + +This module provides a Markdown-focused serializer that emits a compact +document outline or a table of contents derived from a Docling document. +""" + from enum import Enum +from pathlib import Path +from typing import Any, Optional, Union from pydantic import AnyUrl from typing_extensions import override from docling_core.transforms.serializer.base import ( BaseAnnotationSerializer, - BaseDocSerializer, BaseFallbackSerializer, BaseFormSerializer, BaseInlineSerializer, @@ -38,8 +43,8 @@ CodeItem, DocItem, DocItemLabel, - DoclingDocument, FormItem, + GroupItem, ListGroup, ListItem, NodeItem, @@ -50,16 +55,19 @@ TitleItem, ) + class MarkdownSummaryMode(str, Enum): - + """Display mode for document summary output.""" + OUTLINE = "outline" TABLE_OF_CONTENTS = "table_of_contents" - + + class MarkdownSummaryParams(CommonParams): """Markdown-specific serialization parameters for outline.""" mode: MarkdownSummaryMode = MarkdownSummaryMode.OUTLINE - + use_markdown_headers: bool = False add_label_counter: bool = False @@ -72,7 +80,7 @@ class MarkdownSummaryParams(CommonParams): indent_size: int = 2 toc_labels: list[DocItemLabel] = [DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER] - + class MarkdownSummarySerializer(DocSerializer): """Markdown-specific document summary serializer.""" @@ -94,14 +102,17 @@ class MarkdownSummarySerializer(DocSerializer): @override def serialize_bold(self, text: str, **kwargs: Any) -> str: + """Apply Markdown bold formatting to ``text``.""" return f"**{text}**" @override def serialize_italic(self, text: str, **kwargs: Any) -> str: + """Apply Markdown italic formatting to ``text``.""" return f"*{text}*" @override def serialize_strikethrough(self, text: str, **kwargs: Any) -> str: + """Apply Markdown strikethrough formatting to ``text``.""" return f"~~{text}~~" @override @@ -111,13 +122,17 @@ def serialize_hyperlink( hyperlink: Union[AnyUrl, Path], **kwargs: Any, ) -> str: + """Render a Markdown hyperlink around ``text``. + + Returns a ``[text](href)`` string with the provided URL/path. + """ return f"[{text}]({str(hyperlink)})" @override def requires_page_break(self) -> bool: """Whether to add page breaks.""" return False - + @override def get_parts( self, @@ -126,7 +141,8 @@ def get_parts( ) -> list[SerializationResult]: """Return a single part containing the document (or subtree) outline.""" return self._create_document_outline(root=item, **kwargs) - #return [create_ser_result(text=outline, span_source=[])] if outline else [] + + # return [create_ser_result(text=outline, span_source=[])] if outline else [] @override def serialize_doc( @@ -135,9 +151,170 @@ def serialize_doc( parts: list[SerializationResult], **kwargs: Any, ) -> SerializationResult: + """Serialize a document summary from pre-rendered parts.""" text_res = "\n\n".join([p.text for p in parts if p.text]) return create_ser_result(text=text_res, span_source=parts) + # ------------------------- + # Helper methods (internal) + # ------------------------- + + def _next_idx( + self, *, lbl: DocItemLabel, label_counter: dict[DocItemLabel, int] + ) -> int: + label_counter[lbl] = label_counter.get(lbl, 0) + 1 + return label_counter[lbl] + + def _include_label( + self, *, params: MarkdownSummaryParams, lbl: DocItemLabel + ) -> bool: + """Return True if label should be included (esp. for TOC mode).""" + if ( + params.mode == MarkdownSummaryMode.TABLE_OF_CONTENTS + and lbl not in params.toc_labels + ): + return False + return True + + def _is_node_excluded( + self, + *, + node: NodeItem, + excluded: set[str], + params: MarkdownSummaryParams, + ) -> bool: + """Centralize exclusion logic applied to nodes in the outline.""" + if isinstance(node, DocItem): + if node.self_ref in excluded: + return True + if ( + isinstance(node, TextItem) + and node.self_ref in self._captions_of_some_item + ): + return True + if not self._include_label(params=params, lbl=node.label): + return True + return False + + def _compose_node_label( + self, + *, + node: NodeItem, + params: MarkdownSummaryParams, + label_counter: dict[DocItemLabel, int], + ) -> str: + """Compute the textual label for a node (without refs). + + - When ``add_label_counter`` is True, add counters for non-table/picture + DocItems. + - Tables/pictures are numbered separately when building the final line. + - For groups, expose the raw normalized label but do not emit a line. + """ + node_label = "" + if ( + params.add_label_counter + and isinstance(node, DocItem) + and not isinstance(node, (TableItem, PictureItem)) + ): + base = str(node.label).replace("_", "-") + lbl_cnt = self._next_idx(lbl=node.label, label_counter=label_counter) + node_label = f"{base} {lbl_cnt}" + elif isinstance(node, (DocItem, GroupItem)): + node_label = str(node.label).replace("_", "-") + return node_label + + def _ref_part(self, *, node: NodeItem, params: MarkdownSummaryParams) -> str: + return f" (reference={node.self_ref})" if params.add_references else "" + + def _strip_md_header_prefix(self, text: str) -> str: + stripped = text.lstrip() + while stripped.startswith("#"): + stripped = stripped.lstrip("#").lstrip() + return stripped + + def _line_for_title( + self, + *, + node: TitleItem, + params: MarkdownSummaryParams, + node_label: str, + ref_part: str, + ) -> str: + raw_text = self.text_serializer.serialize( + item=node, doc_serializer=self, doc=self.doc + ).text + if params.use_markdown_headers: + text = raw_text.lstrip() + return f"{text}{ref_part}" + text = raw_text.lstrip().lstrip("# ") if raw_text.startswith("#") else raw_text + return ( + f"{node_label}{ref_part}: {text}" + if params.add_references + else f"{node_label}: {text}" + ) + + def _line_for_section_header( + self, + *, + node: SectionHeaderItem, + params: MarkdownSummaryParams, + node_label: str, + ) -> str: + raw_text = self.text_serializer.serialize( + item=node, doc_serializer=self, doc=self.doc + ).text + if params.use_markdown_headers: + text = raw_text.lstrip() + if params.add_references: + return f"{text} (level={node.level}, reference={node.self_ref})" + return f"{text} (level={node.level})" + stripped = self._strip_md_header_prefix(raw_text) + if params.add_references: + return f"{node_label} (level={node.level}, reference={node.self_ref}): {stripped}" + return f"{node_label} (level={node.level}): {stripped}" + + def _line_for_simple_label(self, *, node_label: str, ref_part: str) -> str: + return f"{node_label}{ref_part}" + + def _line_for_table( + self, *, node_label: str, ref_part: str, label_counter: dict[DocItemLabel, int] + ) -> str: + lbl_cnt = self._next_idx(lbl=DocItemLabel.TABLE, label_counter=label_counter) + return f"{node_label} {lbl_cnt}{ref_part}" + + def _line_for_picture( + self, *, node_label: str, ref_part: str, label_counter: dict[DocItemLabel, int] + ) -> str: + lbl_cnt = self._next_idx(lbl=DocItemLabel.PICTURE, label_counter=label_counter) + return f"{node_label} {lbl_cnt}{ref_part}" + + def _get_summary(self, *, node: NodeItem, params: MarkdownSummaryParams) -> str: + if ( + params.add_summary + and (node.summary is not None) + and isinstance(node.summary, str) + ): + return node.summary + return "" + + def _indent_line( + self, + *, + line: str, + node: NodeItem, + current_section_level: int, + params: MarkdownSummaryParams, + ) -> str: + if not line: + return line + if not params.indent_by_section_level: + return line + indent_level = ( + node.level if isinstance(node, SectionHeaderItem) else current_section_level + ) + indent = " " * (params.indent_size * indent_level) + return f"{indent}{line}" + def _create_document_outline( self, *, @@ -148,145 +325,68 @@ def _create_document_outline( params = self.params.merge_with_patch(patch=kwargs) excluded = self.get_excluded_refs(**kwargs) - # Per-label counters; used consistently when params.add_label_counter is True - # and always for table/picture numbering. label_counter: dict[DocItemLabel, int] = {} - lines: list[str] = [] visited: set[str] = set() - result: list[SerializationResult] = [] - - # Track latest section header level for indentation current_section_level: int = 0 - # Helper to increment and fetch the counter for a given label - def _next_idx(lbl: DocItemLabel) -> int: - label_counter[lbl] = label_counter.get(lbl, 0) + 1 - return label_counter[lbl] - - # Helper to identify if the label should be included in the table-of-contents - def _include(lbl: DocItemLabel) -> int: - if params.mode==MarkdownSummaryMode.TABLE_OF_CONTENTS and \ - (lbl not in params.toc_labels): - return False - - return True - - # Iterate depth-first with groups, similar to MarkdownSerializer - for node, level in self.doc.iterate_items(root=root, with_groups=True): + for node, _level in self.doc.iterate_items(root=root, with_groups=True): if node.self_ref in visited: continue - visited.add(node.self_ref) - if not _include(lbl=node.label): - continue - - summary = "" - if params.add_summary and \ - (node.summary is not None) and \ - isinstance(node.summary, str): - summary = node.summary - # Skip list items in outline if isinstance(node, ListItem): continue - # Respect excluded refs - if isinstance(node, DocItem): - if node.self_ref in excluded: - continue - if isinstance(node, TextItem) and node.self_ref in self._captions_of_some_item: - continue - - line:str = "" + # Respect exclusion logic + if self._is_node_excluded(node=node, excluded=excluded, params=params): + continue - # Base label string (normalize underscores to hyphens) - node_label = str(node.label).replace("_", "-") - if params.add_label_counter and not isinstance(node, (TableItem, PictureItem)): - # Apply generic counters to non-table/picture items - node_label = f"{node_label} {_next_idx(node.label)}" + summary = self._get_summary(node=node, params=params) + node_label = self._compose_node_label( + node=node, params=params, label_counter=label_counter + ) + ref_part = self._ref_part(node=node, params=params) - # Build optional reference snippet only when enabled - ref_part = f" (reference={node.self_ref})" if params.add_references else "" - + line = "" if isinstance(node, TitleItem): - - raw_text = self.text_serializer.serialize( - item=node, doc_serializer=self, doc=self.doc - ).text - - if params.use_markdown_headers: - # raw_text already includes the heading marker - text = raw_text.lstrip() - line = f"{text}{ref_part}" - else: - # strip leading markdown header markers for verbose representation - text = raw_text.lstrip().lstrip("# ") if raw_text.startswith("#") else raw_text - if params.add_references: - line = f"{node_label}{ref_part}: {text}" - else: - line = f"{node_label}: {text}" - + line = self._line_for_title( + node=node, params=params, node_label=node_label, ref_part=ref_part + ) elif isinstance(node, SectionHeaderItem): - - raw_text = self.text_serializer.serialize( - item=node, doc_serializer=self, doc=self.doc - ).text - - if params.use_markdown_headers: - # raw_text already includes the correct number of '#' - text = raw_text.lstrip() - if params.add_references: - line = f"{text} (level={node.level}, reference={node.self_ref})" - else: - line = f"{text} (level={node.level})" - else: - # strip leading markdown header markers for verbose representation - stripped = raw_text.lstrip() - while stripped.startswith("#"): - stripped = stripped.lstrip("#").lstrip() - text = stripped - if params.add_references: - line = f"{node_label} (level={node.level}, reference={node.self_ref}): {text}" - else: - line = f"{node_label} (level={node.level}): {text}" - - # Update current section level for subsequent items + line = self._line_for_section_header( + node=node, params=params, node_label=node_label + ) current_section_level = node.level - elif isinstance(node, ListGroup): - # Skip listing list groups in summary to avoid leading list noise - line = "" - - elif isinstance(node, TextItem): - line = f"{node_label}{ref_part}" - - elif isinstance(node, FormItem): - line = f"{node_label}{ref_part}" - - elif isinstance(node, CodeItem): - line = f"{node_label}{ref_part}" - + line = "" # intentionally skip + elif isinstance(node, (TextItem, FormItem, CodeItem)): + line = self._line_for_simple_label( + node_label=node_label, ref_part=ref_part + ) elif isinstance(node, TableItem): - # Tables are always numbered in the summary - line = f"{node_label} {_next_idx(DocItemLabel.TABLE)}{ref_part}" - + line = self._line_for_table( + node_label=node_label, + ref_part=ref_part, + label_counter=label_counter, + ) elif isinstance(node, PictureItem): - # Pictures are always numbered in the summary - line = f"{node_label} {_next_idx(DocItemLabel.PICTURE)}{ref_part}" + line = self._line_for_picture( + node_label=node_label, + ref_part=ref_part, + label_counter=label_counter, + ) - if len(summary)>0: - line += f" (summary={summary})" + if summary: + line = f"{line} (summary={summary})" if line else line - # Apply indentation based on latest section level if enabled - if params.indent_by_section_level: - indent_level = current_section_level - # For a section-header, indent by its own level - if isinstance(node, SectionHeaderItem): - indent_level = node.level - indent = " " * (params.indent_size * indent_level) - line = f"{indent}{line}" if line else line + line = self._indent_line( + line=line, + node=node, + current_section_level=current_section_level, + params=params, + ) if line: result.append( @@ -295,5 +395,5 @@ def _include(lbl: DocItemLabel) -> int: span_source=node if isinstance(node, DocItem) else [], ) ) - + return result diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 32d7c533..7ad20a0a 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -952,8 +952,10 @@ class NodeItem(BaseModel): model_config = ConfigDict(extra="forbid") - summary: Optional[str] = Field(default=None, exclude=True) # optional, not serialized - + summary: Optional[str] = Field( + default=None, exclude=True + ) # optional, not serialized + def get_ref(self) -> RefItem: """get_ref.""" return RefItem(cref=self.self_ref) diff --git a/test/test_markdown_summary.py b/test/test_markdown_summary.py index 372ff6c6..ddeb6e74 100644 --- a/test/test_markdown_summary.py +++ b/test/test_markdown_summary.py @@ -10,9 +10,8 @@ MarkdownSummarySerializer, ) -from .test_docling_doc import _construct_doc - from .test_data_gen_flag import GEN_TEST_DATA +from .test_docling_doc import _construct_doc def verify(exp_file: Path, actual: str): @@ -24,6 +23,7 @@ def verify(exp_file: Path, actual: str): expected = f.read().rstrip() assert expected == actual + @pytest.mark.parametrize( "mode", [ @@ -58,6 +58,7 @@ def test_markdown_summary_outline( ) verify(exp_file=exp_path, actual=outline) + @pytest.mark.parametrize("use_md_headers", [False, True]) def test_markdown_summary_indentation(use_md_headers: bool): # Build a representative document From 01bb9ee733f478d56ac64d62a627253c6b6f135e Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 23 Sep 2025 15:35:22 +0200 Subject: [PATCH 07/14] fixing some gt-data Signed-off-by: Peter Staar --- docling_core/types/doc/document.py | 4 +--- test/data/docling_document/unit/CodeItem.yaml | 15 ++++++++------- test/data/docling_document/unit/FloatingItem.yaml | 3 ++- test/data/docling_document/unit/FormItem.yaml | 3 ++- test/data/docling_document/unit/FormulaItem.yaml | 9 +++++---- test/data/docling_document/unit/KeyValueItem.yaml | 3 ++- test/data/docling_document/unit/ListItem.yaml | 9 +++++---- test/data/docling_document/unit/PictureItem.yaml | 3 ++- .../docling_document/unit/SectionHeaderItem.yaml | 9 +++++---- test/data/docling_document/unit/TableItem.yaml | 5 +++-- test/data/docling_document/unit/TextItem.yaml | 9 +++++---- test/data/docling_document/unit/TitleItem.yaml | 9 +++++---- test/test_docling_doc.py | 11 ++++++----- test/test_json_schema_to_search_mapper.py | 2 +- test/test_otsl_table_export.py | 6 +++--- 15 files changed, 55 insertions(+), 45 deletions(-) diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 7ad20a0a..51237db6 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -952,9 +952,7 @@ class NodeItem(BaseModel): model_config = ConfigDict(extra="forbid") - summary: Optional[str] = Field( - default=None, exclude=True - ) # optional, not serialized + summary: Optional[str] = None # serialized only when not None def get_ref(self) -> RefItem: """get_ref.""" diff --git a/test/data/docling_document/unit/CodeItem.yaml b/test/data/docling_document/unit/CodeItem.yaml index 09995640..9d0aee3d 100644 --- a/test/data/docling_document/unit/CodeItem.yaml +++ b/test/data/docling_document/unit/CodeItem.yaml @@ -1,15 +1,16 @@ -children: [] captions: [] -footnotes: [] -references: [] -image: null +children: [] code_language: Python content_layer: body +footnotes: [] +formatting: null +hyperlink: null +image: null label: code orig: whatever parent: null prov: [] +references: [] self_ref: '#' -text: print(Hello World!) -formatting: null -hyperlink: null +summary: null +text: print(Hello World!) \ No newline at end of file diff --git a/test/data/docling_document/unit/FloatingItem.yaml b/test/data/docling_document/unit/FloatingItem.yaml index 21beef40..2338cc35 100644 --- a/test/data/docling_document/unit/FloatingItem.yaml +++ b/test/data/docling_document/unit/FloatingItem.yaml @@ -1,5 +1,6 @@ captions: [] children: [] +content_layer: body footnotes: [] image: null label: text @@ -7,4 +8,4 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body \ No newline at end of file +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/FormItem.yaml b/test/data/docling_document/unit/FormItem.yaml index af7a61e1..39818d6d 100644 --- a/test/data/docling_document/unit/FormItem.yaml +++ b/test/data/docling_document/unit/FormItem.yaml @@ -28,4 +28,5 @@ label: form parent: null prov: [] references: [] -self_ref: '#' \ No newline at end of file +self_ref: '#' +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/FormulaItem.yaml b/test/data/docling_document/unit/FormulaItem.yaml index 25057908..680b8acb 100644 --- a/test/data/docling_document/unit/FormulaItem.yaml +++ b/test/data/docling_document/unit/FormulaItem.yaml @@ -1,10 +1,11 @@ children: [] +content_layer: body +formatting: null +hyperlink: null label: formula orig: whatever parent: null prov: [] self_ref: '#' -text: E=mc^2 -content_layer: body -formatting: null -hyperlink: null +summary: null +text: E=mc^2 \ No newline at end of file diff --git a/test/data/docling_document/unit/KeyValueItem.yaml b/test/data/docling_document/unit/KeyValueItem.yaml index 219e951e..09a31ed7 100644 --- a/test/data/docling_document/unit/KeyValueItem.yaml +++ b/test/data/docling_document/unit/KeyValueItem.yaml @@ -28,4 +28,5 @@ label: key_value_region parent: null prov: [] references: [] -self_ref: '#' \ No newline at end of file +self_ref: '#' +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/ListItem.yaml b/test/data/docling_document/unit/ListItem.yaml index 20d8de90..ebcc755a 100644 --- a/test/data/docling_document/unit/ListItem.yaml +++ b/test/data/docling_document/unit/ListItem.yaml @@ -1,12 +1,13 @@ children: [] +content_layer: body enumerated: true +formatting: null +hyperlink: null label: list_item marker: (1) orig: whatever parent: null prov: [] self_ref: '#' -text: whatever -content_layer: body -formatting: null -hyperlink: null +summary: null +text: whatever \ No newline at end of file diff --git a/test/data/docling_document/unit/PictureItem.yaml b/test/data/docling_document/unit/PictureItem.yaml index ffe342a6..f4f07d3e 100644 --- a/test/data/docling_document/unit/PictureItem.yaml +++ b/test/data/docling_document/unit/PictureItem.yaml @@ -1,6 +1,7 @@ annotations: [] captions: [] children: [] +content_layer: body footnotes: [] image: null label: picture @@ -8,4 +9,4 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body \ No newline at end of file +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/SectionHeaderItem.yaml b/test/data/docling_document/unit/SectionHeaderItem.yaml index 68f641f9..67f662e6 100644 --- a/test/data/docling_document/unit/SectionHeaderItem.yaml +++ b/test/data/docling_document/unit/SectionHeaderItem.yaml @@ -1,11 +1,12 @@ children: [] +content_layer: body +formatting: null +hyperlink: null label: section_header level: 2 orig: whatever parent: null prov: [] self_ref: '#' -text: whatever -content_layer: body -formatting: null -hyperlink: null +summary: null +text: whatever \ No newline at end of file diff --git a/test/data/docling_document/unit/TableItem.yaml b/test/data/docling_document/unit/TableItem.yaml index ae08e00e..778cb312 100644 --- a/test/data/docling_document/unit/TableItem.yaml +++ b/test/data/docling_document/unit/TableItem.yaml @@ -1,5 +1,7 @@ +annotations: [] captions: [] children: [] +content_layer: body data: grid: - - bbox: null @@ -192,5 +194,4 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body -annotations: [] +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/TextItem.yaml b/test/data/docling_document/unit/TextItem.yaml index 1f72637a..ee8247eb 100644 --- a/test/data/docling_document/unit/TextItem.yaml +++ b/test/data/docling_document/unit/TextItem.yaml @@ -1,10 +1,11 @@ children: [] +content_layer: body +formatting: null +hyperlink: null label: text orig: whatever parent: null prov: [] self_ref: '#' -text: whatever -content_layer: body -formatting: null -hyperlink: null +summary: null +text: whatever \ No newline at end of file diff --git a/test/data/docling_document/unit/TitleItem.yaml b/test/data/docling_document/unit/TitleItem.yaml index 8e2a3dea..02d61247 100644 --- a/test/data/docling_document/unit/TitleItem.yaml +++ b/test/data/docling_document/unit/TitleItem.yaml @@ -1,10 +1,11 @@ children: [] +content_layer: body +formatting: null +hyperlink: null label: title orig: whatever parent: null prov: [] self_ref: '#' -text: whatever -content_layer: body -formatting: null -hyperlink: null +summary: null +text: whatever \ No newline at end of file diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index d5ddb4dc..c60d9894 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -415,14 +415,15 @@ def read(name: str): def verify(dc, obj): pred = serialise(obj).strip() - if dc is KeyValueItem or dc is FormItem: - write(dc.__name__, pred) + # if dc is KeyValueItem or dc is FormItem or dc is TextItem: + # write(dc.__name__, pred) pred = yaml.safe_load(pred) - # print(f"\t{dc.__name__}:\n {pred}") - gold = read(dc.__name__) + gold = read(dc.__name__) + # print(f"\t{dc.__name__}:\n {gold}") + assert pred == gold, f"pred!=gold for {dc.__name__}" # Iterate over the derived classes of the BaseClass @@ -727,7 +728,7 @@ def _test_export_methods( second_page = first_page + 1 if second_page in doc.pages: # Only test if document has at least 2 pages dt_pages_pred = doc.export_to_doctags(pages={first_page, second_page}) - print(dt_pages_pred) + # print(dt_pages_pred) _verify_regression_test(dt_pages_pred, filename=filename, ext="pages.dt") # Test Tables export ... diff --git a/test/test_json_schema_to_search_mapper.py b/test/test_json_schema_to_search_mapper.py index e52984d4..2973fa67 100644 --- a/test/test_json_schema_to_search_mapper.py +++ b/test/test_json_schema_to_search_mapper.py @@ -60,7 +60,7 @@ def test_json_schema_to_search_mapper_0(): def test_json_schema_to_search_mapper_1(): """Test the class JsonSchemaToSearchMapper.""" s = Record.model_json_schema() - print(json.dumps(s, indent=2)) + # print(json.dumps(s, indent=2)) _meta = { "aliases": [".production", "ccc"], diff --git a/test/test_otsl_table_export.py b/test/test_otsl_table_export.py index 4b3534f3..c6b4e6b3 100644 --- a/test/test_otsl_table_export.py +++ b/test/test_otsl_table_export.py @@ -275,9 +275,9 @@ def test_table_export_to_otsl(): add_cell_location=False, add_cell_text=False, doc=doc ) print_friendly = otsl_string.split("") - print("OTSL out:") - for s in print_friendly: - print(s) + # print("OTSL out:") + # for s in print_friendly: + # print(s) assert ( otsl_string == "" From 830ba5c3f40c594ba31466360694ffed3445ce49 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 23 Sep 2025 15:37:44 +0200 Subject: [PATCH 08/14] refactord the code Signed-off-by: Peter Staar --- docs/DoclingDocument.json | 158 +++++++++++++++++++++++++++++++++ test/test_docling_doc.py | 2 +- test/test_otsl_table_export.py | 2 +- 3 files changed, 160 insertions(+), 2 deletions(-) diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 305f5a9b..96b84fe9 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -194,6 +194,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "code", "default": "code", @@ -475,6 +487,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "form", "default": "form", @@ -598,6 +622,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "formula", "default": "formula", @@ -807,6 +843,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -912,6 +960,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -962,6 +1022,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "key_value_region", "default": "key_value_region", @@ -1054,6 +1126,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -1104,6 +1188,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "list_item", "default": "list_item", @@ -1341,6 +1437,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "default": "picture", "enum": [ @@ -1842,6 +1950,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "section_header", "default": "section_header", @@ -2065,6 +2185,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "default": "table", "enum": [ @@ -2182,6 +2314,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "enum": [ "caption", @@ -2285,6 +2429,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "title", "default": "title", @@ -2382,6 +2538,7 @@ "parent": null, "children": [], "content_layer": "furniture", + "summary": null, "name": "_root_", "label": "unspecified" }, @@ -2394,6 +2551,7 @@ "parent": null, "children": [], "content_layer": "body", + "summary": null, "name": "_root_", "label": "unspecified" } diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index c60d9894..6f8fbd99 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -423,7 +423,7 @@ def verify(dc, obj): gold = read(dc.__name__) # print(f"\t{dc.__name__}:\n {gold}") - + assert pred == gold, f"pred!=gold for {dc.__name__}" # Iterate over the derived classes of the BaseClass diff --git a/test/test_otsl_table_export.py b/test/test_otsl_table_export.py index c6b4e6b3..dde0744c 100644 --- a/test/test_otsl_table_export.py +++ b/test/test_otsl_table_export.py @@ -274,7 +274,7 @@ def test_table_export_to_otsl(): otsl_string = doc.tables[0].export_to_otsl( add_cell_location=False, add_cell_text=False, doc=doc ) - print_friendly = otsl_string.split("") + otsl_string.split("") # print("OTSL out:") # for s in print_friendly: # print(s) From f376c01e2a4481babcbb8fbc1303831d8325b4f6 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 24 Sep 2025 09:43:41 +0200 Subject: [PATCH 09/14] refactored the markdown summary in experimental for now Signed-off-by: Peter Staar --- docling_core/experimental/serializer/__init__.py | 5 +++++ .../serializer/markdown_summary.py | 0 test/test_doc_schema.py | 6 +++--- test/test_markdown_summary.py | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) create mode 100644 docling_core/experimental/serializer/__init__.py rename docling_core/{transforms => experimental}/serializer/markdown_summary.py (100%) diff --git a/docling_core/experimental/serializer/__init__.py b/docling_core/experimental/serializer/__init__.py new file mode 100644 index 00000000..5c450a0e --- /dev/null +++ b/docling_core/experimental/serializer/__init__.py @@ -0,0 +1,5 @@ +"""Experimental serializers for docling-core. + +This package contains experimental serialization utilities (e.g., Markdown +summaries) that may change without notice. +""" diff --git a/docling_core/transforms/serializer/markdown_summary.py b/docling_core/experimental/serializer/markdown_summary.py similarity index 100% rename from docling_core/transforms/serializer/markdown_summary.py rename to docling_core/experimental/serializer/markdown_summary.py diff --git a/test/test_doc_schema.py b/test/test_doc_schema.py index 109e7c88..acc560b4 100644 --- a/test/test_doc_schema.py +++ b/test/test_doc_schema.py @@ -35,8 +35,8 @@ def test_ccs_document(): # try as well as dictionary doc = json.loads(file_json) CCSDocument.model_validate(doc) - except ValidationError as e: - print(f"Validation error in file {filename}:\n{e.json()}") + except ValidationError: + # print(f"Validation error in file {filename}:\n{e.json()}") raise # check doc-error-1 is invalid in logs @@ -47,7 +47,7 @@ def test_ccs_document(): assert False, f"Data in file {filename} should be invalid for CCSDocument model" except ValidationError as e: for error in e.errors(): - print(type(error)) + # print(type(error)) assert all( item in error["loc"] for item in ("description", "logs") ), f"Data in file {filename} should fail in logs" diff --git a/test/test_markdown_summary.py b/test/test_markdown_summary.py index ddeb6e74..d3f5b4b3 100644 --- a/test/test_markdown_summary.py +++ b/test/test_markdown_summary.py @@ -4,7 +4,7 @@ import pytest -from docling_core.transforms.serializer.markdown_summary import ( +from docling_core.experimental.serializer.markdown_summary import ( MarkdownSummaryMode, MarkdownSummaryParams, MarkdownSummarySerializer, From 93a341c17d9af1ed0cd5db0fb78a0abd8c074bb5 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 24 Sep 2025 09:57:01 +0200 Subject: [PATCH 10/14] refactored the code to make it inherit from MarkdownSerializer Signed-off-by: Peter Staar --- .../serializer/markdown_summary.py | 104 +++--------------- 1 file changed, 15 insertions(+), 89 deletions(-) diff --git a/docling_core/experimental/serializer/markdown_summary.py b/docling_core/experimental/serializer/markdown_summary.py index 60403441..e18e2a74 100644 --- a/docling_core/experimental/serializer/markdown_summary.py +++ b/docling_core/experimental/serializer/markdown_summary.py @@ -5,39 +5,15 @@ """ from enum import Enum -from pathlib import Path -from typing import Any, Optional, Union +from typing import Any, Optional -from pydantic import AnyUrl from typing_extensions import override -from docling_core.transforms.serializer.base import ( - BaseAnnotationSerializer, - BaseFallbackSerializer, - BaseFormSerializer, - BaseInlineSerializer, - BaseKeyValueSerializer, - BaseListSerializer, - BasePictureSerializer, - BaseTableSerializer, - BaseTextSerializer, - SerializationResult, -) -from docling_core.transforms.serializer.common import ( - CommonParams, - DocSerializer, - create_ser_result, -) +from docling_core.transforms.serializer.base import SerializationResult +from docling_core.transforms.serializer.common import create_ser_result from docling_core.transforms.serializer.markdown import ( - MarkdownAnnotationSerializer, - MarkdownFallbackSerializer, - MarkdownFormSerializer, - MarkdownInlineSerializer, - MarkdownKeyValueSerializer, - MarkdownListSerializer, - MarkdownPictureSerializer, - MarkdownTableSerializer, - MarkdownTextSerializer, + MarkdownDocSerializer, + MarkdownParams, ) from docling_core.types.doc import ( CodeItem, @@ -63,8 +39,11 @@ class MarkdownSummaryMode(str, Enum): TABLE_OF_CONTENTS = "table_of_contents" -class MarkdownSummaryParams(CommonParams): - """Markdown-specific serialization parameters for outline.""" +class MarkdownSummaryParams(MarkdownParams): + """Markdown-specific serialization parameters for outline. + + Inherits MarkdownParams to retain Markdown behaviors (escaping, links, etc.). + """ mode: MarkdownSummaryMode = MarkdownSummaryMode.OUTLINE @@ -82,57 +61,15 @@ class MarkdownSummaryParams(CommonParams): toc_labels: list[DocItemLabel] = [DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER] -class MarkdownSummarySerializer(DocSerializer): - """Markdown-specific document summary serializer.""" - - # Provide required serializer attributes to satisfy DocSerializer’s model - text_serializer: BaseTextSerializer = MarkdownTextSerializer() - table_serializer: BaseTableSerializer = MarkdownTableSerializer() - picture_serializer: BasePictureSerializer = MarkdownPictureSerializer() - key_value_serializer: BaseKeyValueSerializer = MarkdownKeyValueSerializer() - form_serializer: BaseFormSerializer = MarkdownFormSerializer() - fallback_serializer: BaseFallbackSerializer = MarkdownFallbackSerializer() +class MarkdownSummarySerializer(MarkdownDocSerializer): + """Markdown-specific document summary serializer. - list_serializer: BaseListSerializer = MarkdownListSerializer() - inline_serializer: BaseInlineSerializer = MarkdownInlineSerializer() - - annotation_serializer: BaseAnnotationSerializer = MarkdownAnnotationSerializer() + Inherits MarkdownDocSerializer to reuse Markdown formatting/post-processing + and sub-serializers; overrides only the parts selection logic. + """ params: MarkdownSummaryParams = MarkdownSummaryParams() - @override - def serialize_bold(self, text: str, **kwargs: Any) -> str: - """Apply Markdown bold formatting to ``text``.""" - return f"**{text}**" - - @override - def serialize_italic(self, text: str, **kwargs: Any) -> str: - """Apply Markdown italic formatting to ``text``.""" - return f"*{text}*" - - @override - def serialize_strikethrough(self, text: str, **kwargs: Any) -> str: - """Apply Markdown strikethrough formatting to ``text``.""" - return f"~~{text}~~" - - @override - def serialize_hyperlink( - self, - text: str, - hyperlink: Union[AnyUrl, Path], - **kwargs: Any, - ) -> str: - """Render a Markdown hyperlink around ``text``. - - Returns a ``[text](href)`` string with the provided URL/path. - """ - return f"[{text}]({str(hyperlink)})" - - @override - def requires_page_break(self) -> bool: - """Whether to add page breaks.""" - return False - @override def get_parts( self, @@ -144,17 +81,6 @@ def get_parts( # return [create_ser_result(text=outline, span_source=[])] if outline else [] - @override - def serialize_doc( - self, - *, - parts: list[SerializationResult], - **kwargs: Any, - ) -> SerializationResult: - """Serialize a document summary from pre-rendered parts.""" - text_res = "\n\n".join([p.text for p in parts if p.text]) - return create_ser_result(text=text_res, span_source=parts) - # ------------------------- # Helper methods (internal) # ------------------------- From 99a041f3a2120389386ab46d3b1e3c413a1ed6d0 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 24 Sep 2025 11:28:45 +0200 Subject: [PATCH 11/14] updated the tests without indent Signed-off-by: Peter Staar --- .../serializer/markdown_summary.py | 31 ---------- ...ndent_mdhdr_false_indent_true_size_2.gt.md | 59 ------------------- ...indent_mdhdr_true_indent_true_size_2.gt.md | 59 ------------------- ...nstructed_mdsum_outline_mdhdr_false.gt.md} | 0 ...dsum_outline_mdhdr_false_indent_true.gt.md | 59 ------------------- ...onstructed_mdsum_outline_mdhdr_true.gt.md} | 0 ...mdsum_outline_mdhdr_true_indent_true.gt.md | 59 ------------------- ...mdsum_table_of_contents_mdhdr_false.gt.md} | 0 ..._of_contents_mdhdr_false_indent_true.gt.md | 3 - ..._mdsum_table_of_contents_mdhdr_true.gt.md} | 0 ...e_of_contents_mdhdr_true_indent_true.gt.md | 3 - test/test_markdown_summary.py | 30 +--------- 12 files changed, 2 insertions(+), 301 deletions(-) delete mode 100644 test/data/doc/constructed_mdsum_indent_mdhdr_false_indent_true_size_2.gt.md delete mode 100644 test/data/doc/constructed_mdsum_indent_mdhdr_true_indent_true_size_2.gt.md rename test/data/doc/{constructed_mdsum_outline_mdhdr_false_indent_false.gt.md => constructed_mdsum_outline_mdhdr_false.gt.md} (100%) delete mode 100644 test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_true.gt.md rename test/data/doc/{constructed_mdsum_outline_mdhdr_true_indent_false.gt.md => constructed_mdsum_outline_mdhdr_true.gt.md} (100%) delete mode 100644 test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_true.gt.md rename test/data/doc/{constructed_mdsum_table_of_contents_mdhdr_false_indent_false.gt.md => constructed_mdsum_table_of_contents_mdhdr_false.gt.md} (100%) delete mode 100644 test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_true.gt.md rename test/data/doc/{constructed_mdsum_table_of_contents_mdhdr_true_indent_false.gt.md => constructed_mdsum_table_of_contents_mdhdr_true.gt.md} (100%) delete mode 100644 test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_true.gt.md diff --git a/docling_core/experimental/serializer/markdown_summary.py b/docling_core/experimental/serializer/markdown_summary.py index e18e2a74..4667ce46 100644 --- a/docling_core/experimental/serializer/markdown_summary.py +++ b/docling_core/experimental/serializer/markdown_summary.py @@ -53,10 +53,6 @@ class MarkdownSummaryParams(MarkdownParams): add_references: bool = True add_summary: bool = True - # Indentation control: when enabled, indent each line according to - # the latest encountered section-header level (title treated as level 0). - indent_by_section_level: bool = False - indent_size: int = 2 toc_labels: list[DocItemLabel] = [DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER] @@ -223,24 +219,6 @@ def _get_summary(self, *, node: NodeItem, params: MarkdownSummaryParams) -> str: return node.summary return "" - def _indent_line( - self, - *, - line: str, - node: NodeItem, - current_section_level: int, - params: MarkdownSummaryParams, - ) -> str: - if not line: - return line - if not params.indent_by_section_level: - return line - indent_level = ( - node.level if isinstance(node, SectionHeaderItem) else current_section_level - ) - indent = " " * (params.indent_size * indent_level) - return f"{indent}{line}" - def _create_document_outline( self, *, @@ -254,7 +232,6 @@ def _create_document_outline( label_counter: dict[DocItemLabel, int] = {} visited: set[str] = set() result: list[SerializationResult] = [] - current_section_level: int = 0 for node, _level in self.doc.iterate_items(root=root, with_groups=True): if node.self_ref in visited: @@ -284,7 +261,6 @@ def _create_document_outline( line = self._line_for_section_header( node=node, params=params, node_label=node_label ) - current_section_level = node.level elif isinstance(node, ListGroup): line = "" # intentionally skip elif isinstance(node, (TextItem, FormItem, CodeItem)): @@ -307,13 +283,6 @@ def _create_document_outline( if summary: line = f"{line} (summary={summary})" if line else line - line = self._indent_line( - line=line, - node=node, - current_section_level=current_section_level, - params=params, - ) - if line: result.append( create_ser_result( diff --git a/test/data/doc/constructed_mdsum_indent_mdhdr_false_indent_true_size_2.gt.md b/test/data/doc/constructed_mdsum_indent_mdhdr_false_indent_true_size_2.gt.md deleted file mode 100644 index da6a4cb7..00000000 --- a/test/data/doc/constructed_mdsum_indent_mdhdr_false_indent_true_size_2.gt.md +++ /dev/null @@ -1,59 +0,0 @@ -title (reference=#/texts/1): Title of the Document - -text (reference=#/texts/2) - -text (reference=#/texts/3) - - section-header (level=1, reference=#/texts/4): 1. Introduction - - text (reference=#/texts/5) - - table 1 (reference=#/tables/0) - - picture 1 (reference=#/pictures/0) - - picture 2 (reference=#/pictures/1) - - text (reference=#/texts/24) - - code (reference=#/texts/25) - - text (reference=#/texts/26) - - text (reference=#/texts/28) - - formula (reference=#/texts/29) - - text (reference=#/texts/30) - - text (reference=#/texts/31) - - code (reference=#/texts/32) - - text (reference=#/texts/33) - - formula (reference=#/texts/34) - - form (reference=#/form_items/0) - - text (reference=#/texts/35) - - text (reference=#/texts/36) - - text (reference=#/texts/37) - - text (reference=#/texts/38) - - text (reference=#/texts/39) - - text (reference=#/texts/40) - - text (reference=#/texts/41) - - text (reference=#/texts/42) - - text (reference=#/texts/43) - - text (reference=#/texts/44) - - text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_indent_mdhdr_true_indent_true_size_2.gt.md b/test/data/doc/constructed_mdsum_indent_mdhdr_true_indent_true_size_2.gt.md deleted file mode 100644 index 68f8efc7..00000000 --- a/test/data/doc/constructed_mdsum_indent_mdhdr_true_indent_true_size_2.gt.md +++ /dev/null @@ -1,59 +0,0 @@ -# Title of the Document (reference=#/texts/1) - -text (reference=#/texts/2) - -text (reference=#/texts/3) - - ## 1. Introduction (level=1, reference=#/texts/4) - - text (reference=#/texts/5) - - table 1 (reference=#/tables/0) - - picture 1 (reference=#/pictures/0) - - picture 2 (reference=#/pictures/1) - - text (reference=#/texts/24) - - code (reference=#/texts/25) - - text (reference=#/texts/26) - - text (reference=#/texts/28) - - formula (reference=#/texts/29) - - text (reference=#/texts/30) - - text (reference=#/texts/31) - - code (reference=#/texts/32) - - text (reference=#/texts/33) - - formula (reference=#/texts/34) - - form (reference=#/form_items/0) - - text (reference=#/texts/35) - - text (reference=#/texts/36) - - text (reference=#/texts/37) - - text (reference=#/texts/38) - - text (reference=#/texts/39) - - text (reference=#/texts/40) - - text (reference=#/texts/41) - - text (reference=#/texts/42) - - text (reference=#/texts/43) - - text (reference=#/texts/44) - - text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_false.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_false.gt.md similarity index 100% rename from test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_false.gt.md rename to test/data/doc/constructed_mdsum_outline_mdhdr_false.gt.md diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_true.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_true.gt.md deleted file mode 100644 index da6a4cb7..00000000 --- a/test/data/doc/constructed_mdsum_outline_mdhdr_false_indent_true.gt.md +++ /dev/null @@ -1,59 +0,0 @@ -title (reference=#/texts/1): Title of the Document - -text (reference=#/texts/2) - -text (reference=#/texts/3) - - section-header (level=1, reference=#/texts/4): 1. Introduction - - text (reference=#/texts/5) - - table 1 (reference=#/tables/0) - - picture 1 (reference=#/pictures/0) - - picture 2 (reference=#/pictures/1) - - text (reference=#/texts/24) - - code (reference=#/texts/25) - - text (reference=#/texts/26) - - text (reference=#/texts/28) - - formula (reference=#/texts/29) - - text (reference=#/texts/30) - - text (reference=#/texts/31) - - code (reference=#/texts/32) - - text (reference=#/texts/33) - - formula (reference=#/texts/34) - - form (reference=#/form_items/0) - - text (reference=#/texts/35) - - text (reference=#/texts/36) - - text (reference=#/texts/37) - - text (reference=#/texts/38) - - text (reference=#/texts/39) - - text (reference=#/texts/40) - - text (reference=#/texts/41) - - text (reference=#/texts/42) - - text (reference=#/texts/43) - - text (reference=#/texts/44) - - text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_false.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_true.gt.md similarity index 100% rename from test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_false.gt.md rename to test/data/doc/constructed_mdsum_outline_mdhdr_true.gt.md diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_true.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_true.gt.md deleted file mode 100644 index 68f8efc7..00000000 --- a/test/data/doc/constructed_mdsum_outline_mdhdr_true_indent_true.gt.md +++ /dev/null @@ -1,59 +0,0 @@ -# Title of the Document (reference=#/texts/1) - -text (reference=#/texts/2) - -text (reference=#/texts/3) - - ## 1. Introduction (level=1, reference=#/texts/4) - - text (reference=#/texts/5) - - table 1 (reference=#/tables/0) - - picture 1 (reference=#/pictures/0) - - picture 2 (reference=#/pictures/1) - - text (reference=#/texts/24) - - code (reference=#/texts/25) - - text (reference=#/texts/26) - - text (reference=#/texts/28) - - formula (reference=#/texts/29) - - text (reference=#/texts/30) - - text (reference=#/texts/31) - - code (reference=#/texts/32) - - text (reference=#/texts/33) - - formula (reference=#/texts/34) - - form (reference=#/form_items/0) - - text (reference=#/texts/35) - - text (reference=#/texts/36) - - text (reference=#/texts/37) - - text (reference=#/texts/38) - - text (reference=#/texts/39) - - text (reference=#/texts/40) - - text (reference=#/texts/41) - - text (reference=#/texts/42) - - text (reference=#/texts/43) - - text (reference=#/texts/44) - - text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_false.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false.gt.md similarity index 100% rename from test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_false.gt.md rename to test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false.gt.md diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_true.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_true.gt.md deleted file mode 100644 index fdb5b964..00000000 --- a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false_indent_true.gt.md +++ /dev/null @@ -1,3 +0,0 @@ -title (reference=#/texts/1): Title of the Document - - section-header (level=1, reference=#/texts/4): 1. Introduction diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_false.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true.gt.md similarity index 100% rename from test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_false.gt.md rename to test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true.gt.md diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_true.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_true.gt.md deleted file mode 100644 index 341caca5..00000000 --- a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true_indent_true.gt.md +++ /dev/null @@ -1,3 +0,0 @@ -# Title of the Document (reference=#/texts/1) - - ## 1. Introduction (level=1, reference=#/texts/4) diff --git a/test/test_markdown_summary.py b/test/test_markdown_summary.py index d3f5b4b3..3644fd22 100644 --- a/test/test_markdown_summary.py +++ b/test/test_markdown_summary.py @@ -32,9 +32,8 @@ def verify(exp_file: Path, actual: str): ], ) @pytest.mark.parametrize("use_md_headers", [False, True]) -@pytest.mark.parametrize("indent_by_section_level", [False, True]) def test_markdown_summary_outline( - mode: MarkdownSummaryMode, use_md_headers: bool, indent_by_section_level: bool + mode: MarkdownSummaryMode, use_md_headers: bool ): # Build a representative document with title, headers, text, lists, table, and pictures doc = _construct_doc() @@ -44,7 +43,6 @@ def test_markdown_summary_outline( params=MarkdownSummaryParams( use_markdown_headers=use_md_headers, mode=mode, - indent_by_section_level=indent_by_section_level, ), ) @@ -54,31 +52,7 @@ def test_markdown_summary_outline( root_dir = Path("./test/data/doc") exp_path = ( root_dir - / f"constructed_mdsum_{mode.value}_mdhdr_{str(use_md_headers).lower()}_indent_{str(indent_by_section_level).lower()}.gt.md" + / f"constructed_mdsum_{mode.value}_mdhdr_{str(use_md_headers).lower()}.gt.md" ) verify(exp_file=exp_path, actual=outline) - -@pytest.mark.parametrize("use_md_headers", [False, True]) -def test_markdown_summary_indentation(use_md_headers: bool): - # Build a representative document - doc = _construct_doc() - - ser = MarkdownSummarySerializer( - doc=doc, - params=MarkdownSummaryParams( - use_markdown_headers=use_md_headers, - indent_by_section_level=True, - indent_size=2, - ), - ) - - outline = ser.serialize().text - - # Compare with or generate ground-truth output for indentation-specific case - root_dir = Path("./test/data/doc") - exp_path = ( - root_dir - / f"constructed_mdsum_indent_mdhdr_{str(use_md_headers).lower()}_indent_true_size_2.gt.md" - ) - verify(exp_file=exp_path, actual=outline) From 776e957e4aa374c0f1b80ff7ab2dc89ee1e185f8 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 24 Sep 2025 11:30:41 +0200 Subject: [PATCH 12/14] refactoring the markdown_summary Signed-off-by: Peter Staar --- docling_core/experimental/serializer/markdown_summary.py | 1 - test/test_markdown_summary.py | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/docling_core/experimental/serializer/markdown_summary.py b/docling_core/experimental/serializer/markdown_summary.py index 4667ce46..3db6bebe 100644 --- a/docling_core/experimental/serializer/markdown_summary.py +++ b/docling_core/experimental/serializer/markdown_summary.py @@ -53,7 +53,6 @@ class MarkdownSummaryParams(MarkdownParams): add_references: bool = True add_summary: bool = True - toc_labels: list[DocItemLabel] = [DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER] diff --git a/test/test_markdown_summary.py b/test/test_markdown_summary.py index 3644fd22..63bdf719 100644 --- a/test/test_markdown_summary.py +++ b/test/test_markdown_summary.py @@ -32,9 +32,7 @@ def verify(exp_file: Path, actual: str): ], ) @pytest.mark.parametrize("use_md_headers", [False, True]) -def test_markdown_summary_outline( - mode: MarkdownSummaryMode, use_md_headers: bool -): +def test_markdown_summary_outline(mode: MarkdownSummaryMode, use_md_headers: bool): # Build a representative document with title, headers, text, lists, table, and pictures doc = _construct_doc() @@ -55,4 +53,3 @@ def test_markdown_summary_outline( / f"constructed_mdsum_{mode.value}_mdhdr_{str(use_md_headers).lower()}.gt.md" ) verify(exp_file=exp_path, actual=outline) - From f2e28db2a039cd0d918baf0d70d71d33549b6da9 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 24 Sep 2025 11:36:19 +0200 Subject: [PATCH 13/14] reset the docs/DoclingDocument.json Signed-off-by: Peter Staar --- docs/DoclingDocument.json | 158 -------------------------------------- 1 file changed, 158 deletions(-) diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 96b84fe9..305f5a9b 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -194,18 +194,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "const": "code", "default": "code", @@ -487,18 +475,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "const": "form", "default": "form", @@ -622,18 +598,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "const": "formula", "default": "formula", @@ -843,18 +807,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "name": { "default": "group", "title": "Name", @@ -960,18 +912,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "name": { "default": "group", "title": "Name", @@ -1022,18 +962,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "const": "key_value_region", "default": "key_value_region", @@ -1126,18 +1054,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "name": { "default": "group", "title": "Name", @@ -1188,18 +1104,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "const": "list_item", "default": "list_item", @@ -1437,18 +1341,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "default": "picture", "enum": [ @@ -1950,18 +1842,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "const": "section_header", "default": "section_header", @@ -2185,18 +2065,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "default": "table", "enum": [ @@ -2314,18 +2182,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "enum": [ "caption", @@ -2429,18 +2285,6 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, - "summary": { - "anyOf": [ - { - "type": "string" - }, - { - "type": "null" - } - ], - "default": null, - "title": "Summary" - }, "label": { "const": "title", "default": "title", @@ -2538,7 +2382,6 @@ "parent": null, "children": [], "content_layer": "furniture", - "summary": null, "name": "_root_", "label": "unspecified" }, @@ -2551,7 +2394,6 @@ "parent": null, "children": [], "content_layer": "body", - "summary": null, "name": "_root_", "label": "unspecified" } From 0c21580c109ded1dafa7df0334fa115b9cb58cde Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 24 Sep 2025 12:42:56 +0200 Subject: [PATCH 14/14] fix for the Docs Signed-off-by: Peter Staar --- docs/DoclingDocument.json | 158 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 158 insertions(+) diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 305f5a9b..96b84fe9 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -194,6 +194,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "code", "default": "code", @@ -475,6 +487,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "form", "default": "form", @@ -598,6 +622,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "formula", "default": "formula", @@ -807,6 +843,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -912,6 +960,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -962,6 +1022,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "key_value_region", "default": "key_value_region", @@ -1054,6 +1126,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -1104,6 +1188,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "list_item", "default": "list_item", @@ -1341,6 +1437,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "default": "picture", "enum": [ @@ -1842,6 +1950,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "section_header", "default": "section_header", @@ -2065,6 +2185,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "default": "table", "enum": [ @@ -2182,6 +2314,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "enum": [ "caption", @@ -2285,6 +2429,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "title", "default": "title", @@ -2382,6 +2538,7 @@ "parent": null, "children": [], "content_layer": "furniture", + "summary": null, "name": "_root_", "label": "unspecified" }, @@ -2394,6 +2551,7 @@ "parent": null, "children": [], "content_layer": "body", + "summary": null, "name": "_root_", "label": "unspecified" }