diff --git a/docling_core/experimental/serializer/__init__.py b/docling_core/experimental/serializer/__init__.py new file mode 100644 index 00000000..5c450a0e --- /dev/null +++ b/docling_core/experimental/serializer/__init__.py @@ -0,0 +1,5 @@ +"""Experimental serializers for docling-core. + +This package contains experimental serialization utilities (e.g., Markdown +summaries) that may change without notice. +""" diff --git a/docling_core/experimental/serializer/markdown_summary.py b/docling_core/experimental/serializer/markdown_summary.py new file mode 100644 index 00000000..3db6bebe --- /dev/null +++ b/docling_core/experimental/serializer/markdown_summary.py @@ -0,0 +1,293 @@ +"""Markdown document summary serializers (outline and TOC). + +This module provides a Markdown-focused serializer that emits a compact +document outline or a table of contents derived from a Docling document. +""" + +from enum import Enum +from typing import Any, Optional + +from typing_extensions import override + +from docling_core.transforms.serializer.base import SerializationResult +from docling_core.transforms.serializer.common import create_ser_result +from docling_core.transforms.serializer.markdown import ( + MarkdownDocSerializer, + MarkdownParams, +) +from docling_core.types.doc import ( + CodeItem, + DocItem, + DocItemLabel, + FormItem, + GroupItem, + ListGroup, + ListItem, + NodeItem, + PictureItem, + SectionHeaderItem, + TableItem, + TextItem, + TitleItem, +) + + +class MarkdownSummaryMode(str, Enum): + """Display mode for document summary output.""" + + OUTLINE = "outline" + TABLE_OF_CONTENTS = "table_of_contents" + + +class MarkdownSummaryParams(MarkdownParams): + """Markdown-specific serialization parameters for outline. + + Inherits MarkdownParams to retain Markdown behaviors (escaping, links, etc.). + """ + + mode: MarkdownSummaryMode = MarkdownSummaryMode.OUTLINE + + use_markdown_headers: bool = False + + add_label_counter: bool = False + add_references: bool = True + add_summary: bool = True + + toc_labels: list[DocItemLabel] = [DocItemLabel.TITLE, DocItemLabel.SECTION_HEADER] + + +class MarkdownSummarySerializer(MarkdownDocSerializer): + """Markdown-specific document summary serializer. + + Inherits MarkdownDocSerializer to reuse Markdown formatting/post-processing + and sub-serializers; overrides only the parts selection logic. + """ + + params: MarkdownSummaryParams = MarkdownSummaryParams() + + @override + def get_parts( + self, + item: Optional[NodeItem] = None, + **kwargs: Any, + ) -> list[SerializationResult]: + """Return a single part containing the document (or subtree) outline.""" + return self._create_document_outline(root=item, **kwargs) + + # return [create_ser_result(text=outline, span_source=[])] if outline else [] + + # ------------------------- + # Helper methods (internal) + # ------------------------- + + def _next_idx( + self, *, lbl: DocItemLabel, label_counter: dict[DocItemLabel, int] + ) -> int: + label_counter[lbl] = label_counter.get(lbl, 0) + 1 + return label_counter[lbl] + + def _include_label( + self, *, params: MarkdownSummaryParams, lbl: DocItemLabel + ) -> bool: + """Return True if label should be included (esp. for TOC mode).""" + if ( + params.mode == MarkdownSummaryMode.TABLE_OF_CONTENTS + and lbl not in params.toc_labels + ): + return False + return True + + def _is_node_excluded( + self, + *, + node: NodeItem, + excluded: set[str], + params: MarkdownSummaryParams, + ) -> bool: + """Centralize exclusion logic applied to nodes in the outline.""" + if isinstance(node, DocItem): + if node.self_ref in excluded: + return True + if ( + isinstance(node, TextItem) + and node.self_ref in self._captions_of_some_item + ): + return True + if not self._include_label(params=params, lbl=node.label): + return True + return False + + def _compose_node_label( + self, + *, + node: NodeItem, + params: MarkdownSummaryParams, + label_counter: dict[DocItemLabel, int], + ) -> str: + """Compute the textual label for a node (without refs). + + - When ``add_label_counter`` is True, add counters for non-table/picture + DocItems. + - Tables/pictures are numbered separately when building the final line. + - For groups, expose the raw normalized label but do not emit a line. + """ + node_label = "" + if ( + params.add_label_counter + and isinstance(node, DocItem) + and not isinstance(node, (TableItem, PictureItem)) + ): + base = str(node.label).replace("_", "-") + lbl_cnt = self._next_idx(lbl=node.label, label_counter=label_counter) + node_label = f"{base} {lbl_cnt}" + elif isinstance(node, (DocItem, GroupItem)): + node_label = str(node.label).replace("_", "-") + return node_label + + def _ref_part(self, *, node: NodeItem, params: MarkdownSummaryParams) -> str: + return f" (reference={node.self_ref})" if params.add_references else "" + + def _strip_md_header_prefix(self, text: str) -> str: + stripped = text.lstrip() + while stripped.startswith("#"): + stripped = stripped.lstrip("#").lstrip() + return stripped + + def _line_for_title( + self, + *, + node: TitleItem, + params: MarkdownSummaryParams, + node_label: str, + ref_part: str, + ) -> str: + raw_text = self.text_serializer.serialize( + item=node, doc_serializer=self, doc=self.doc + ).text + if params.use_markdown_headers: + text = raw_text.lstrip() + return f"{text}{ref_part}" + text = raw_text.lstrip().lstrip("# ") if raw_text.startswith("#") else raw_text + return ( + f"{node_label}{ref_part}: {text}" + if params.add_references + else f"{node_label}: {text}" + ) + + def _line_for_section_header( + self, + *, + node: SectionHeaderItem, + params: MarkdownSummaryParams, + node_label: str, + ) -> str: + raw_text = self.text_serializer.serialize( + item=node, doc_serializer=self, doc=self.doc + ).text + if params.use_markdown_headers: + text = raw_text.lstrip() + if params.add_references: + return f"{text} (level={node.level}, reference={node.self_ref})" + return f"{text} (level={node.level})" + stripped = self._strip_md_header_prefix(raw_text) + if params.add_references: + return f"{node_label} (level={node.level}, reference={node.self_ref}): {stripped}" + return f"{node_label} (level={node.level}): {stripped}" + + def _line_for_simple_label(self, *, node_label: str, ref_part: str) -> str: + return f"{node_label}{ref_part}" + + def _line_for_table( + self, *, node_label: str, ref_part: str, label_counter: dict[DocItemLabel, int] + ) -> str: + lbl_cnt = self._next_idx(lbl=DocItemLabel.TABLE, label_counter=label_counter) + return f"{node_label} {lbl_cnt}{ref_part}" + + def _line_for_picture( + self, *, node_label: str, ref_part: str, label_counter: dict[DocItemLabel, int] + ) -> str: + lbl_cnt = self._next_idx(lbl=DocItemLabel.PICTURE, label_counter=label_counter) + return f"{node_label} {lbl_cnt}{ref_part}" + + def _get_summary(self, *, node: NodeItem, params: MarkdownSummaryParams) -> str: + if ( + params.add_summary + and (node.summary is not None) + and isinstance(node.summary, str) + ): + return node.summary + return "" + + def _create_document_outline( + self, + *, + root: Optional[NodeItem] = None, + **kwargs: Any, + ) -> list[SerializationResult]: + """Create an outline, respecting params and recursive traversal.""" + params = self.params.merge_with_patch(patch=kwargs) + excluded = self.get_excluded_refs(**kwargs) + + label_counter: dict[DocItemLabel, int] = {} + visited: set[str] = set() + result: list[SerializationResult] = [] + + for node, _level in self.doc.iterate_items(root=root, with_groups=True): + if node.self_ref in visited: + continue + visited.add(node.self_ref) + + # Skip list items in outline + if isinstance(node, ListItem): + continue + + # Respect exclusion logic + if self._is_node_excluded(node=node, excluded=excluded, params=params): + continue + + summary = self._get_summary(node=node, params=params) + node_label = self._compose_node_label( + node=node, params=params, label_counter=label_counter + ) + ref_part = self._ref_part(node=node, params=params) + + line = "" + if isinstance(node, TitleItem): + line = self._line_for_title( + node=node, params=params, node_label=node_label, ref_part=ref_part + ) + elif isinstance(node, SectionHeaderItem): + line = self._line_for_section_header( + node=node, params=params, node_label=node_label + ) + elif isinstance(node, ListGroup): + line = "" # intentionally skip + elif isinstance(node, (TextItem, FormItem, CodeItem)): + line = self._line_for_simple_label( + node_label=node_label, ref_part=ref_part + ) + elif isinstance(node, TableItem): + line = self._line_for_table( + node_label=node_label, + ref_part=ref_part, + label_counter=label_counter, + ) + elif isinstance(node, PictureItem): + line = self._line_for_picture( + node_label=node_label, + ref_part=ref_part, + label_counter=label_counter, + ) + + if summary: + line = f"{line} (summary={summary})" if line else line + + if line: + result.append( + create_ser_result( + text=line, + span_source=node if isinstance(node, DocItem) else [], + ) + ) + + return result diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py index 07693b88..51237db6 100644 --- a/docling_core/types/doc/document.py +++ b/docling_core/types/doc/document.py @@ -952,6 +952,8 @@ class NodeItem(BaseModel): model_config = ConfigDict(extra="forbid") + summary: Optional[str] = None # serialized only when not None + def get_ref(self) -> RefItem: """get_ref.""" return RefItem(cref=self.self_ref) diff --git a/docs/DoclingDocument.json b/docs/DoclingDocument.json index 305f5a9b..96b84fe9 100644 --- a/docs/DoclingDocument.json +++ b/docs/DoclingDocument.json @@ -194,6 +194,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "code", "default": "code", @@ -475,6 +487,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "form", "default": "form", @@ -598,6 +622,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "formula", "default": "formula", @@ -807,6 +843,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -912,6 +960,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -962,6 +1022,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "key_value_region", "default": "key_value_region", @@ -1054,6 +1126,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "name": { "default": "group", "title": "Name", @@ -1104,6 +1188,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "list_item", "default": "list_item", @@ -1341,6 +1437,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "default": "picture", "enum": [ @@ -1842,6 +1950,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "section_header", "default": "section_header", @@ -2065,6 +2185,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "default": "table", "enum": [ @@ -2182,6 +2314,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "enum": [ "caption", @@ -2285,6 +2429,18 @@ "$ref": "#/$defs/ContentLayer", "default": "body" }, + "summary": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Summary" + }, "label": { "const": "title", "default": "title", @@ -2382,6 +2538,7 @@ "parent": null, "children": [], "content_layer": "furniture", + "summary": null, "name": "_root_", "label": "unspecified" }, @@ -2394,6 +2551,7 @@ "parent": null, "children": [], "content_layer": "body", + "summary": null, "name": "_root_", "label": "unspecified" } diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_false.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_false.gt.md new file mode 100644 index 00000000..da005563 --- /dev/null +++ b/test/data/doc/constructed_mdsum_outline_mdhdr_false.gt.md @@ -0,0 +1,59 @@ +title (reference=#/texts/1): Title of the Document + +text (reference=#/texts/2) + +text (reference=#/texts/3) + +section-header (level=1, reference=#/texts/4): 1. Introduction + +text (reference=#/texts/5) + +table 1 (reference=#/tables/0) + +picture 1 (reference=#/pictures/0) + +picture 2 (reference=#/pictures/1) + +text (reference=#/texts/24) + +code (reference=#/texts/25) + +text (reference=#/texts/26) + +text (reference=#/texts/28) + +formula (reference=#/texts/29) + +text (reference=#/texts/30) + +text (reference=#/texts/31) + +code (reference=#/texts/32) + +text (reference=#/texts/33) + +formula (reference=#/texts/34) + +form (reference=#/form_items/0) + +text (reference=#/texts/35) + +text (reference=#/texts/36) + +text (reference=#/texts/37) + +text (reference=#/texts/38) + +text (reference=#/texts/39) + +text (reference=#/texts/40) + +text (reference=#/texts/41) + +text (reference=#/texts/42) + +text (reference=#/texts/43) + +text (reference=#/texts/44) + +text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_outline_mdhdr_true.gt.md b/test/data/doc/constructed_mdsum_outline_mdhdr_true.gt.md new file mode 100644 index 00000000..8e03ee91 --- /dev/null +++ b/test/data/doc/constructed_mdsum_outline_mdhdr_true.gt.md @@ -0,0 +1,59 @@ +# Title of the Document (reference=#/texts/1) + +text (reference=#/texts/2) + +text (reference=#/texts/3) + +## 1. Introduction (level=1, reference=#/texts/4) + +text (reference=#/texts/5) + +table 1 (reference=#/tables/0) + +picture 1 (reference=#/pictures/0) + +picture 2 (reference=#/pictures/1) + +text (reference=#/texts/24) + +code (reference=#/texts/25) + +text (reference=#/texts/26) + +text (reference=#/texts/28) + +formula (reference=#/texts/29) + +text (reference=#/texts/30) + +text (reference=#/texts/31) + +code (reference=#/texts/32) + +text (reference=#/texts/33) + +formula (reference=#/texts/34) + +form (reference=#/form_items/0) + +text (reference=#/texts/35) + +text (reference=#/texts/36) + +text (reference=#/texts/37) + +text (reference=#/texts/38) + +text (reference=#/texts/39) + +text (reference=#/texts/40) + +text (reference=#/texts/41) + +text (reference=#/texts/42) + +text (reference=#/texts/43) + +text (reference=#/texts/44) + +text (reference=#/texts/55) diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false.gt.md new file mode 100644 index 00000000..4d406b13 --- /dev/null +++ b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_false.gt.md @@ -0,0 +1,3 @@ +title (reference=#/texts/1): Title of the Document + +section-header (level=1, reference=#/texts/4): 1. Introduction diff --git a/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true.gt.md b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true.gt.md new file mode 100644 index 00000000..628c2a04 --- /dev/null +++ b/test/data/doc/constructed_mdsum_table_of_contents_mdhdr_true.gt.md @@ -0,0 +1,3 @@ +# Title of the Document (reference=#/texts/1) + +## 1. Introduction (level=1, reference=#/texts/4) diff --git a/test/data/docling_document/unit/CodeItem.yaml b/test/data/docling_document/unit/CodeItem.yaml index 09995640..9d0aee3d 100644 --- a/test/data/docling_document/unit/CodeItem.yaml +++ b/test/data/docling_document/unit/CodeItem.yaml @@ -1,15 +1,16 @@ -children: [] captions: [] -footnotes: [] -references: [] -image: null +children: [] code_language: Python content_layer: body +footnotes: [] +formatting: null +hyperlink: null +image: null label: code orig: whatever parent: null prov: [] +references: [] self_ref: '#' -text: print(Hello World!) -formatting: null -hyperlink: null +summary: null +text: print(Hello World!) \ No newline at end of file diff --git a/test/data/docling_document/unit/FloatingItem.yaml b/test/data/docling_document/unit/FloatingItem.yaml index 21beef40..2338cc35 100644 --- a/test/data/docling_document/unit/FloatingItem.yaml +++ b/test/data/docling_document/unit/FloatingItem.yaml @@ -1,5 +1,6 @@ captions: [] children: [] +content_layer: body footnotes: [] image: null label: text @@ -7,4 +8,4 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body \ No newline at end of file +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/FormItem.yaml b/test/data/docling_document/unit/FormItem.yaml index af7a61e1..39818d6d 100644 --- a/test/data/docling_document/unit/FormItem.yaml +++ b/test/data/docling_document/unit/FormItem.yaml @@ -28,4 +28,5 @@ label: form parent: null prov: [] references: [] -self_ref: '#' \ No newline at end of file +self_ref: '#' +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/FormulaItem.yaml b/test/data/docling_document/unit/FormulaItem.yaml index 25057908..680b8acb 100644 --- a/test/data/docling_document/unit/FormulaItem.yaml +++ b/test/data/docling_document/unit/FormulaItem.yaml @@ -1,10 +1,11 @@ children: [] +content_layer: body +formatting: null +hyperlink: null label: formula orig: whatever parent: null prov: [] self_ref: '#' -text: E=mc^2 -content_layer: body -formatting: null -hyperlink: null +summary: null +text: E=mc^2 \ No newline at end of file diff --git a/test/data/docling_document/unit/KeyValueItem.yaml b/test/data/docling_document/unit/KeyValueItem.yaml index 219e951e..09a31ed7 100644 --- a/test/data/docling_document/unit/KeyValueItem.yaml +++ b/test/data/docling_document/unit/KeyValueItem.yaml @@ -28,4 +28,5 @@ label: key_value_region parent: null prov: [] references: [] -self_ref: '#' \ No newline at end of file +self_ref: '#' +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/ListItem.yaml b/test/data/docling_document/unit/ListItem.yaml index 20d8de90..ebcc755a 100644 --- a/test/data/docling_document/unit/ListItem.yaml +++ b/test/data/docling_document/unit/ListItem.yaml @@ -1,12 +1,13 @@ children: [] +content_layer: body enumerated: true +formatting: null +hyperlink: null label: list_item marker: (1) orig: whatever parent: null prov: [] self_ref: '#' -text: whatever -content_layer: body -formatting: null -hyperlink: null +summary: null +text: whatever \ No newline at end of file diff --git a/test/data/docling_document/unit/PictureItem.yaml b/test/data/docling_document/unit/PictureItem.yaml index ffe342a6..f4f07d3e 100644 --- a/test/data/docling_document/unit/PictureItem.yaml +++ b/test/data/docling_document/unit/PictureItem.yaml @@ -1,6 +1,7 @@ annotations: [] captions: [] children: [] +content_layer: body footnotes: [] image: null label: picture @@ -8,4 +9,4 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body \ No newline at end of file +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/SectionHeaderItem.yaml b/test/data/docling_document/unit/SectionHeaderItem.yaml index 68f641f9..67f662e6 100644 --- a/test/data/docling_document/unit/SectionHeaderItem.yaml +++ b/test/data/docling_document/unit/SectionHeaderItem.yaml @@ -1,11 +1,12 @@ children: [] +content_layer: body +formatting: null +hyperlink: null label: section_header level: 2 orig: whatever parent: null prov: [] self_ref: '#' -text: whatever -content_layer: body -formatting: null -hyperlink: null +summary: null +text: whatever \ No newline at end of file diff --git a/test/data/docling_document/unit/TableItem.yaml b/test/data/docling_document/unit/TableItem.yaml index ae08e00e..778cb312 100644 --- a/test/data/docling_document/unit/TableItem.yaml +++ b/test/data/docling_document/unit/TableItem.yaml @@ -1,5 +1,7 @@ +annotations: [] captions: [] children: [] +content_layer: body data: grid: - - bbox: null @@ -192,5 +194,4 @@ parent: null prov: [] references: [] self_ref: '#' -content_layer: body -annotations: [] +summary: null \ No newline at end of file diff --git a/test/data/docling_document/unit/TextItem.yaml b/test/data/docling_document/unit/TextItem.yaml index 1f72637a..ee8247eb 100644 --- a/test/data/docling_document/unit/TextItem.yaml +++ b/test/data/docling_document/unit/TextItem.yaml @@ -1,10 +1,11 @@ children: [] +content_layer: body +formatting: null +hyperlink: null label: text orig: whatever parent: null prov: [] self_ref: '#' -text: whatever -content_layer: body -formatting: null -hyperlink: null +summary: null +text: whatever \ No newline at end of file diff --git a/test/data/docling_document/unit/TitleItem.yaml b/test/data/docling_document/unit/TitleItem.yaml index 8e2a3dea..02d61247 100644 --- a/test/data/docling_document/unit/TitleItem.yaml +++ b/test/data/docling_document/unit/TitleItem.yaml @@ -1,10 +1,11 @@ children: [] +content_layer: body +formatting: null +hyperlink: null label: title orig: whatever parent: null prov: [] self_ref: '#' -text: whatever -content_layer: body -formatting: null -hyperlink: null +summary: null +text: whatever \ No newline at end of file diff --git a/test/test_doc_schema.py b/test/test_doc_schema.py index 109e7c88..acc560b4 100644 --- a/test/test_doc_schema.py +++ b/test/test_doc_schema.py @@ -35,8 +35,8 @@ def test_ccs_document(): # try as well as dictionary doc = json.loads(file_json) CCSDocument.model_validate(doc) - except ValidationError as e: - print(f"Validation error in file {filename}:\n{e.json()}") + except ValidationError: + # print(f"Validation error in file {filename}:\n{e.json()}") raise # check doc-error-1 is invalid in logs @@ -47,7 +47,7 @@ def test_ccs_document(): assert False, f"Data in file {filename} should be invalid for CCSDocument model" except ValidationError as e: for error in e.errors(): - print(type(error)) + # print(type(error)) assert all( item in error["loc"] for item in ("description", "logs") ), f"Data in file {filename} should fail in logs" diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index d5ddb4dc..6f8fbd99 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -415,13 +415,14 @@ def read(name: str): def verify(dc, obj): pred = serialise(obj).strip() - if dc is KeyValueItem or dc is FormItem: - write(dc.__name__, pred) + # if dc is KeyValueItem or dc is FormItem or dc is TextItem: + # write(dc.__name__, pred) pred = yaml.safe_load(pred) - # print(f"\t{dc.__name__}:\n {pred}") + gold = read(dc.__name__) + # print(f"\t{dc.__name__}:\n {gold}") assert pred == gold, f"pred!=gold for {dc.__name__}" @@ -727,7 +728,7 @@ def _test_export_methods( second_page = first_page + 1 if second_page in doc.pages: # Only test if document has at least 2 pages dt_pages_pred = doc.export_to_doctags(pages={first_page, second_page}) - print(dt_pages_pred) + # print(dt_pages_pred) _verify_regression_test(dt_pages_pred, filename=filename, ext="pages.dt") # Test Tables export ... diff --git a/test/test_json_schema_to_search_mapper.py b/test/test_json_schema_to_search_mapper.py index e52984d4..2973fa67 100644 --- a/test/test_json_schema_to_search_mapper.py +++ b/test/test_json_schema_to_search_mapper.py @@ -60,7 +60,7 @@ def test_json_schema_to_search_mapper_0(): def test_json_schema_to_search_mapper_1(): """Test the class JsonSchemaToSearchMapper.""" s = Record.model_json_schema() - print(json.dumps(s, indent=2)) + # print(json.dumps(s, indent=2)) _meta = { "aliases": [".production", "ccc"], diff --git a/test/test_markdown_summary.py b/test/test_markdown_summary.py new file mode 100644 index 00000000..63bdf719 --- /dev/null +++ b/test/test_markdown_summary.py @@ -0,0 +1,55 @@ +"""Tests for MarkdownSummarySerializer (document outline).""" + +from pathlib import Path + +import pytest + +from docling_core.experimental.serializer.markdown_summary import ( + MarkdownSummaryMode, + MarkdownSummaryParams, + MarkdownSummarySerializer, +) + +from .test_data_gen_flag import GEN_TEST_DATA +from .test_docling_doc import _construct_doc + + +def verify(exp_file: Path, actual: str): + if GEN_TEST_DATA: + with open(exp_file, "w", encoding="utf-8") as f: + f.write(f"{actual}\n") + else: + with open(exp_file, "r", encoding="utf-8") as f: + expected = f.read().rstrip() + assert expected == actual + + +@pytest.mark.parametrize( + "mode", + [ + MarkdownSummaryMode.OUTLINE, + MarkdownSummaryMode.TABLE_OF_CONTENTS, + ], +) +@pytest.mark.parametrize("use_md_headers", [False, True]) +def test_markdown_summary_outline(mode: MarkdownSummaryMode, use_md_headers: bool): + # Build a representative document with title, headers, text, lists, table, and pictures + doc = _construct_doc() + + ser = MarkdownSummarySerializer( + doc=doc, + params=MarkdownSummaryParams( + use_markdown_headers=use_md_headers, + mode=mode, + ), + ) + + outline = ser.serialize().text + + # Compare with or generate ground-truth output + root_dir = Path("./test/data/doc") + exp_path = ( + root_dir + / f"constructed_mdsum_{mode.value}_mdhdr_{str(use_md_headers).lower()}.gt.md" + ) + verify(exp_file=exp_path, actual=outline) diff --git a/test/test_otsl_table_export.py b/test/test_otsl_table_export.py index 4b3534f3..dde0744c 100644 --- a/test/test_otsl_table_export.py +++ b/test/test_otsl_table_export.py @@ -274,10 +274,10 @@ def test_table_export_to_otsl(): otsl_string = doc.tables[0].export_to_otsl( add_cell_location=False, add_cell_text=False, doc=doc ) - print_friendly = otsl_string.split("") - print("OTSL out:") - for s in print_friendly: - print(s) + otsl_string.split("") + # print("OTSL out:") + # for s in print_friendly: + # print(s) assert ( otsl_string == ""